Coverage for sparkle/configurator/implementations/smac2.py: 72%

189 statements  

coverage.py v7.8.0, created at 2025-04-03 10:42 +0000

1"""Configurator classes to implement SMAC2 in Sparkle.""" 

2from __future__ import annotations 

3from pathlib import Path 

4import glob 

5import shutil 

6import math 

7 

8import pandas as pd 

9 

10from runrunner import Runner, Run 

11 

12from sparkle.tools.parameters import PCSConvention 

13from sparkle.configurator.configurator import Configurator, ConfigurationScenario 

14from sparkle.solver import Solver 

15from sparkle.structures import PerformanceDataFrame, FeatureDataFrame 

16from sparkle.instance import InstanceSet, Instance_Set 

17from sparkle.types import SparkleObjective, resolve_objective 

18 

19 

class SMAC2(Configurator):
    """Class for SMAC2 (Java) configurator."""
    configurator_path = Path(__file__).parent.parent.parent.resolve() /\
        "Components/smac2-v2.10.03-master-778"
    configurator_executable = configurator_path / "smac"
    configurator_target = configurator_path / "smac2_target_algorithm.py"

    version = "2.10.03"
    full_name = "Sequential Model-based Algorithm Configuration"
    def __init__(self: SMAC2,
                 base_dir: Path,
                 output_path: Path) -> None:
        """Initialise the SMAC2 configurator, Java SMAC V2.10.03.

        Args:
            base_dir: The directory in which the configurator will be executed.
            output_path: The directory where the output will be placed.
        """
        output_path = output_path / SMAC2.__name__
        output_path.mkdir(parents=True, exist_ok=True)
        super().__init__(
            output_path=output_path,
            base_dir=base_dir,
            tmp_path=output_path / "tmp",
            multi_objective_support=False)
    @property
    def name(self: SMAC2) -> str:
        """Returns the name of the configurator."""
        return SMAC2.__name__

    @staticmethod
    def scenario_class() -> ConfigurationScenario:
        """Returns the SMAC2 scenario class."""
        return SMAC2Scenario
    def configure(self: SMAC2,
                  scenario: SMAC2Scenario,
                  data_target: PerformanceDataFrame,
                  validate_after: bool = True,
                  sbatch_options: list[str] = [],
                  slurm_prepend: str | list[str] | Path = None,
                  num_parallel_jobs: int = None,
                  base_dir: Path = None,
                  run_on: Runner = Runner.SLURM) -> list[Run]:
        """Start configuration job.

        Args:
            scenario: ConfigurationScenario object.
            data_target: PerformanceDataFrame where to store the found configurations.
            validate_after: Whether the configurations should be validated on the
                train set afterwards.
            sbatch_options: List of Slurm batch options to use.
            slurm_prepend: Script(s) to prepend to the sbatch script.
            num_parallel_jobs: The maximum number of jobs to run in parallel.
            base_dir: The path where the sbatch scripts will be created for Slurm.
            run_on: On which platform to run the jobs. Default: Slurm.

        Returns:
            A list of RunRunner Run objects.
        """
        if shutil.which("java") is None:
            raise RuntimeError(
                "SMAC2 requires Java 1.8.0_402, but Java is not installed. "
                "Please ensure Java is installed and try again."
            )
        scenario.create_scenario()
        # We set the seed over the last n run ids in the dataframe
        seeds = data_target.run_ids[data_target.num_runs - scenario.number_of_runs:]
        output = [f"{(scenario.results_directory).absolute()}/"
                  f"{scenario.name}_seed_{seed}_smac.txt"
                  for seed in seeds]
        cmds = [f"python3 {Configurator.configurator_cli_path.absolute()} "
                f"{SMAC2.__name__} {output_file} {data_target.csv_filepath} "
                f"{scenario.scenario_file_path} {seed} "
                f"{SMAC2.configurator_executable.absolute()} "
                f"--scenario-file {scenario.scenario_file_path} "
                f"--seed {seed} "
                for output_file, seed in zip(output, seeds)]
        if num_parallel_jobs is not None:
            num_parallel_jobs = max(num_parallel_jobs, len(cmds))
        return super().configure(
            configuration_commands=cmds,
            data_target=data_target,
            output=output,
            slurm_prepend=slurm_prepend,
            num_parallel_jobs=num_parallel_jobs,
            scenario=scenario,
            validation_ids=seeds if validate_after else None,
            sbatch_options=sbatch_options,
            base_dir=base_dir,
            run_on=run_on
        )
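    # Illustrative sketch of one generated configuration command per seed.
    # Angle-bracketed names are placeholders, not values produced by this module:
    #
    #   python3 <configurator_cli_path> SMAC2 \
    #       <results_dir>/<scenario_name>_seed_<seed>_smac.txt \
    #       <performance_data.csv> <scenario_file> <seed> \
    #       <smac_executable> --scenario-file <scenario_file> --seed <seed>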

    @staticmethod
    def organise_output(output_source: Path,
                        output_target: Path,
                        scenario: SMAC2Scenario,
                        run_id: int) -> None | dict:
        """Retrieves the configuration from the SMAC2 output file and stores it."""
        from filelock import FileLock
        call_key = SMAC2.configurator_target.name
        # The last line describing a call is the best found configuration
        for line in reversed(output_source.open("r").readlines()):
            if call_key in line:
                call_str = line.split(call_key, maxsplit=1)[1].strip()
                # The Configuration appears after the first 7 arguments
                configuration = call_str.split(" ", 8)[-1]
                break
        configuration = Solver.config_str_to_dict(configuration)
        if output_target is None or not output_target.exists():
            return configuration
        time_stamp = scenario.scenario_file_path.stat().st_mtime
        configuration["configuration_id"] =\
            f"{SMAC2.__name__}_{time_stamp}_{run_id}"
        instance_names = scenario.instance_set.instance_names
        lock = FileLock(f"{output_target}.lock")
        with lock.acquire(timeout=60):
            performance_data = PerformanceDataFrame(output_target)
            # Resolve absolute path to Solver column
            solver = [s for s in performance_data.solvers
                      if Path(s).name == scenario.solver.name][0]
            # The instance paths in the instance set are absolute
            instances = [instance for instance in performance_data.instances
                         if Path(instance).name in instance_names]
            # We don't set the seed in the dataframe, as that should be part of the conf
            performance_data.set_value(
                value=[str(configuration)],
                solver=solver,
                instance=instances,
                objective=None,
                run=run_id,
                solver_fields=[PerformanceDataFrame.column_configuration]
            )
            performance_data.save_csv()
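    # Illustrative sketch of what organise_output extracts (parameter names and
    # values below are hypothetical): the last target-algorithm call line in the
    # SMAC2 output ends in a parameter string such as
    #   -init_solution '1' -perform_aspiration '1' ...
    # which Solver.config_str_to_dict turns into a dict such as
    #   {"init_solution": "1", "perform_aspiration": "1", ...}
    # before a "configuration_id" key is added and the configuration is written
    # to the PerformanceDataFrame.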

    @staticmethod
    def get_smac_run_obj(objective: SparkleObjective) -> str:
        """Return the SMAC run objective based on the Performance Measure.

        Returns:
            A string that represents the run objective set in the settings.
        """
        if objective.time:
            return "RUNTIME"
        return "QUALITY"

    def get_status_from_logs(self: SMAC2) -> None:
        """Method to scan the log files of the configurator for warnings."""
        base_dir = self.output_path / "scenarios"
        if not base_dir.exists():
            return
        print(f"Checking the log files of configurator {type(self).__name__} for "
              "warnings...")
        scenarios = [f for f in base_dir.iterdir() if f.is_dir()]
        for scenario in scenarios:
            log_dir = scenario / "outdir_train_configuration" \
                / (scenario.name + "_scenario")
            warn_files = glob.glob(str(log_dir) + "/log-warn*")
            non_empty = [log_file for log_file in warn_files
                         if Path(log_file).stat().st_size > 0]
            if len(non_empty) > 0:
                print(f"Scenario {scenario.name} has {len(non_empty)} warning(s), see "
                      "the following log file(s) for more information:")
                for log_file in non_empty:
                    print(f"\t-{log_file}")
            else:
                print(f"Scenario {scenario.name} has no warnings.")

class SMAC2Scenario(ConfigurationScenario):
    """Class to handle SMAC2 configuration scenarios."""
    def __init__(self: SMAC2Scenario,
                 solver: Solver,
                 instance_set: InstanceSet,
                 sparkle_objectives: list[SparkleObjective],
                 parent_directory: Path,
                 number_of_runs: int = None,
                 solver_calls: int = None,
                 max_iterations: int = None,
                 cpu_time: int = None,
                 wallclock_time: int = None,
                 cutoff_time: int = None,
                 target_cutoff_length: str = None,
                 cli_cores: int = None,
                 use_cpu_time_in_tunertime: bool = None,
                 feature_data: FeatureDataFrame | Path = None)\
            -> None:
        """Initialize scenario paths and names.

        Args:
            solver: Solver that should be configured.
            instance_set: Instances object for the scenario.
            sparkle_objectives: SparkleObjectives used for each run of the
                configuration. Will be simplified to the first objective.
            parent_directory: Directory in which the scenario should be created.
            number_of_runs: The number of configurator runs to perform
                for configuring the solver.
            solver_calls: The number of times the solver is called for each
                configuration run.
            max_iterations: The maximum number of iterations allowed for each
                configuration run. [iteration-limit, numIterations, numberOfIterations]
            cpu_time: The CPU time budget allocated for each configuration run.
            wallclock_time: The wallclock time budget allocated for each
                configuration run.
            cutoff_time: The maximum time allowed for each individual run during
                configuration.
            target_cutoff_length: A domain-specific measure of when the algorithm
                should consider itself done.
            cli_cores: The number of cores to use to execute runs.
                Defaults in SMAC2 to 1.
            use_cpu_time_in_tunertime: Whether to count SMAC2's own used CPU time
                towards the budget. Defaults in SMAC2 to True.
            feature_data: If features are used, this contains the feature data.
                If it is a FeatureDataFrame, values will be converted to SMAC2 format.
                If it is a Path, the path will be passed to SMAC2.
                Defaults to None.
        """
        super().__init__(solver, instance_set, sparkle_objectives, parent_directory)
        self.solver = solver
        self.instance_set = instance_set
        self.name = f"{self.solver.name}_{self.instance_set.name}"

        if sparkle_objectives is not None:
            self.sparkle_objective = sparkle_objectives[0]
        else:
            self.sparkle_objective = None

        self.number_of_runs = number_of_runs
        self.solver_calls = solver_calls
        self.cpu_time = cpu_time
        self.wallclock_time = wallclock_time
        self.cutoff_time = cutoff_time
        self.cutoff_length = target_cutoff_length
        self.max_iterations = max_iterations
        self.cli_cores = cli_cores
        self.use_cpu_time_in_tunertime = use_cpu_time_in_tunertime

        self.feature_data = feature_data
        self.feature_file_path = None
        if self.feature_data:
            if isinstance(self.feature_data, FeatureDataFrame):
                # Convert feature data to SMAC2 format
                data_dict = {}
                for instance in self.instance_set.instance_paths:
                    data_dict[str(instance)] = feature_data.get_instance(str(instance))

                self.feature_data = pd.DataFrame.from_dict(
                    data_dict, orient="index",
                    columns=[f"Feature{index+1}"
                             for index in range(feature_data.num_features)])

                def map_nan(x: str) -> float:
                    """Map non-numeric values to -512 (pre-defined by SMAC2)."""
                    if math.isnan(x):
                        return -512.0
                    try:
                        return float(x)
                    except Exception:
                        return -512.0

                self.feature_data = self.feature_data.map(map_nan)
                self.feature_file_path =\
                    self.directory / f"{self.instance_set.name}_features.csv"
            elif isinstance(self.feature_data, Path):  # Read from Path
                self.feature_file_path = feature_data
                self.feature_data = pd.read_csv(self.feature_file_path,
                                                index_col=0)
            else:
                print(f"WARNING: Feature data is of type {type(feature_data)}. "
                      "Expected FeatureDataFrame or Path.")

        # Scenario Paths
        self.instance_file_path = self.directory / f"{self.instance_set.name}.txt"

        # SMAC2 Specific
        self.outdir_train = self.directory / "outdir_train_configuration"
    def create_scenario(self: SMAC2Scenario) -> None:
        """Create scenario with solver and instances in the parent directory.

        This prepares all the necessary subdirectories related to configuration.
        """
        # Prepare scenario directory
        shutil.rmtree(self.directory, ignore_errors=True)
        self.directory.mkdir(parents=True)
        # Create empty directories as needed
        self.outdir_train.mkdir()
        self.tmp.mkdir()
        self.validation.mkdir()
        self.results_directory.mkdir(parents=True)  # Prepare results directory

        self._prepare_instances()

        if self.feature_data is not None:
            self._create_feature_file()

        self.create_scenario_file()
    def create_scenario_file(
            self: SMAC2Scenario,
            configurator_target: Path = SMAC2.configurator_target,
            pcs_port: PCSConvention = PCSConvention.SMAC) -> Path:
        """Create a file with the configuration scenario.

        Writes supplementary information to the target algorithm (algo =) as:
        algo = {configurator_target} {solver_directory} {tmp} {sparkle_objective}
        """
        with self.scenario_file_path.open("w") as file:
            file.write(f"algo = {configurator_target.absolute()} "
                       f"{self.solver.directory} {self.tmp} {self.sparkle_objective} \n"
                       f"deterministic = {1 if self.solver.deterministic else 0}\n"
                       f"run_obj = {self._get_performance_measure()}\n"
                       f"cutoffTime = {self.cutoff_time}\n"
                       f"paramfile = {self.solver.get_pcs_file(pcs_port)}\n"
                       f"outdir = {self.outdir_train}\n"
                       f"instance_file = {self.instance_file_path}\n"
                       f"test_instance_file = {self.instance_file_path}\n")
            if self.cutoff_length is not None:
                file.write(f"cutoff_length = {self.cutoff_length}\n")
            if self.max_iterations is not None:
                file.write(f"iteration-limit = {self.max_iterations}\n")
            if self.wallclock_time is not None:
                file.write(f"wallclock-limit = {self.wallclock_time}\n")
            if self.cpu_time is not None:
                file.write(f"cputime-limit = {self.cpu_time}\n")
            if self.solver_calls is not None:
                file.write(f"runcount-limit = {self.solver_calls}\n")
            if self.cli_cores is not None:
                file.write(f"cli-cores = {self.cli_cores}\n")
            if self.feature_data is not None:
                file.write(f"feature_file = {self.feature_file_path}\n")
            if self.use_cpu_time_in_tunertime is not None:
                file.write("use-cpu-time-in-tunertime = "
                           f"{self.use_cpu_time_in_tunertime}\n")
            # We don't let SMAC do the validation
            file.write("validation = false" + "\n")
        return self.scenario_file_path
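    # Illustrative sketch of a resulting scenario file. Paths and budget values
    # are hypothetical; only keys written above can appear, and optional keys are
    # omitted when their attribute is None:
    #
    #   algo = <configurator_target> <solver_dir> <tmp_dir> <objective>
    #   deterministic = 0
    #   run_obj = RUNTIME
    #   cutoffTime = 60
    #   paramfile = <pcs_file>
    #   outdir = <scenario_dir>/outdir_train_configuration
    #   instance_file = <scenario_dir>/<instance_set>.txt
    #   test_instance_file = <scenario_dir>/<instance_set>.txt
    #   wallclock-limit = 600
    #   validation = false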

    def _prepare_instances(self: SMAC2Scenario) -> None:
        """Create instance list file without instance specifics."""
        self.instance_file_path.parent.mkdir(exist_ok=True, parents=True)
        with self.instance_file_path.open("w+") as file:
            for instance_path in self.instance_set._instance_paths:
                file.write(f"{instance_path}\n")
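    # Illustrative instance file contents (one instance path per line; the
    # paths below are hypothetical):
    #
    #   /abs/path/to/Instances/PTN/bce7824.cnf
    #   /abs/path/to/Instances/PTN/bce7968.cnf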

    def _create_feature_file(self: SMAC2Scenario) -> None:
        """Create CSV file from feature data."""
        self.feature_data.to_csv(self.feature_file_path,
                                 index_label="INSTANCE_NAME")
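    # Illustrative feature file contents (instance path and feature values are
    # hypothetical; non-numeric values were mapped to -512.0 in __init__):
    #
    #   INSTANCE_NAME,Feature1,Feature2,Feature3
    #   /abs/path/to/Instances/PTN/bce7824.cnf,12.0,0.37,-512.0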

    def _get_performance_measure(self: SMAC2Scenario) -> str:
        """Retrieve the performance measure of the SparkleObjective.

        Returns:
            Performance measure of the sparkle objective.
        """
        if self.sparkle_objective.time:
            return "RUNTIME"
        return "QUALITY"
    def serialize_scenario(self: SMAC2Scenario) -> dict:
        """Transform ConfigurationScenario to dictionary format."""
        return {
            "number_of_runs": self.number_of_runs,
            "solver_calls": self.solver_calls,
            "cpu_time": self.cpu_time,
            "wallclock_time": self.wallclock_time,
            "cutoff_time": self.cutoff_time,
            "cutoff_length": self.cutoff_length,
            "max_iterations": self.max_iterations,
            "sparkle_objective": self.sparkle_objective.name,
            "feature_data": self.feature_file_path,
            "use_cpu_time_in_tunertime": self.use_cpu_time_in_tunertime
        }
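    # Illustrative serialization result (all values are hypothetical):
    #
    #   {"number_of_runs": 5, "solver_calls": None, "cpu_time": None,
    #    "wallclock_time": 600, "cutoff_time": 60, "cutoff_length": "max",
    #    "max_iterations": None, "sparkle_objective": "PAR10",
    #    "feature_data": None, "use_cpu_time_in_tunertime": None}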

    @staticmethod
    def from_file(scenario_file: Path) -> SMAC2Scenario:
        """Reads scenario file and initialises SMAC2Scenario."""
        config = {keyvalue[0]: keyvalue[1]
                  for keyvalue in (line.strip().split(" = ", maxsplit=1)
                                   for line in scenario_file.open().readlines()
                                   if line.strip() != "")}

        # Collect relevant settings; keys match those written by create_scenario_file
        cpu_time = int(config["cputime-limit"]) if "cputime-limit" in config \
            else None
        wallclock_limit = int(config["wallclock-limit"]) if "wallclock-limit" in config \
            else None
        solver_calls = int(config["runcount-limit"]) if "runcount-limit" in config \
            else None
        max_iterations = int(config["iteration-limit"]) if "iteration-limit" in config \
            else None
        use_cpu_time_in_tunertime = config["use-cpu-time-in-tunertime"]\
            if "use-cpu-time-in-tunertime" in config else None
        cli_cores = config["cli-cores"] if "cli-cores" in config else None

        _, solver_path, _, objective_str = config["algo"].split(" ")
        objective = resolve_objective(objective_str)
        solver = Solver(Path(solver_path.strip()))
        # Extract the instance set from the instance file
        instance_file_path = Path(config["instance_file"])
        instance_set_path = Path(instance_file_path.open().readline().strip()).parent
        instance_set = Instance_Set(Path(instance_set_path))
        results_folder = scenario_file.parent / "results"
        state_run_dirs = [p for p in results_folder.iterdir() if p.is_file()]
        number_of_runs = len(state_run_dirs)
        feature_data_path = None
        if "feature_file" in config:
            feature_data_path = Path(config["feature_file"])
        return SMAC2Scenario(solver,
                             instance_set,
                             [objective],
                             instance_file_path.parent.parent,
                             number_of_runs,
                             solver_calls,
                             max_iterations,
                             cpu_time,
                             wallclock_limit,
                             int(config["cutoffTime"]),
                             config["cutoff_length"],
                             cli_cores,
                             use_cpu_time_in_tunertime,
                             feature_data_path)
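# Illustrative usage sketch (not part of the module; the object names, paths and
# budgets below are hypothetical and assume a prepared Solver, InstanceSet,
# SparkleObjective and PerformanceDataFrame):
#
#   configurator = SMAC2(base_dir=Path("Tmp"), output_path=Path("Output"))
#   scenario = SMAC2Scenario(solver, instance_set, [objective],
#                            configurator.output_path,
#                            number_of_runs=5, wallclock_time=600,
#                            cutoff_time=60)
#   runs = configurator.configure(scenario, data_target=performance_data,
#                                 run_on=Runner.SLURM)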