Coverage for sparkle/configurator/implementations/smac3.py: 78%

143 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-04-03 10:42 +0000

1"""Configurator classes to implement SMAC3 in Sparkle.""" 

2from __future__ import annotations 

3from pathlib import Path 

4import shutil 

5 

6from smac import version as smac_version 

7from smac import Scenario as SmacScenario 

8from smac import facade as smacfacades 

9from smac.runhistory.enumerations import StatusType as SmacStatusType 

10import numpy as np 

11 

12from runrunner import Runner, Run 

13 

14from sparkle.configurator.configurator import Configurator, ConfigurationScenario 

15from sparkle.solver import Solver 

16from sparkle.structures import FeatureDataFrame, PerformanceDataFrame 

17from sparkle.instance import InstanceSet, Instance_Set 

18from sparkle.types import SparkleObjective, resolve_objective, SolverStatus 

19 

20 

class SMAC3(Configurator):
    """Class for SMAC3 (Python) configurator."""
    # Location of the bundled SMAC3 component and the Sparkle target-algorithm
    # wrapper script that SMAC3 calls for each trial.
    configurator_path = Path(__file__).parent.parent.parent.resolve() /\
        "Components/smac3-v2.2.0"
    configurator_executable = configurator_path / "smac3_target_algorithm.py"

    version = smac_version
    full_name = "Sequential Model-based Algorithm Configuration"

    def __init__(self: SMAC3,
                 base_dir: Path,
                 output_path: Path) -> None:
        """Initialise the SMAC3 configurator, Python SMAC V2.2.0.

        Args:
            base_dir: The path where the configurator will be executed in.
            output_path: The path where the output will be placed.
        """
        output_path = output_path / SMAC3.__name__
        output_path.mkdir(parents=True, exist_ok=True)
        # NOTE: __init__ must not return a value; delegate to the parent class.
        super().__init__(
            output_path=output_path,
            base_dir=base_dir,
            tmp_path=output_path / "tmp",
            multi_objective_support=False)

    @property
    def name(self: SMAC3) -> str:
        """Returns the name of the configurator."""
        return SMAC3.__name__

    @staticmethod
    def scenario_class() -> ConfigurationScenario:
        """Returns the SMAC3 scenario class."""
        return SMAC3Scenario

    def configure(self: SMAC3,
                  scenario: SMAC3Scenario,
                  data_target: PerformanceDataFrame,
                  validate_after: bool = True,
                  sbatch_options: list[str] = None,
                  slurm_prepend: str | list[str] | Path = None,
                  num_parallel_jobs: int = None,
                  base_dir: Path = None,
                  run_on: Runner = Runner.SLURM) -> list[Run]:
        """Start configuration job.

        Args:
            scenario: ConfigurationScenario object
            data_target: PerformanceDataFrame where to store the found configurations
            validate_after: Whether the Validator will be called after the configuration
            sbatch_options: List of slurm batch options to use. Defaults to an
                empty list when not given.
            slurm_prepend: Slurm script to prepend to the sbatch
            num_parallel_jobs: The maximum number of jobs to run parallel.
            base_dir: The path where the sbatch scripts will be created for Slurm.
            run_on: On which platform to run the jobs. Default: Slurm.

        Returns:
            A RunRunner Run object.
        """
        # Avoid the shared-mutable-default-argument pitfall ([] as default).
        sbatch_options = [] if sbatch_options is None else sbatch_options
        if (scenario.smac3_scenario.walltime_limit
                == scenario.smac3_scenario.cputime_limit == np.inf):
            print("WARNING: Starting SMAC3 scenario without any time limit.")
        scenario.create_scenario()
        # We set the seed over the last n run ids in the dataframe
        seeds = data_target.run_ids[data_target.num_runs - scenario.number_of_runs:]
        num_parallel_jobs = num_parallel_jobs or scenario.number_of_runs
        # We do not require the configurator CLI as its already our own python wrapper
        cmds = [f"python3 {self.configurator_executable.absolute()} "
                f"{scenario.scenario_file_path.absolute()} {seed} "
                f"{data_target.csv_filepath}"
                for seed in seeds]
        return super().configure(
            configuration_commands=cmds,
            data_target=data_target,
            output=None,
            scenario=scenario,
            validation_ids=seeds if validate_after else None,
            sbatch_options=sbatch_options,
            slurm_prepend=slurm_prepend,
            num_parallel_jobs=num_parallel_jobs,
            base_dir=base_dir,
            run_on=run_on
        )

    @staticmethod
    def organise_output(output_source: Path,
                        output_target: Path,
                        scenario: SMAC3Scenario,
                        run_id: int) -> None | str:
        """Method to restructure and clean up after a single configurator call.

        Reads the SMAC3 run history JSON, aggregates the evaluations per
        configuration with the scenario objective, and writes the best found
        configuration into the target PerformanceDataFrame.

        Args:
            output_source: SMAC3 run history JSON file of a single run.
            output_target: PerformanceDataFrame CSV file to store the result in.
                If None or non-existent, the best configuration is returned.
            scenario: The scenario this run belongs to.
            run_id: Index (seed offset) of the run within the scenario.

        Returns:
            The best configuration when no valid output_target is given,
            otherwise None.
        """
        import json
        from filelock import FileLock
        if not output_source.exists():
            print(f"SMAC3 ERROR: Output source file does not exist! [{output_source}]")
            return
        results_dict = json.load(output_source.open("r"))
        configurations = [value for _, value in results_dict["configs"].items()]
        config_evals = [[] for _ in range(len(configurations))]
        objective = scenario.sparkle_objective
        for entry in results_dict["data"]:
            config_id, _, _, _, score, _, _, _, _, _ = entry
            # SMAC3 configuration ids start at 1
            config_evals[config_id - 1].append(score)
        config_evals = [objective.instance_aggregator(evaluations)
                        for evaluations in config_evals]
        best_config = configurations[
            config_evals.index(objective.solver_aggregator(config_evals))]
        if output_target is None or not output_target.exists():
            return best_config

        time_stamp = scenario.scenario_file_path.stat().st_mtime
        best_config["configuration_id"] =\
            f"{SMAC3.__name__}_{time_stamp}_{run_id}"
        instance_names = scenario.instance_set.instance_names
        lock = FileLock(f"{output_target}.lock")
        # The acquire() context manager releases the lock on exit; no manual
        # release() call is needed (the original redundant release is removed).
        with lock.acquire(timeout=60):
            performance_data = PerformanceDataFrame(output_target)
            # Resolve absolute path to Solver column
            solver = [s for s in performance_data.solvers
                      if Path(s).name == scenario.solver.name][0]
            # For some reason the instance paths in the instance set are absolute
            instances = [instance for instance in performance_data.instances
                         if Path(instance).name in instance_names]
            # We don't set the seed in the dataframe, as that should be part of the conf
            performance_data.set_value(
                value=[str(best_config)],
                solver=solver,
                instance=instances,
                objective=None,
                run=run_id,
                solver_fields=[PerformanceDataFrame.column_configuration]
            )
            performance_data.save_csv()

    def get_status_from_logs(self: SMAC3) -> None:
        """Method to scan the log files of the configurator for warnings."""
        raise NotImplementedError

    @staticmethod
    def convert_status(status: SolverStatus) -> SmacStatusType:
        """Converts Sparkle Solver status to SMAC3 target status."""
        mapping = {
            SolverStatus.SUCCESS: SmacStatusType.SUCCESS,
            SolverStatus.CRASHED: SmacStatusType.CRASHED,
            SolverStatus.TIMEOUT: SmacStatusType.TIMEOUT,
            SolverStatus.WRONG: SmacStatusType.CRASHED,
            SolverStatus.UNKNOWN: SmacStatusType.CRASHED,
            SolverStatus.ERROR: SmacStatusType.CRASHED,
            SolverStatus.KILLED: SmacStatusType.TIMEOUT,
            SolverStatus.SAT: SmacStatusType.SUCCESS,
            SolverStatus.UNSAT: SmacStatusType.SUCCESS
        }
        return mapping[status]

177 

178 

class SMAC3Scenario(ConfigurationScenario):
    """Class to handle SMAC3 configuration scenarios."""

    def __init__(self: SMAC3Scenario,
                 solver: Solver,
                 instance_set: InstanceSet,
                 sparkle_objectives: list[SparkleObjective],
                 parent_directory: Path,
                 cutoff_time: int = None,
                 number_of_runs: int = None,
                 smac_facade: smacfacades.AbstractFacade | str =
                 smacfacades.AlgorithmConfigurationFacade,
                 crash_cost: float | list[float] = np.inf,
                 termination_cost_threshold: float | list[float] = np.inf,
                 walltime_limit: float = np.inf,
                 cputime_limit: float = np.inf,
                 solver_calls: int = None,
                 use_default_config: bool = False,
                 feature_data: FeatureDataFrame | Path = None,
                 min_budget: float | int | None = None,
                 max_budget: float | int | None = None,
                 seed: int = -1,
                 n_workers: int = 1,
                 max_ratio: float = None,
                 smac3_output_directory: Path = Path(),
                 ) -> None:
        """Initialize scenario paths and names.

        Args:
            solver: Solver
                The solver to use for configuration.
            instance_set: InstanceSet
                The instance set to use for configuration.
            sparkle_objectives: list[SparkleObjective]
                The objectives to optimize.
            parent_directory: Path
                The parent directory where the configuration files will be stored.
            cutoff_time: int
                Maximum CPU runtime in seconds that each solver call (trial)
                is allowed to run. Is managed by RunSolver, not pynisher.
            number_of_runs: int
                The number of times this scenario will be executed with different seeds.
            smac_facade: AbstractFacade, defaults to AlgorithmConfigurationFacade
                The SMAC facade to use for Optimisation.
            crash_cost: float | list[float], defaults to np.inf
                Defines the cost for a failed trial. In case of multi-objective,
                each objective can be associated with a different cost.
            termination_cost_threshold: float | list[float], defaults to np.inf
                Defines a cost threshold when the optimization should stop. In case of
                multi-objective, each objective *must* be associated with a cost.
                The optimization stops when all objectives crossed the threshold.
            walltime_limit: float, defaults to np.inf
                The maximum time in seconds that SMAC is allowed to run. Only counts
                solver time.
            cputime_limit: float, defaults to np.inf
                The maximum CPU time in seconds that SMAC is allowed to run. Only counts
                solver time. WARNING: SMAC3 uses "runtime" (walltime) for CPU time
                when determining cputime budget.
            solver_calls: int, defaults to None
                The maximum number of trials (combination of configuration, seed, budget,
                and instance, depending on the task) to run. If left as None, will be
                calculated as int(cputime or walltime limit / cutoff time) when a
                finite time limit is available, otherwise falls back to 100
                (the SMAC3 default).
            use_default_config: bool, defaults to False
                If True, the configspace's default configuration is evaluated in the
                initial design. For historic benchmark reasons, this is False by default.
                Notice, that this will result in n_configs + 1 for the initial design.
                Respecting n_trials, this will result in one fewer evaluated
                configuration in the optimization.
            feature_data: FeatureDataFrame or Path, defaults to None
                Instances can be associated with features. For example, meta data of
                the dataset (mean, var, ...) can be incorporated which are then further
                used to expand the training data of the surrogate model. If Path, loaded
                from file. When no features are given, uses index as instance features.
            min_budget: float | int | None, defaults to None
                The minimum budget (epochs, subset size, number of instances, ...) that
                is used for the optimization. Use this argument if you use multi-fidelity
                or instance optimization.
            max_budget: float | int | None, defaults to None
                The maximum budget (epochs, subset size, number of instances, ...) that
                is used for the optimization. Use this argument if you use multi-fidelity
                or instance optimization.
            seed: int, defaults to -1
                The seed is used to make results reproducible.
                If seed is -1, SMAC will generate a random seed.
            n_workers: int, defaults to 1
                The number of workers to use for parallelization.
                If `n_workers` is greather than 1, SMAC will use DASK to parallelize the
                optimization.
            max_ratio: float, defaults to None.
                Facade uses at most scenario.n_trials * max_ratio number of
                configurations in the initial design. Additional configurations are not
                affected by this parameter. Not applicable to each facade.
            smac3_output_directory: Path, defaults to Path()
                The output subdirectory for the SMAC3 scenario. Defaults to the scenario
                results directory.
        """
        super().__init__(solver, instance_set, sparkle_objectives, parent_directory)
        # The files are saved in `./output_directory/name/seed`.
        self.log_dir = self.directory / "logs"
        self.number_of_runs = number_of_runs
        self.feature_data = feature_data
        if isinstance(self.feature_data, Path):  # Load from file
            self.feature_data = FeatureDataFrame(self.feature_data)

        # Facade parameters
        self.smac_facade = smac_facade
        if isinstance(self.smac_facade, str):
            self.smac_facade = getattr(smacfacades, self.smac_facade)
        self.max_ratio = max_ratio

        if self.feature_data is not None:
            instance_features =\
                {instance: self.feature_data.get_instance(str(instance))
                 for instance in self.instance_set.instance_paths}
        else:
            # 'If no instance features are passed, the runhistory encoder can not
            # distinguish between different instances and therefore returns the same data
            # points with different values, all of which are used to train the surrogate
            # model. Consider using instance indices as features.'
            instance_features = {name: [index] for index, name
                                 in enumerate(instance_set.instance_paths)}

        # NOTE: Patchfix; SMAC3 can handle MO but Sparkle also gives non-user specified
        # objectives but not all class methods can handle it here yet
        self.sparkle_objective = sparkle_objectives[0]

        # NOTE: We don't use trial_walltime_limit as a way of managing resources
        # As it uses pynisher to do it (python based) and our targets are maybe not
        # RunSolver is the better option for accuracy.
        self.cutoff_time = cutoff_time
        if solver_calls is None:  # If solver calls is None, try to calculate it
            # Guard with isfinite: int(np.inf / cutoff_time) raises OverflowError,
            # which the previous truthiness-only check did not prevent.
            if self.cutoff_time is not None and (
                    cputime_limit and np.isfinite(cputime_limit)):
                solver_calls = int(cputime_limit / self.cutoff_time)
            elif self.cutoff_time is not None and (
                    walltime_limit and np.isfinite(walltime_limit)):
                solver_calls = int(walltime_limit / self.cutoff_time)
            else:
                solver_calls = 100  # SMAC3 Default value
        self.smac3_scenario = SmacScenario(
            configspace=solver.get_cs(),
            name=self.name,
            output_directory=self.results_directory / smac3_output_directory,
            deterministic=solver.deterministic,
            objectives=[self.sparkle_objective.name],
            crash_cost=crash_cost,
            termination_cost_threshold=termination_cost_threshold,
            walltime_limit=walltime_limit,
            cputime_limit=cputime_limit,
            n_trials=solver_calls,
            use_default_config=use_default_config,
            instances=instance_set.instance_paths,
            instance_features=instance_features,
            min_budget=min_budget,
            max_budget=max_budget,
            seed=seed,
            n_workers=n_workers
        )

    def create_scenario(self: SMAC3Scenario) -> None:
        """Create scenario with solver and instances in the parent directory.

        This prepares all the necessary subdirectories related to configuration.
        Any pre-existing scenario directory is removed first.
        """
        shutil.rmtree(self.directory, ignore_errors=True)
        self.directory.mkdir(parents=True)
        # Create empty directories as needed
        self.results_directory.mkdir(parents=True)  # Prepare results directory
        self.log_dir.mkdir(parents=True)
        self.validation.mkdir(parents=True, exist_ok=True)
        self.create_scenario_file()

    def create_scenario_file(self: SMAC3Scenario) -> Path:
        """Create a file with the configuration scenario.

        Returns:
            Path to the created scenario file.
        """
        with self.scenario_file_path.open("w") as file:
            for key, value in self.serialize().items():
                file.write(f"{key} = {value}\n")
        # Return the path so the annotated return type holds.
        return self.scenario_file_path

    def serialize(self: SMAC3Scenario) -> dict:
        """Serialize the configuration scenario."""
        # Explicit None check: truthiness of a DataFrame-backed object can be
        # ambiguous and raise ValueError.
        feature_data =\
            self.feature_data.csv_filepath if self.feature_data is not None else None
        return {
            "solver": self.solver.directory,
            "instance_set": self.instance_set.directory,
            "sparkle_objectives": ",".join(self.smac3_scenario.objectives),
            "cutoff_time": self.cutoff_time,
            "number_of_runs": self.number_of_runs,
            "smac_facade": self.smac_facade.__name__,
            "crash_cost": self.smac3_scenario.crash_cost,
            "termination_cost_threshold": self.smac3_scenario.termination_cost_threshold,
            "walltime_limit": self.smac3_scenario.walltime_limit,
            "cputime_limit": self.smac3_scenario.cputime_limit,
            "solver_calls": self.smac3_scenario.n_trials,
            "use_default_config": self.smac3_scenario.use_default_config,
            "feature_data": feature_data,
            "min_budget": self.smac3_scenario.min_budget,
            "max_budget": self.smac3_scenario.max_budget,
            "seed": self.smac3_scenario.seed,
            "n_workers": self.smac3_scenario.n_workers,
        }

    @staticmethod
    def from_file(scenario_file: Path,
                  run_index: int = None) -> ConfigurationScenario:
        """Reads scenario file and initalises ConfigurationScenario.

        Args:
            scenario_file: Path to scenario file.
            run_index: If given, reads as the scenario with run_index for offset
                in output directory and seed.

        Returns:
            ConfigurationScenario.
        """
        import ast
        variables = {keyvalue[0]: keyvalue[1].strip()
                     for keyvalue in (line.split(" = ", maxsplit=1)
                                      for line in scenario_file.open().readlines()
                                      if line.strip() != "")}
        variables["solver"] = Solver(Path(variables["solver"]))
        variables["instance_set"] = Instance_Set(Path(variables["instance_set"]))
        variables["sparkle_objectives"] = [
            resolve_objective(o)
            for o in variables["sparkle_objectives"].split(",")]
        variables["parent_directory"] = scenario_file.parent.parent
        # literal_eval instead of int(): these fields may be serialized as
        # "None", which int() cannot parse.
        variables["cutoff_time"] = ast.literal_eval(variables["cutoff_time"])
        variables["number_of_runs"] = ast.literal_eval(variables["number_of_runs"])
        variables["smac_facade"] = getattr(smacfacades, variables["smac_facade"])

        # We need to support both lists of floats and single float (np.inf is fine)
        if variables["crash_cost"].startswith("["):
            variables["crash_cost"] =\
                [float(v) for v in ast.literal_eval(variables["crash_cost"])]
        else:
            variables["crash_cost"] = float(variables["crash_cost"])
        if variables["termination_cost_threshold"].startswith("["):
            variables["termination_cost_threshold"] =\
                [float(v) for v in ast.literal_eval(
                    variables["termination_cost_threshold"])]
        else:
            variables["termination_cost_threshold"] =\
                float(variables["termination_cost_threshold"])

        variables["walltime_limit"] = float(variables["walltime_limit"])
        variables["cputime_limit"] = float(variables["cputime_limit"])
        variables["solver_calls"] = ast.literal_eval(variables["solver_calls"])
        variables["use_default_config"] =\
            ast.literal_eval(variables["use_default_config"])

        if variables["feature_data"] != "None":
            variables["feature_data"] = Path(variables["feature_data"])
        else:
            variables["feature_data"] = None

        variables["min_budget"] = ast.literal_eval(variables["min_budget"])
        variables["max_budget"] = ast.literal_eval(variables["max_budget"])

        variables["seed"] = ast.literal_eval(variables["seed"])
        variables["n_workers"] = ast.literal_eval(variables["n_workers"])
        if run_index is not None:  # Offset
            variables["seed"] += run_index
            variables["smac3_output_directory"] = Path(f"run_{run_index}")

        return SMAC3Scenario(**variables)