Coverage for sparkle/configurator/implementations/smac3.py: 78%

143 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2025-01-07 15:22 +0000

1"""Configurator classes to implement SMAC3 in Sparkle.""" 

2from __future__ import annotations 

3from pathlib import Path 

4import shutil 

5 

6from smac import version as smac_version 

7from smac import Scenario as SmacScenario 

8from smac import facade as smacfacades 

9from smac.runhistory.enumerations import StatusType as SmacStatusType 

10import numpy as np 

11 

12from runrunner import Runner, Run 

13 

14from sparkle.configurator.configurator import Configurator, ConfigurationScenario 

15from sparkle.solver import Solver 

16from sparkle.structures import FeatureDataFrame, PerformanceDataFrame 

17from sparkle.instance import InstanceSet, Instance_Set 

18from sparkle.types import SparkleObjective, resolve_objective, SolverStatus 

19 

20 

class SMAC3(Configurator):
    """Class for SMAC3 (Python) configurator."""
    # Path to the SMAC3 component directory shipped with Sparkle
    configurator_path = Path(__file__).parent.parent.parent.resolve() /\
        "Components/smac3-v2.2.0"
    # Target algorithm wrapper called for each configuration run
    configurator_executable = configurator_path / "smac3_target_algorithm.py"

    version = smac_version
    full_name = "Sequential Model-based Algorithm Configuration"

    def __init__(self: SMAC3,
                 base_dir: Path,
                 output_path: Path) -> None:
        """Initialises the SMAC3 configurator, Python SMAC V2.2.0.

        Args:
            base_dir: The path where the configurator will be executed in.
            output_path: The path where the output will be placed.
        """
        output_path = output_path / SMAC3.__name__
        output_path.mkdir(parents=True, exist_ok=True)
        # NOTE: __init__ must not return a value; call the parent initialiser
        # directly instead of `return super().__init__(...)`.
        super().__init__(
            output_path=output_path,
            base_dir=base_dir,
            tmp_path=output_path / "tmp",
            multi_objective_support=False)

    @property
    def name(self: SMAC3) -> str:
        """Returns the name of the configurator."""
        return SMAC3.__name__

    @staticmethod
    def scenario_class() -> ConfigurationScenario:
        """Returns the SMAC3 scenario class."""
        return SMAC3Scenario

    def configure(self: SMAC3,
                  scenario: SMAC3Scenario,
                  data_target: PerformanceDataFrame,
                  validate_after: bool = True,
                  sbatch_options: list[str] = None,
                  num_parallel_jobs: int = None,
                  base_dir: Path = None,
                  run_on: Runner = Runner.SLURM) -> list[Run]:
        """Start configuration job.

        Args:
            scenario: ConfigurationScenario object
            data_target: PerformanceDataFrame where to store the found configurations
            validate_after: Whether the Validator will be called after the configuration
            sbatch_options: List of slurm batch options to use.
                Defaults to an empty list (None sentinel avoids a shared
                mutable default argument).
            num_parallel_jobs: The maximum number of jobs to run parallel.
            base_dir: The path where the sbatch scripts will be created for Slurm.
            run_on: On which platform to run the jobs. Default: Slurm.

        Returns:
            A RunRunner Run object.
        """
        if (scenario.smac3_scenario.walltime_limit
                == scenario.smac3_scenario.cputime_limit == np.inf):
            print("WARNING: Starting SMAC3 scenario without any time limit.")
        scenario.create_scenario()
        # We set the seed over the last n run ids in the dataframe
        seeds = data_target.run_ids[data_target.num_runs - scenario.number_of_runs:]
        num_parallel_jobs = num_parallel_jobs or scenario.number_of_runs
        # We do not require the configurator CLI as its already our own python wrapper
        cmds = [f"python3 {self.configurator_executable.absolute()} "
                f"{scenario.scenario_file_path.absolute()} {seed} "
                f"{data_target.csv_filepath}"
                for seed in seeds]
        return super().configure(
            configuration_commands=cmds,
            data_target=data_target,
            output=None,
            scenario=scenario,
            validation_ids=seeds if validate_after else None,
            sbatch_options=sbatch_options if sbatch_options is not None else [],
            num_parallel_jobs=num_parallel_jobs,
            base_dir=base_dir,
            run_on=run_on
        )

    @staticmethod
    def organise_output(output_source: Path,
                        output_target: Path,
                        scenario: SMAC3Scenario,
                        run_id: int) -> None | dict:
        """Method to restructure and clean up after a single configurator call.

        Args:
            output_source: SMAC3 runhistory JSON file produced by the run.
            output_target: PerformanceDataFrame CSV to update. When None or
                non-existent, the best configuration is returned instead.
            scenario: The scenario this run belongs to.
            run_id: The id of the configuration run.

        Returns:
            The best found configuration (a parameter dict) when no valid
            output_target is given, otherwise None.
        """
        import json
        from filelock import FileLock
        if not output_source.exists():
            print(f"SMAC3 ERROR: Output source file does not exist! [{output_source}]")
            return
        # Use a context manager so the file handle is closed after reading
        with output_source.open("r") as results_file:
            results_dict = json.load(results_file)
        configurations = list(results_dict["configs"].values())
        config_evals = [[] for _ in range(len(configurations))]
        objective = scenario.sparkle_objective
        for entry in results_dict["data"]:
            config_id, _, _, _, score, _, _, _, _, _ = entry
            # SMAC3 configuration ids start at 1
            config_evals[config_id - 1].append(score)
        config_evals = [objective.instance_aggregator(evaluations)
                        for evaluations in config_evals]
        best_config = configurations[
            config_evals.index(objective.solver_aggregator(config_evals))]
        if output_target is None or not output_target.exists():
            return best_config

        time_stamp = scenario.scenario_file_path.stat().st_mtime
        best_config["configuration_id"] =\
            f"{SMAC3.__name__}_{time_stamp}_{run_id}"
        instance_names = scenario.instance_set.instance_names
        lock = FileLock(f"{output_target}.lock")
        # Hold the lock while reading and updating the shared dataframe;
        # the context manager releases it on exit (no explicit release needed).
        with lock.acquire(timeout=60):
            performance_data = PerformanceDataFrame(output_target)
            # Resolve absolute path to Solver column
            solver = [s for s in performance_data.solvers
                      if Path(s).name == scenario.solver.name][0]
            # For some reason the instance paths in the instance set are absolute
            instances = [instance for instance in performance_data.instances
                         if Path(instance).name in instance_names]
            # We don't set the seed in the dataframe, as that should be part of the conf
            performance_data.set_value(
                value=[str(best_config)],
                solver=solver,
                instance=instances,
                objective=None,
                run=run_id,
                solver_fields=[PerformanceDataFrame.column_configuration]
            )
            performance_data.save_csv()

    def get_status_from_logs(self: SMAC3) -> None:
        """Method to scan the log files of the configurator for warnings."""
        raise NotImplementedError

    @staticmethod
    def convert_status(status: SolverStatus) -> SmacStatusType:
        """Converts Sparkle Solver status to SMAC3 target status."""
        mapping = {
            SolverStatus.SUCCESS: SmacStatusType.SUCCESS,
            SolverStatus.CRASHED: SmacStatusType.CRASHED,
            SolverStatus.TIMEOUT: SmacStatusType.TIMEOUT,
            SolverStatus.WRONG: SmacStatusType.CRASHED,
            SolverStatus.UNKNOWN: SmacStatusType.CRASHED,
            SolverStatus.ERROR: SmacStatusType.CRASHED,
            SolverStatus.KILLED: SmacStatusType.TIMEOUT,
        }
        return mapping[status]

172 

173 

class SMAC3Scenario(ConfigurationScenario):
    """Class to handle SMAC3 configuration scenarios."""

    def __init__(self: SMAC3Scenario,
                 solver: Solver,
                 instance_set: InstanceSet,
                 sparkle_objectives: list[SparkleObjective],
                 parent_directory: Path,
                 cutoff_time: int = None,
                 number_of_runs: int = None,
                 smac_facade: smacfacades.AbstractFacade | str =
                 smacfacades.AlgorithmConfigurationFacade,
                 crash_cost: float | list[float] = np.inf,
                 termination_cost_threshold: float | list[float] = np.inf,
                 walltime_limit: float = np.inf,
                 cputime_limit: float = np.inf,
                 solver_calls: int = None,
                 use_default_config: bool = False,
                 feature_data: FeatureDataFrame | Path = None,
                 min_budget: float | int | None = None,
                 max_budget: float | int | None = None,
                 seed: int = -1,
                 n_workers: int = 1,
                 max_ratio: float = None,
                 smac3_output_directory: Path = Path(),
                 ) -> None:
        """Initialize scenario paths and names.

        Args:
            solver: Solver
                The solver to use for configuration.
            instance_set: InstanceSet
                The instance set to use for configuration.
            sparkle_objectives: list[SparkleObjective]
                The objectives to optimize.
            parent_directory: Path
                The parent directory where the configuration files will be stored.
            cutoff_time: int
                Maximum CPU runtime in seconds that each solver call (trial)
                is allowed to run. Is managed by RunSolver, not pynisher.
            number_of_runs: int
                The number of times this scenario will be executed with different seeds.
            smac_facade: AbstractFacade, defaults to AlgorithmConfigurationFacade
                The SMAC facade to use for Optimisation.
            crash_cost: float | list[float], defaults to np.inf
                Defines the cost for a failed trial. In case of multi-objective,
                each objective can be associated with a different cost.
            termination_cost_threshold: float | list[float], defaults to np.inf
                Defines a cost threshold when the optimization should stop. In case of
                multi-objective, each objective *must* be associated with a cost.
                The optimization stops when all objectives crossed the threshold.
            walltime_limit: float, defaults to np.inf
                The maximum time in seconds that SMAC is allowed to run. Only counts
                solver time.
            cputime_limit: float, defaults to np.inf
                The maximum CPU time in seconds that SMAC is allowed to run. Only counts
                solver time. WARNING: SMAC3 uses "runtime" (walltime) for CPU time
                when determining cputime budget.
            solver_calls: int, defaults to None
                The maximum number of trials (combination of configuration, seed, budget,
                and instance, depending on the task) to run. If left as None, will be
                calculated as int(cputime or walltime limit / cutoff time) when a
                finite limit and a cutoff time are available; otherwise falls back
                to the SMAC3 default of 100.
            use_default_config: bool, defaults to False
                If True, the configspace's default configuration is evaluated in the
                initial design. For historic benchmark reasons, this is False by default.
                Notice, that this will result in n_configs + 1 for the initial design.
                Respecting n_trials, this will result in one fewer evaluated
                configuration in the optimization.
            feature_data: FeatureDataFrame or Path, defaults to None
                Instances can be associated with features. For example, meta data of
                the dataset (mean, var, ...) can be incorporated which are then further
                used to expand the training data of the surrogate model. If Path, loaded
                from file. When no features are given, uses index as instance features.
            min_budget: float | int | None, defaults to None
                The minimum budget (epochs, subset size, number of instances, ...) that
                is used for the optimization. Use this argument if you use multi-fidelity
                or instance optimization.
            max_budget: float | int | None, defaults to None
                The maximum budget (epochs, subset size, number of instances, ...) that
                is used for the optimization. Use this argument if you use multi-fidelity
                or instance optimization.
            seed: int, defaults to -1
                The seed is used to make results reproducible.
                If seed is -1, SMAC will generate a random seed.
            n_workers: int, defaults to 1
                The number of workers to use for parallelization.
                If `n_workers` is greater than 1, SMAC will use DASK to parallelize the
                optimization.
            max_ratio: float, defaults to None.
                Facade uses at most scenario.n_trials * max_ratio number of
                configurations in the initial design. Additional configurations are not
                affected by this parameter. Not applicable to each facade.
            smac3_output_directory: Path, defaults to Path()
                The output subdirectory for the SMAC3 scenario. Defaults to the scenario
                results directory.
        """
        super().__init__(solver, instance_set, sparkle_objectives, parent_directory)
        # The files are saved in `./output_directory/name/seed`.
        self.log_dir = self.directory / "logs"
        self.number_of_runs = number_of_runs
        self.feature_data = feature_data
        if isinstance(self.feature_data, Path):  # Load from file
            self.feature_data = FeatureDataFrame(self.feature_data)

        # Facade parameters
        self.smac_facade = smac_facade
        if isinstance(self.smac_facade, str):  # Resolve facade by class name
            self.smac_facade = getattr(smacfacades, self.smac_facade)
        self.max_ratio = max_ratio

        if self.feature_data is not None:
            instance_features =\
                {instance: self.feature_data.get_instance(str(instance))
                 for instance in self.instance_set.instance_paths}
        else:
            # 'If no instance features are passed, the runhistory encoder can not
            # distinguish between different instances and therefore returns the same data
            # points with different values, all of which are used to train the surrogate
            # model. Consider using instance indices as features.'
            instance_features = {name: [index] for index, name
                                 in enumerate(instance_set.instance_paths)}

        # NOTE: Patchfix; SMAC3 can handle MO but Sparkle also gives non-user specified
        # objectives but not all class methods can handle it here yet
        self.sparkle_objective = sparkle_objectives[0]

        # NOTE: We don't use trial_walltime_limit as a way of managing resources
        # As it uses pynisher to do it (python based) and our targets are maybe not
        # RunSolver is the better option for accuracy.
        self.cutoff_time = cutoff_time
        if solver_calls is None:  # If solver calls is None, try to calculate it
            # Only finite, non-zero limits can be used for the calculation:
            # int(np.inf / cutoff) raises OverflowError, so infinite limits
            # fall through to the SMAC3 default instead of crashing.
            finite_cputime = cputime_limit and np.isfinite(cputime_limit)
            finite_walltime = walltime_limit and np.isfinite(walltime_limit)
            if self.cutoff_time is not None and finite_cputime:
                solver_calls = int(cputime_limit / self.cutoff_time)
            elif self.cutoff_time is not None and finite_walltime:
                solver_calls = int(walltime_limit / self.cutoff_time)
            else:
                solver_calls = 100  # SMAC3 Default value
        self.smac3_scenario = SmacScenario(
            configspace=solver.get_configspace(),
            name=self.name,
            output_directory=self.results_directory / smac3_output_directory,
            deterministic=solver.deterministic,
            objectives=[self.sparkle_objective.name],
            crash_cost=crash_cost,
            termination_cost_threshold=termination_cost_threshold,
            walltime_limit=walltime_limit,
            cputime_limit=cputime_limit,
            n_trials=solver_calls,
            use_default_config=use_default_config,
            instances=instance_set.instance_paths,
            instance_features=instance_features,
            min_budget=min_budget,
            max_budget=max_budget,
            seed=seed,
            n_workers=n_workers
        )

    def create_scenario(self: ConfigurationScenario) -> None:
        """Create scenario with solver and instances in the parent directory.

        This prepares all the necessary subdirectories related to configuration.
        Any pre-existing scenario directory is removed first.
        """
        shutil.rmtree(self.directory, ignore_errors=True)
        self.directory.mkdir(parents=True)
        # Create empty directories as needed
        self.results_directory.mkdir(parents=True)  # Prepare results directory
        self.log_dir.mkdir(parents=True)
        self.validation.mkdir(parents=True, exist_ok=True)
        self.create_scenario_file()

    def create_scenario_file(self: ConfigurationScenario) -> Path:
        """Create a file with the configuration scenario.

        Returns:
            Path to the written scenario file.
        """
        with self.scenario_file_path.open("w") as file:
            for key, value in self.serialize().items():
                file.write(f"{key} = {value}\n")
        return self.scenario_file_path

    def serialize(self: ConfigurationScenario) -> dict:
        """Serialize the configuration scenario."""
        # `is not None` (not truthiness) for consistency with __init__
        feature_data =\
            self.feature_data.csv_filepath if self.feature_data is not None else None
        return {
            "solver": self.solver.directory,
            "instance_set": self.instance_set.directory,
            "sparkle_objectives": ",".join(self.smac3_scenario.objectives),
            "cutoff_time": self.cutoff_time,
            "number_of_runs": self.number_of_runs,
            "smac_facade": self.smac_facade.__name__,
            "crash_cost": self.smac3_scenario.crash_cost,
            "termination_cost_threshold": self.smac3_scenario.termination_cost_threshold,
            "walltime_limit": self.smac3_scenario.walltime_limit,
            "cputime_limit": self.smac3_scenario.cputime_limit,
            "solver_calls": self.smac3_scenario.n_trials,
            "use_default_config": self.smac3_scenario.use_default_config,
            "feature_data": feature_data,
            "min_budget": self.smac3_scenario.min_budget,
            "max_budget": self.smac3_scenario.max_budget,
            "seed": self.smac3_scenario.seed,
            "n_workers": self.smac3_scenario.n_workers,
        }

    @staticmethod
    def from_file(scenario_file: Path,
                  run_index: int = None) -> ConfigurationScenario:
        """Reads scenario file and initalises ConfigurationScenario.

        Args:
            scenario_file: Path to scenario file.
            run_index: If given, reads as the scenario with run_index for offset
                in output directory and seed.

        Returns:
            ConfigurationScenario.
        """
        import ast
        variables = {keyvalue[0]: keyvalue[1].strip()
                     for keyvalue in (line.split(" = ", maxsplit=1)
                                      for line in scenario_file.open().readlines()
                                      if line.strip() != "")}
        variables["solver"] = Solver(Path(variables["solver"]))
        variables["instance_set"] = Instance_Set(Path(variables["instance_set"]))
        variables["sparkle_objectives"] = [
            resolve_objective(o)
            for o in variables["sparkle_objectives"].split(",")]
        variables["parent_directory"] = scenario_file.parent.parent
        # literal_eval instead of int(): both values may be serialized as None
        variables["cutoff_time"] = ast.literal_eval(variables["cutoff_time"])
        variables["number_of_runs"] = ast.literal_eval(variables["number_of_runs"])
        variables["smac_facade"] = getattr(smacfacades, variables["smac_facade"])

        # We need to support both lists of floats and single float (np.inf is fine)
        if variables["crash_cost"].startswith("["):
            variables["crash_cost"] =\
                [float(v) for v in ast.literal_eval(variables["crash_cost"])]
        else:
            variables["crash_cost"] = float(variables["crash_cost"])
        if variables["termination_cost_threshold"].startswith("["):
            variables["termination_cost_threshold"] =\
                [float(v) for v in ast.literal_eval(
                    variables["termination_cost_threshold"])]
        else:
            variables["termination_cost_threshold"] =\
                float(variables["termination_cost_threshold"])

        variables["walltime_limit"] = float(variables["walltime_limit"])
        variables["cputime_limit"] = float(variables["cputime_limit"])
        variables["solver_calls"] = ast.literal_eval(variables["solver_calls"])
        variables["use_default_config"] =\
            ast.literal_eval(variables["use_default_config"])

        if variables["feature_data"] != "None":
            variables["feature_data"] = Path(variables["feature_data"])
        else:
            variables["feature_data"] = None

        variables["min_budget"] = ast.literal_eval(variables["min_budget"])
        variables["max_budget"] = ast.literal_eval(variables["max_budget"])

        variables["seed"] = ast.literal_eval(variables["seed"])
        variables["n_workers"] = ast.literal_eval(variables["n_workers"])
        if run_index is not None:  # Offset seed and output dir per run
            variables["seed"] += run_index
            variables["smac3_output_directory"] = Path(f"run_{run_index}")

        return SMAC3Scenario(**variables)