Coverage for sparkle/configurator/implementations/smac3.py: 78%

143 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2025-01-07 15:22 +0000

1"""Configurator classes to implement SMAC3 in Sparkle.""" 

2from __future__ import annotations 

3from pathlib import Path 

4import shutil 

5 

6from smac import version as smac_version 

7from smac import Scenario as SmacScenario 

8from smac import facade as smacfacades 

9from smac.runhistory.enumerations import StatusType as SmacStatusType 

10import numpy as np 

11 

12from runrunner import Runner, Run 

13 

14from sparkle.configurator.configurator import Configurator, ConfigurationScenario 

15from sparkle.solver import Solver 

16from sparkle.structures import FeatureDataFrame, PerformanceDataFrame 

17from sparkle.instance import InstanceSet, Instance_Set 

18from sparkle.types import SparkleObjective, resolve_objective, SolverStatus 

19 

20 

class SMAC3(Configurator):
    """Class for SMAC3 (Python) configurator."""
    # Path to the SMAC3 component directory shipped with Sparkle
    configurator_path = Path(__file__).parent.parent.parent.resolve() /\
        "Components/smac3-v2.2.0"
    # Target algorithm wrapper called for each configuration run
    configurator_executable = configurator_path / "smac3_target_algorithm.py"

    version = smac_version
    full_name = "Sequential Model-based Algorithm Configuration"

    def __init__(self: SMAC3,
                 base_dir: Path,
                 output_path: Path) -> None:
        """Initialises the SMAC3 configurator, Python SMAC V2.2.0.

        Args:
            base_dir: The path where the configurator will be executed in.
            output_path: The path where the output will be placed.
        """
        output_path = output_path / SMAC3.__name__
        output_path.mkdir(parents=True, exist_ok=True)
        # NOTE: __init__ must not return a value; call the parent initialiser
        # directly instead of `return super().__init__(...)`.
        super().__init__(
            output_path=output_path,
            base_dir=base_dir,
            tmp_path=output_path / "tmp",
            multi_objective_support=False)

    @property
    def name(self: SMAC3) -> str:
        """Returns the name of the configurator."""
        return SMAC3.__name__

    @staticmethod
    def scenario_class() -> ConfigurationScenario:
        """Returns the SMAC3 scenario class."""
        return SMAC3Scenario

    def configure(self: SMAC3,
                  scenario: SMAC3Scenario,
                  data_target: PerformanceDataFrame,
                  validate_after: bool = True,
                  sbatch_options: list[str] = None,
                  num_parallel_jobs: int = None,
                  base_dir: Path = None,
                  run_on: Runner = Runner.SLURM) -> list[Run]:
        """Start configuration job.

        Args:
            scenario: ConfigurationScenario object
            data_target: PerformanceDataFrame where to store the found configurations
            validate_after: Whether the Validator will be called after the configuration
            sbatch_options: List of slurm batch options to use.
                Defaults to an empty list (None sentinel avoids a shared
                mutable default argument).
            num_parallel_jobs: The maximum number of jobs to run parallel.
            base_dir: The path where the sbatch scripts will be created for Slurm.
            run_on: On which platform to run the jobs. Default: Slurm.

        Returns:
            A RunRunner Run object.
        """
        if (scenario.smac3_scenario.walltime_limit
                == scenario.smac3_scenario.cputime_limit == np.inf):
            print("WARNING: Starting SMAC3 scenario without any time limit.")
        scenario.create_scenario()
        # We set the seed over the last n run ids in the dataframe
        seeds = data_target.run_ids[data_target.num_runs - scenario.number_of_runs:]
        num_parallel_jobs = num_parallel_jobs or scenario.number_of_runs
        # We do not require the configurator CLI as its already our own python wrapper
        cmds = [f"python3 {self.configurator_executable.absolute()} "
                f"{scenario.scenario_file_path.absolute()} {seed} "
                f"{data_target.csv_filepath}"
                for seed in seeds]
        return super().configure(
            configuration_commands=cmds,
            data_target=data_target,
            output=None,
            scenario=scenario,
            validation_ids=seeds if validate_after else None,
            sbatch_options=sbatch_options if sbatch_options is not None else [],
            num_parallel_jobs=num_parallel_jobs,
            base_dir=base_dir,
            run_on=run_on
        )

    @staticmethod
    def organise_output(output_source: Path,
                        output_target: Path,
                        scenario: SMAC3Scenario,
                        run_id: int) -> None | dict:
        """Method to restructure and clean up after a single configurator call.

        Args:
            output_source: SMAC3 runhistory JSON file produced by the run.
            output_target: PerformanceDataFrame CSV to update. When None or
                non-existent, the best configuration is returned instead.
            scenario: The scenario this run belongs to.
            run_id: The id of the configuration run.

        Returns:
            The best found configuration (a parameter dict) when no valid
            output_target is given, otherwise None.
        """
        import json
        from filelock import FileLock
        if not output_source.exists():
            print(f"SMAC3 ERROR: Output source file does not exist! [{output_source}]")
            return
        # Use a context manager so the file handle is closed after reading
        with output_source.open("r") as results_file:
            results_dict = json.load(results_file)
        configurations = list(results_dict["configs"].values())
        config_evals = [[] for _ in range(len(configurations))]
        objective = scenario.sparkle_objective
        for entry in results_dict["data"]:
            config_id, _, _, _, score, _, _, _, _, _ = entry
            # SMAC3 configuration ids start at 1
            config_evals[config_id - 1].append(score)
        config_evals = [objective.instance_aggregator(evaluations)
                        for evaluations in config_evals]
        best_config = configurations[
            config_evals.index(objective.solver_aggregator(config_evals))]
        if output_target is None or not output_target.exists():
            return best_config

        time_stamp = scenario.scenario_file_path.stat().st_mtime
        best_config["configuration_id"] =\
            f"{SMAC3.__name__}_{time_stamp}_{run_id}"
        instance_names = scenario.instance_set.instance_names
        lock = FileLock(f"{output_target}.lock")
        # Hold the lock while reading and updating the shared dataframe;
        # the context manager releases it on exit (no explicit release needed).
        with lock.acquire(timeout=60):
            performance_data = PerformanceDataFrame(output_target)
            # Resolve absolute path to Solver column
            solver = [s for s in performance_data.solvers
                      if Path(s).name == scenario.solver.name][0]
            # For some reason the instance paths in the instance set are absolute
            instances = [instance for instance in performance_data.instances
                         if Path(instance).name in instance_names]
            # We don't set the seed in the dataframe, as that should be part of the conf
            performance_data.set_value(
                value=[str(best_config)],
                solver=solver,
                instance=instances,
                objective=None,
                run=run_id,
                solver_fields=[PerformanceDataFrame.column_configuration]
            )
            performance_data.save_csv()

    def get_status_from_logs(self: SMAC3) -> None:
        """Method to scan the log files of the configurator for warnings."""
        raise NotImplementedError

    @staticmethod
    def convert_status(status: SolverStatus) -> SmacStatusType:
        """Converts Sparkle Solver status to SMAC3 target status."""
        mapping = {
            SolverStatus.SUCCESS: SmacStatusType.SUCCESS,
            SolverStatus.CRASHED: SmacStatusType.CRASHED,
            SolverStatus.TIMEOUT: SmacStatusType.TIMEOUT,
            SolverStatus.WRONG: SmacStatusType.CRASHED,
            SolverStatus.UNKNOWN: SmacStatusType.CRASHED,
            SolverStatus.ERROR: SmacStatusType.CRASHED,
            SolverStatus.KILLED: SmacStatusType.TIMEOUT,
        }
        return mapping[status]

172 

173 

class SMAC3Scenario(ConfigurationScenario):
    """Class to handle SMAC3 configuration scenarios."""

    def __init__(self: SMAC3Scenario,
                 solver: Solver,
                 instance_set: InstanceSet,
                 sparkle_objectives: list[SparkleObjective],
                 parent_directory: Path,
                 cutoff_time: int = None,
                 number_of_runs: int = None,
                 smac_facade: smacfacades.AbstractFacade | str =
                 smacfacades.AlgorithmConfigurationFacade,
                 crash_cost: float | list[float] = np.inf,
                 termination_cost_threshold: float | list[float] = np.inf,
                 walltime_limit: float = np.inf,
                 cputime_limit: float = np.inf,
                 solver_calls: int = None,
                 use_default_config: bool = False,
                 feature_data: FeatureDataFrame | Path = None,
                 min_budget: float | int | None = None,
                 max_budget: float | int | None = None,
                 seed: int = -1,
                 n_workers: int = 1,
                 max_ratio: float = None,
                 smac3_output_directory: Path = Path(),
                 ) -> None:
        """Initialize scenario paths and names.

        Args:
            solver: Solver
                The solver to use for configuration.
            instance_set: InstanceSet
                The instance set to use for configuration.
            sparkle_objectives: list[SparkleObjective]
                The objectives to optimize.
            parent_directory: Path
                The parent directory where the configuration files will be stored.
            cutoff_time: int
                Maximum CPU runtime in seconds that each solver call (trial)
                is allowed to run. Is managed by RunSolver, not pynisher.
            number_of_runs: int
                The number of times this scenario will be executed with different seeds.
            smac_facade: AbstractFacade, defaults to AlgorithmConfigurationFacade
                The SMAC facade to use for Optimisation.
            crash_cost: float | list[float], defaults to np.inf
                Defines the cost for a failed trial. In case of multi-objective,
                each objective can be associated with a different cost.
            termination_cost_threshold: float | list[float], defaults to np.inf
                Defines a cost threshold when the optimization should stop. In case of
                multi-objective, each objective *must* be associated with a cost.
                The optimization stops when all objectives crossed the threshold.
            walltime_limit: float, defaults to np.inf
                The maximum time in seconds that SMAC is allowed to run. Only counts
                solver time.
            cputime_limit: float, defaults to np.inf
                The maximum CPU time in seconds that SMAC is allowed to run. Only counts
                solver time. WARNING: SMAC3 uses "runtime" (walltime) for CPU time
                when determining cputime budget.
            solver_calls: int, defaults to None
                The maximum number of trials (combination of configuration, seed, budget,
                and instance, depending on the task) to run. If left as None, will be
                calculated as int(cputime or walltime limit / cutoff time) when a
                finite limit and a cutoff time are available; otherwise falls back
                to the SMAC3 default of 100.
            use_default_config: bool, defaults to False
                If True, the configspace's default configuration is evaluated in the
                initial design. For historic benchmark reasons, this is False by default.
                Notice, that this will result in n_configs + 1 for the initial design.
                Respecting n_trials, this will result in one fewer evaluated
                configuration in the optimization.
            feature_data: FeatureDataFrame or Path, defaults to None
                Instances can be associated with features. For example, meta data of
                the dataset (mean, var, ...) can be incorporated which are then further
                used to expand the training data of the surrogate model. If Path, loaded
                from file. When no features are given, uses index as instance features.
            min_budget: float | int | None, defaults to None
                The minimum budget (epochs, subset size, number of instances, ...) that
                is used for the optimization. Use this argument if you use multi-fidelity
                or instance optimization.
            max_budget: float | int | None, defaults to None
                The maximum budget (epochs, subset size, number of instances, ...) that
                is used for the optimization. Use this argument if you use multi-fidelity
                or instance optimization.
            seed: int, defaults to -1
                The seed is used to make results reproducible.
                If seed is -1, SMAC will generate a random seed.
            n_workers: int, defaults to 1
                The number of workers to use for parallelization.
                If `n_workers` is greater than 1, SMAC will use DASK to parallelize the
                optimization.
            max_ratio: float, defaults to None.
                Facade uses at most scenario.n_trials * max_ratio number of
                configurations in the initial design. Additional configurations are not
                affected by this parameter. Not applicable to each facade.
            smac3_output_directory: Path, defaults to Path()
                The output subdirectory for the SMAC3 scenario. Defaults to the scenario
                results directory.
        """
        super().__init__(solver, instance_set, sparkle_objectives, parent_directory)
        # The files are saved in `./output_directory/name/seed`.
        self.log_dir = self.directory / "logs"
        self.number_of_runs = number_of_runs
        self.feature_data = feature_data
        if isinstance(self.feature_data, Path):  # Load from file
            self.feature_data = FeatureDataFrame(self.feature_data)

        # Facade parameters
        self.smac_facade = smac_facade
        if isinstance(self.smac_facade, str):  # Resolve facade by class name
            self.smac_facade = getattr(smacfacades, self.smac_facade)
        self.max_ratio = max_ratio

        if self.feature_data is not None:
            instance_features =\
                {instance: self.feature_data.get_instance(str(instance))
                 for instance in self.instance_set.instance_paths}
        else:
            # 'If no instance features are passed, the runhistory encoder can not
            # distinguish between different instances and therefore returns the same data
            # points with different values, all of which are used to train the surrogate
            # model. Consider using instance indices as features.'
            instance_features = {name: [index] for index, name
                                 in enumerate(instance_set.instance_paths)}

        # NOTE: Patchfix; SMAC3 can handle MO but Sparkle also gives non-user specified
        # objectives but not all class methods can handle it here yet
        self.sparkle_objective = sparkle_objectives[0]

        # NOTE: We don't use trial_walltime_limit as a way of managing resources
        # As it uses pynisher to do it (python based) and our targets are maybe not
        # RunSolver is the better option for accuracy.
        self.cutoff_time = cutoff_time
        if solver_calls is None:  # If solver calls is None, try to calculate it
            # Only finite, non-zero limits can be used for the calculation:
            # int(np.inf / cutoff) raises OverflowError, so infinite limits
            # fall through to the SMAC3 default instead of crashing.
            finite_cputime = cputime_limit and np.isfinite(cputime_limit)
            finite_walltime = walltime_limit and np.isfinite(walltime_limit)
            if self.cutoff_time is not None and finite_cputime:
                solver_calls = int(cputime_limit / self.cutoff_time)
            elif self.cutoff_time is not None and finite_walltime:
                solver_calls = int(walltime_limit / self.cutoff_time)
            else:
                solver_calls = 100  # SMAC3 Default value
        self.smac3_scenario = SmacScenario(
            configspace=solver.get_configspace(),
            name=self.name,
            output_directory=self.results_directory / smac3_output_directory,
            deterministic=solver.deterministic,
            objectives=[self.sparkle_objective.name],
            crash_cost=crash_cost,
            termination_cost_threshold=termination_cost_threshold,
            walltime_limit=walltime_limit,
            cputime_limit=cputime_limit,
            n_trials=solver_calls,
            use_default_config=use_default_config,
            instances=instance_set.instance_paths,
            instance_features=instance_features,
            min_budget=min_budget,
            max_budget=max_budget,
            seed=seed,
            n_workers=n_workers
        )

    def create_scenario(self: ConfigurationScenario) -> None:
        """Create scenario with solver and instances in the parent directory.

        This prepares all the necessary subdirectories related to configuration.
        Any pre-existing scenario directory is removed first.
        """
        shutil.rmtree(self.directory, ignore_errors=True)
        self.directory.mkdir(parents=True)
        # Create empty directories as needed
        self.results_directory.mkdir(parents=True)  # Prepare results directory
        self.log_dir.mkdir(parents=True)
        self.validation.mkdir(parents=True, exist_ok=True)
        self.create_scenario_file()

    def create_scenario_file(self: ConfigurationScenario) -> Path:
        """Create a file with the configuration scenario.

        Returns:
            Path to the written scenario file.
        """
        with self.scenario_file_path.open("w") as file:
            for key, value in self.serialize().items():
                file.write(f"{key} = {value}\n")
        return self.scenario_file_path

    def serialize(self: ConfigurationScenario) -> dict:
        """Serialize the configuration scenario."""
        # `is not None` (not truthiness) for consistency with __init__
        feature_data =\
            self.feature_data.csv_filepath if self.feature_data is not None else None
        return {
            "solver": self.solver.directory,
            "instance_set": self.instance_set.directory,
            "sparkle_objectives": ",".join(self.smac3_scenario.objectives),
            "cutoff_time": self.cutoff_time,
            "number_of_runs": self.number_of_runs,
            "smac_facade": self.smac_facade.__name__,
            "crash_cost": self.smac3_scenario.crash_cost,
            "termination_cost_threshold": self.smac3_scenario.termination_cost_threshold,
            "walltime_limit": self.smac3_scenario.walltime_limit,
            "cputime_limit": self.smac3_scenario.cputime_limit,
            "solver_calls": self.smac3_scenario.n_trials,
            "use_default_config": self.smac3_scenario.use_default_config,
            "feature_data": feature_data,
            "min_budget": self.smac3_scenario.min_budget,
            "max_budget": self.smac3_scenario.max_budget,
            "seed": self.smac3_scenario.seed,
            "n_workers": self.smac3_scenario.n_workers,
        }

    @staticmethod
    def from_file(scenario_file: Path,
                  run_index: int = None) -> ConfigurationScenario:
        """Reads scenario file and initalises ConfigurationScenario.

        Args:
            scenario_file: Path to scenario file.
            run_index: If given, reads as the scenario with run_index for offset
                in output directory and seed.

        Returns:
            ConfigurationScenario.
        """
        import ast
        variables = {keyvalue[0]: keyvalue[1].strip()
                     for keyvalue in (line.split(" = ", maxsplit=1)
                                      for line in scenario_file.open().readlines()
                                      if line.strip() != "")}
        variables["solver"] = Solver(Path(variables["solver"]))
        variables["instance_set"] = Instance_Set(Path(variables["instance_set"]))
        variables["sparkle_objectives"] = [
            resolve_objective(o)
            for o in variables["sparkle_objectives"].split(",")]
        variables["parent_directory"] = scenario_file.parent.parent
        # literal_eval instead of int(): both values may be serialized as None
        variables["cutoff_time"] = ast.literal_eval(variables["cutoff_time"])
        variables["number_of_runs"] = ast.literal_eval(variables["number_of_runs"])
        variables["smac_facade"] = getattr(smacfacades, variables["smac_facade"])

        # We need to support both lists of floats and single float (np.inf is fine)
        if variables["crash_cost"].startswith("["):
            variables["crash_cost"] =\
                [float(v) for v in ast.literal_eval(variables["crash_cost"])]
        else:
            variables["crash_cost"] = float(variables["crash_cost"])
        if variables["termination_cost_threshold"].startswith("["):
            variables["termination_cost_threshold"] =\
                [float(v) for v in ast.literal_eval(
                    variables["termination_cost_threshold"])]
        else:
            variables["termination_cost_threshold"] =\
                float(variables["termination_cost_threshold"])

        variables["walltime_limit"] = float(variables["walltime_limit"])
        variables["cputime_limit"] = float(variables["cputime_limit"])
        variables["solver_calls"] = ast.literal_eval(variables["solver_calls"])
        variables["use_default_config"] =\
            ast.literal_eval(variables["use_default_config"])

        if variables["feature_data"] != "None":
            variables["feature_data"] = Path(variables["feature_data"])
        else:
            variables["feature_data"] = None

        variables["min_budget"] = ast.literal_eval(variables["min_budget"])
        variables["max_budget"] = ast.literal_eval(variables["max_budget"])

        variables["seed"] = ast.literal_eval(variables["seed"])
        variables["n_workers"] = ast.literal_eval(variables["n_workers"])
        if run_index is not None:  # Offset seed and output dir per run
            variables["seed"] += run_index
            variables["smac3_output_directory"] = Path(f"run_{run_index}")

        return SMAC3Scenario(**variables)