Coverage for sparkle/configurator/implementations/smac3.py: 78%
143 statements
« prev ^ index » next coverage.py v7.6.10, created at 2025-01-07 15:22 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2025-01-07 15:22 +0000
1"""Configurator classes to implement SMAC3 in Sparkle."""
2from __future__ import annotations
3from pathlib import Path
4import shutil
6from smac import version as smac_version
7from smac import Scenario as SmacScenario
8from smac import facade as smacfacades
9from smac.runhistory.enumerations import StatusType as SmacStatusType
10import numpy as np
12from runrunner import Runner, Run
14from sparkle.configurator.configurator import Configurator, ConfigurationScenario
15from sparkle.solver import Solver
16from sparkle.structures import FeatureDataFrame, PerformanceDataFrame
17from sparkle.instance import InstanceSet, Instance_Set
18from sparkle.types import SparkleObjective, resolve_objective, SolverStatus
class SMAC3(Configurator):
    """Class for SMAC3 (Python) configurator."""
    configurator_path = Path(__file__).parent.parent.parent.resolve() /\
        "Components/smac3-v2.2.0"
    configurator_executable = configurator_path / "smac3_target_algorithm.py"

    version = smac_version
    full_name = "Sequential Model-based Algorithm Configuration"

    def __init__(self: SMAC3,
                 base_dir: Path,
                 output_path: Path) -> None:
        """Initialise the SMAC3 configurator, Python SMAC V2.2.0.

        Multi-objective support is disabled for this configurator.

        Args:
            base_dir: The path where the configurator will be executed in.
            output_path: The path where the output will be placed. A subdirectory
                named after this class is created inside it.
        """
        output_path = output_path / SMAC3.__name__
        output_path.mkdir(parents=True, exist_ok=True)
        super().__init__(
            output_path=output_path,
            base_dir=base_dir,
            tmp_path=output_path / "tmp",
            multi_objective_support=False)

    @property
    def name(self: SMAC3) -> str:
        """Returns the name of the configurator."""
        return SMAC3.__name__

    @staticmethod
    def scenario_class() -> ConfigurationScenario:
        """Returns the SMAC3 scenario class."""
        return SMAC3Scenario

    def configure(self: SMAC3,
                  scenario: SMAC3Scenario,
                  data_target: PerformanceDataFrame,
                  validate_after: bool = True,
                  sbatch_options: list[str] = None,
                  num_parallel_jobs: int = None,
                  base_dir: Path = None,
                  run_on: Runner = Runner.SLURM) -> list[Run]:
        """Start configuration job.

        Args:
            scenario: ConfigurationScenario object
            data_target: PerformanceDataFrame where to store the found configurations
            validate_after: Whether the Validator will be called after the configuration
            sbatch_options: List of slurm batch options to use. None (the default)
                is treated as an empty list.
            num_parallel_jobs: The maximum number of jobs to run parallel.
                Defaults to the number of runs of the scenario.
            base_dir: The path where the sbatch scripts will be created for Slurm.
            run_on: On which platform to run the jobs. Default: Slurm.

        Returns:
            The result of the parent class' configure call (annotated as a list of
            RunRunner Run objects).
        """
        # A fresh list per call avoids sharing one mutable default between calls
        if sbatch_options is None:
            sbatch_options = []
        if (scenario.smac3_scenario.walltime_limit
                == scenario.smac3_scenario.cputime_limit == np.inf):
            print("WARNING: Starting SMAC3 scenario without any time limit.")
        scenario.create_scenario()
        # We set the seed over the last n run ids in the dataframe.
        # The start index is clamped at zero: a negative start would wrap around
        # and silently select the wrong run ids when fewer runs are available.
        first_seed_index = max(0, data_target.num_runs - scenario.number_of_runs)
        seeds = data_target.run_ids[first_seed_index:]
        num_parallel_jobs = num_parallel_jobs or scenario.number_of_runs
        # We do not require the configurator CLI as its already our own python wrapper
        cmds = [f"python3 {self.configurator_executable.absolute()} "
                f"{scenario.scenario_file_path.absolute()} {seed} "
                f"{data_target.csv_filepath}"
                for seed in seeds]
        return super().configure(
            configuration_commands=cmds,
            data_target=data_target,
            output=None,
            scenario=scenario,
            validation_ids=seeds if validate_after else None,
            sbatch_options=sbatch_options,
            num_parallel_jobs=num_parallel_jobs,
            base_dir=base_dir,
            run_on=run_on
        )

    @staticmethod
    def organise_output(output_source: Path,
                        output_target: Path,
                        scenario: SMAC3Scenario,
                        run_id: int) -> None | dict:
        """Method to restructure and clean up after a single configurator call.

        Reads the JSON file at output_source, aggregates the scores per
        configuration and selects the configuration picked by the solver
        aggregator of the scenario objective.

        Args:
            output_source: JSON file with a "configs" mapping and a "data" list.
            output_target: CSV file of the PerformanceDataFrame to write the best
                configuration to. If None or non-existent, nothing is written.
            scenario: The scenario the configurator call belonged to.
            run_id: The run (seed) the configuration is stored under.

        Returns:
            The best configuration if output_target is None or does not exist.
            None if the source file is missing or the result was written to file.
        """
        import json
        from filelock import FileLock
        if not output_source.exists():
            print(f"SMAC3 ERROR: Output source file does not exist! [{output_source}]")
            return
        # Context manager ensures the file handle is closed after parsing
        with output_source.open("r") as source_file:
            results_dict = json.load(source_file)
        configurations = list(results_dict["configs"].values())
        config_evals = [[] for _ in configurations]
        objective = scenario.sparkle_objective
        for entry in results_dict["data"]:
            config_id, _, _, _, score, _, _, _, _, _ = entry
            # SMAC3 configuration ids start at 1
            config_evals[config_id - 1].append(score)
        config_evals = [objective.instance_aggregator(evaluations)
                        for evaluations in config_evals]
        best_config = configurations[
            config_evals.index(objective.solver_aggregator(config_evals))]
        if output_target is None or not output_target.exists():
            return best_config

        time_stamp = scenario.scenario_file_path.stat().st_mtime
        best_config["configuration_id"] =\
            f"{SMAC3.__name__}_{time_stamp}_{run_id}"
        instance_names = scenario.instance_set.instance_names
        lock = FileLock(f"{output_target}.lock")
        # The context manager releases the lock on exit (also on exceptions),
        # so no explicit release call is needed.
        with lock.acquire(timeout=60):
            performance_data = PerformanceDataFrame(output_target)
            # Resolve absolute path to Solver column
            solver = [s for s in performance_data.solvers
                      if Path(s).name == scenario.solver.name][0]
            # For some reason the instance paths in the instance set are absolute
            instances = [instance for instance in performance_data.instances
                         if Path(instance).name in instance_names]
            # We don't set the seed in the dataframe, as that should be part of the conf
            performance_data.set_value(
                value=[str(best_config)],
                solver=solver,
                instance=instances,
                objective=None,
                run=run_id,
                solver_fields=[PerformanceDataFrame.column_configuration]
            )
            performance_data.save_csv()

    def get_status_from_logs(self: SMAC3) -> None:
        """Method to scan the log files of the configurator for warnings."""
        raise NotImplementedError

    @staticmethod
    def convert_status(status: SolverStatus) -> SmacStatusType:
        """Converts Sparkle Solver status to SMAC3 target status.

        Raises:
            KeyError: If the status has no entry in the mapping.
        """
        mapping = {
            SolverStatus.SUCCESS: SmacStatusType.SUCCESS,
            SolverStatus.CRASHED: SmacStatusType.CRASHED,
            SolverStatus.TIMEOUT: SmacStatusType.TIMEOUT,
            SolverStatus.WRONG: SmacStatusType.CRASHED,
            SolverStatus.UNKNOWN: SmacStatusType.CRASHED,
            SolverStatus.ERROR: SmacStatusType.CRASHED,
            SolverStatus.KILLED: SmacStatusType.TIMEOUT,
        }
        return mapping[status]
class SMAC3Scenario(ConfigurationScenario):
    """Class to handle SMAC3 configuration scenarios."""

    def __init__(self: SMAC3Scenario,
                 solver: Solver,
                 instance_set: InstanceSet,
                 sparkle_objectives: list[SparkleObjective],
                 parent_directory: Path,
                 cutoff_time: int = None,
                 number_of_runs: int = None,
                 smac_facade: smacfacades.AbstractFacade | str =
                 smacfacades.AlgorithmConfigurationFacade,
                 crash_cost: float | list[float] = np.inf,
                 termination_cost_threshold: float | list[float] = np.inf,
                 walltime_limit: float = np.inf,
                 cputime_limit: float = np.inf,
                 solver_calls: int = None,
                 use_default_config: bool = False,
                 feature_data: FeatureDataFrame | Path = None,
                 min_budget: float | int | None = None,
                 max_budget: float | int | None = None,
                 seed: int = -1,
                 n_workers: int = 1,
                 max_ratio: float = None,
                 smac3_output_directory: Path = Path(),
                 ) -> None:
        """Initialize scenario paths and names.

        Args:
            solver: Solver
                The solver to use for configuration.
            instance_set: InstanceSet
                The instance set to use for configuration.
            sparkle_objectives: list[SparkleObjective]
                The objectives to optimize. Only the first objective is used.
            parent_directory: Path
                The parent directory where the configuration files will be stored.
            cutoff_time: int
                Maximum CPU runtime in seconds that each solver call (trial)
                is allowed to run. Is managed by RunSolver, not pynisher.
            number_of_runs: int
                The number of times this scenario will be executed with different seeds.
            smac_facade: AbstractFacade, defaults to AlgorithmConfigurationFacade
                The SMAC facade to use for Optimisation. If given as a string, it is
                looked up by name in the smac facade module.
            crash_cost: float | list[float], defaults to np.inf
                Defines the cost for a failed trial. In case of multi-objective,
                each objective can be associated with a different cost.
            termination_cost_threshold: float | list[float], defaults to np.inf
                Defines a cost threshold when the optimization should stop. In case of
                multi-objective, each objective *must* be associated with a cost.
                The optimization stops when all objectives crossed the threshold.
            walltime_limit: float, defaults to np.inf
                The maximum time in seconds that SMAC is allowed to run. Only counts
                solver time.
            cputime_limit: float, defaults to np.inf
                The maximum CPU time in seconds that SMAC is allowed to run. Only counts
                solver time. WARNING: SMAC3 uses "runtime" (walltime) for CPU time
                when determining cputime budget.
            solver_calls: int, defaults to None
                The maximum number of trials (combination of configuration, seed, budget,
                and instance, depending on the task) to run. If left as None, will be
                calculated as int(limit / cutoff_time), using the first finite,
                non-zero limit of cputime_limit and walltime_limit. If cutoff_time
                is not set or there is no such limit, the SMAC3 default (100) is used.
            use_default_config: bool, defaults to False
                If True, the configspace's default configuration is evaluated in the
                initial design. For historic benchmark reasons, this is False by default.
                Notice, that this will result in n_configs + 1 for the initial design.
                Respecting n_trials, this will result in one fewer evaluated
                configuration in the optimization.
            feature_data: FeatureDataFrame or Path, defaults to None
                Instances can be associated with features. For example, meta data of
                the dataset (mean, var, ...) can be incorporated which are then further
                used to expand the training data of the surrogate model. If Path, loaded
                from file. When no features are given, uses index as instance features.
            min_budget: float | int | None, defaults to None
                The minimum budget (epochs, subset size, number of instances, ...) that
                is used for the optimization. Use this argument if you use multi-fidelity
                or instance optimization.
            max_budget: float | int | None, defaults to None
                The maximum budget (epochs, subset size, number of instances, ...) that
                is used for the optimization. Use this argument if you use multi-fidelity
                or instance optimization.
            seed: int, defaults to -1
                The seed is used to make results reproducible.
                If seed is -1, SMAC will generate a random seed.
            n_workers: int, defaults to 1
                The number of workers to use for parallelization.
                If `n_workers` is greater than 1, SMAC will use DASK to parallelize the
                optimization.
            max_ratio: float, defaults to None.
                Facade uses at most scenario.n_trials * max_ratio number of
                configurations in the initial design. Additional configurations are not
                affected by this parameter. Not applicable to each facade.
            smac3_output_directory: Path, defaults to Path()
                The output subdirectory for the SMAC3 scenario. Defaults to the scenario
                results directory.
        """
        super().__init__(solver, instance_set, sparkle_objectives, parent_directory)
        # The files are saved in `./output_directory/name/seed`.
        self.log_dir = self.directory / "logs"
        self.number_of_runs = number_of_runs
        self.feature_data = feature_data
        if isinstance(self.feature_data, Path):  # Load from file
            self.feature_data = FeatureDataFrame(self.feature_data)

        # Facade parameters
        self.smac_facade = smac_facade
        if isinstance(self.smac_facade, str):
            self.smac_facade = getattr(smacfacades, self.smac_facade)
        self.max_ratio = max_ratio

        if self.feature_data is not None:
            instance_features =\
                {instance: self.feature_data.get_instance(str(instance))
                 for instance in self.instance_set.instance_paths}
        else:
            # 'If no instance features are passed, the runhistory encoder can not
            # distinguish between different instances and therefore returns the same data
            # points with different values, all of which are used to train the surrogate
            # model. Consider using instance indices as features.'
            instance_features = {name: [index] for index, name
                                 in enumerate(instance_set.instance_paths)}

        # NOTE: Patchfix; SMAC3 can handle MO but Sparkle also gives non-user specified
        # objectives but not all class methods can handle it here yet
        self.sparkle_objective = sparkle_objectives[0]

        # NOTE: We don't use trial_walltime_limit as a way of managing resources,
        # as it uses pynisher to do it (python based) and our targets may not be.
        # RunSolver is the better option for accuracy.
        self.cutoff_time = cutoff_time
        if solver_calls is None:  # If solver calls is None, try to calculate it
            solver_calls = 100  # SMAC3 Default value
            if self.cutoff_time:  # Skips None and 0 (would divide by zero)
                # Only a finite limit can be converted to a number of calls:
                # int(np.inf / cutoff_time) raises an OverflowError, and np.inf is
                # the default for both limits. The cputime limit takes precedence.
                for time_limit in (cputime_limit, walltime_limit):
                    if time_limit and np.isfinite(time_limit):
                        solver_calls = int(time_limit / self.cutoff_time)
                        break
        self.smac3_scenario = SmacScenario(
            configspace=solver.get_configspace(),
            name=self.name,
            output_directory=self.results_directory / smac3_output_directory,
            deterministic=solver.deterministic,
            objectives=[self.sparkle_objective.name],
            crash_cost=crash_cost,
            termination_cost_threshold=termination_cost_threshold,
            walltime_limit=walltime_limit,
            cputime_limit=cputime_limit,
            n_trials=solver_calls,
            use_default_config=use_default_config,
            instances=instance_set.instance_paths,
            instance_features=instance_features,
            min_budget=min_budget,
            max_budget=max_budget,
            seed=seed,
            n_workers=n_workers
        )

    def create_scenario(self: SMAC3Scenario) -> None:
        """Create the scenario directories and the scenario file.

        Any existing scenario directory is removed first. Afterwards the scenario,
        results, log and validation directories are created and the scenario file
        is written.
        """
        shutil.rmtree(self.directory, ignore_errors=True)
        self.directory.mkdir(parents=True)
        # Create empty directories as needed
        self.results_directory.mkdir(parents=True)  # Prepare results directory
        self.log_dir.mkdir(parents=True)
        self.validation.mkdir(parents=True, exist_ok=True)
        self.create_scenario_file()

    def create_scenario_file(self: SMAC3Scenario) -> Path:
        """Create a file with the configuration scenario.

        Every serialized entry is written as one `key = value` line.

        Returns:
            The path of the written scenario file.
        """
        with self.scenario_file_path.open("w") as file:
            for key, value in self.serialize().items():
                file.write(f"{key} = {value}\n")
        return self.scenario_file_path

    def serialize(self: SMAC3Scenario) -> dict:
        """Serialize the configuration scenario.

        Returns:
            Dictionary mapping constructor argument names to their values.
        """
        # Explicit None check, consistent with the check in the constructor
        feature_data =\
            self.feature_data.csv_filepath if self.feature_data is not None else None
        return {
            "solver": self.solver.directory,
            "instance_set": self.instance_set.directory,
            "sparkle_objectives": ",".join(self.smac3_scenario.objectives),
            "cutoff_time": self.cutoff_time,
            "number_of_runs": self.number_of_runs,
            "smac_facade": self.smac_facade.__name__,
            "crash_cost": self.smac3_scenario.crash_cost,
            "termination_cost_threshold": self.smac3_scenario.termination_cost_threshold,
            "walltime_limit": self.smac3_scenario.walltime_limit,
            "cputime_limit": self.smac3_scenario.cputime_limit,
            "solver_calls": self.smac3_scenario.n_trials,
            "use_default_config": self.smac3_scenario.use_default_config,
            "feature_data": feature_data,
            "min_budget": self.smac3_scenario.min_budget,
            "max_budget": self.smac3_scenario.max_budget,
            "seed": self.smac3_scenario.seed,
            "n_workers": self.smac3_scenario.n_workers,
        }

    @staticmethod
    def from_file(scenario_file: Path,
                  run_index: int = None) -> ConfigurationScenario:
        """Reads scenario file and initalises ConfigurationScenario.

        The file is expected to contain one `key = value` pair per line.

        Args:
            scenario_file: Path to scenario file.
            run_index: If given, reads as the scenario with run_index for offset
                in output directory and seed.

        Returns:
            ConfigurationScenario.
        """
        import ast
        # Context manager ensures the file handle is closed after reading
        with scenario_file.open() as file:
            variables = {keyvalue[0]: keyvalue[1].strip()
                         for keyvalue in (line.split(" = ", maxsplit=1)
                                          for line in file
                                          if line.strip() != "")}
        variables["solver"] = Solver(Path(variables["solver"]))
        variables["instance_set"] = Instance_Set(Path(variables["instance_set"]))
        variables["sparkle_objectives"] = [
            resolve_objective(o)
            for o in variables["sparkle_objectives"].split(",")]
        variables["parent_directory"] = scenario_file.parent.parent
        # Both are optional in the constructor and are then written as "None"
        for key in ("cutoff_time", "number_of_runs"):
            variables[key] =\
                None if variables[key] == "None" else int(variables[key])
        variables["smac_facade"] = getattr(smacfacades, variables["smac_facade"])

        # We need to support both lists of floats and single float. The list is
        # split by hand as ast.literal_eval can not parse "inf" inside a list,
        # whereas float() handles it.
        for key in ("crash_cost", "termination_cost_threshold"):
            raw_value = variables[key]
            if raw_value.startswith("["):
                variables[key] = [float(v) for v in raw_value.strip("[]").split(",")
                                  if v.strip() != ""]
            else:
                variables[key] = float(raw_value)

        variables["walltime_limit"] = float(variables["walltime_limit"])
        variables["cputime_limit"] = float(variables["cputime_limit"])
        variables["solver_calls"] = ast.literal_eval(variables["solver_calls"])
        variables["use_default_config"] =\
            ast.literal_eval(variables["use_default_config"])

        if variables["feature_data"] != "None":
            variables["feature_data"] = Path(variables["feature_data"])
        else:
            variables["feature_data"] = None

        variables["min_budget"] = ast.literal_eval(variables["min_budget"])
        variables["max_budget"] = ast.literal_eval(variables["max_budget"])

        variables["seed"] = ast.literal_eval(variables["seed"])
        variables["n_workers"] = ast.literal_eval(variables["n_workers"])
        if run_index is not None:  # Offset
            variables["seed"] += run_index
            variables["smac3_output_directory"] = Path(f"run_{run_index}")

        return SMAC3Scenario(**variables)