Coverage for sparkle/configurator/implementations/smac2.py: 72%

188 statements  

coverage.py v7.6.10, created at 2025-01-07 15:22 +0000

"""Configurator classes to implement SMAC2 in Sparkle."""
from __future__ import annotations
from pathlib import Path
import glob
import shutil
import math

import pandas as pd

from runrunner import Runner, Run

from sparkle.configurator.configurator import Configurator, ConfigurationScenario
from sparkle.solver import Solver
from sparkle.structures import PerformanceDataFrame, FeatureDataFrame
from sparkle.instance import InstanceSet, Instance_Set
from sparkle.types import SparkleObjective, resolve_objective


class SMAC2(Configurator):
    """Class for SMAC2 (Java) configurator."""
    configurator_path = Path(__file__).parent.parent.parent.resolve() /\
        "Components/smac2-v2.10.03-master-778"
    configurator_executable = configurator_path / "smac"
    configurator_target = configurator_path / "smac2_target_algorithm.py"

    version = "2.10.03"
    full_name = "Sequential Model-based Algorithm Configuration"


    def __init__(self: SMAC2,
                 base_dir: Path,
                 output_path: Path) -> None:
        """Initialise the SMAC2 configurator, Java SMAC v2.10.03.

        Args:
            base_dir: The path in which the configurator will be executed.
            output_path: The path where the output will be placed.
        """
        output_path = output_path / SMAC2.__name__
        output_path.mkdir(parents=True, exist_ok=True)
        super().__init__(
            output_path=output_path,
            base_dir=base_dir,
            tmp_path=output_path / "tmp",
            multi_objective_support=False)
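
    # A minimal usage sketch (illustrative, not part of the class): the
    # configurator is constructed with a working directory and an output
    # directory; both paths below are hypothetical.
    #
    #   configurator = SMAC2(base_dir=Path("Tmp"), output_path=Path("Output"))
    #   print(configurator.name)      # "SMAC2"
    #   print(configurator.version)   # "2.10.03"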


    @property
    def name(self: SMAC2) -> str:
        """Returns the name of the configurator."""
        return SMAC2.__name__

    @staticmethod
    def scenario_class() -> ConfigurationScenario:
        """Returns the SMAC2 scenario class."""
        return SMAC2Scenario


    def configure(self: Configurator,
                  scenario: ConfigurationScenario,
                  data_target: PerformanceDataFrame,
                  validate_after: bool = True,
                  sbatch_options: list[str] = [],
                  num_parallel_jobs: int = None,
                  base_dir: Path = None,
                  run_on: Runner = Runner.SLURM) -> list[Run]:
        """Start configuration job.

        Args:
            scenario: ConfigurationScenario object.
            data_target: PerformanceDataFrame in which the resulting
                configurations are stored.
            validate_after: Whether the configurations should be validated on the
                train set afterwards.
            sbatch_options: List of Slurm batch options to use.
            num_parallel_jobs: The maximum number of jobs to run in parallel.
            base_dir: The path where the sbatch scripts will be created for Slurm.
            run_on: On which platform to run the jobs. Default: Slurm.

        Returns:
            A list of RunRunner Run objects.
        """
        if shutil.which("java") is None:
            raise RuntimeError(
                "SMAC2 requires Java 1.8.0_402, but Java is not installed. "
                "Please ensure Java is installed and try again."
            )
        scenario.create_scenario()
        # We set the seed over the last n run ids in the dataframe
        seeds = data_target.run_ids[data_target.num_runs - scenario.number_of_runs:]
        output = [f"{(scenario.results_directory).absolute()}/"
                  f"{scenario.name}_seed_{seed}_smac.txt"
                  for seed in seeds]
        cmds = [f"python3 {Configurator.configurator_cli_path.absolute()} "
                f"{SMAC2.__name__} {output_file} {data_target.csv_filepath} "
                f"{scenario.scenario_file_path} {seed} "
                f"{SMAC2.configurator_executable.absolute()} "
                f"--scenario-file {scenario.scenario_file_path} "
                f"--seed {seed} "
                for output_file, seed in zip(output, seeds)]
        if num_parallel_jobs is not None:
            num_parallel_jobs = max(num_parallel_jobs, scenario.number_of_runs)
        return super().configure(
            configuration_commands=cmds,
            data_target=data_target,
            output=output,
            num_parallel_jobs=num_parallel_jobs,
            scenario=scenario,
            validation_ids=seeds if validate_after else None,
            sbatch_options=sbatch_options,
            base_dir=base_dir,
            run_on=run_on
        )
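
    # For illustration only: each entry of `cmds` wraps one SMAC2 run behind the
    # Sparkle configurator CLI. Using placeholders for the concrete paths, a
    # single command looks roughly like:
    #
    #   python3 <configurator_cli_path> SMAC2 <output_file> <performance_data.csv> \
    #       <scenario_file> <seed> <smac_executable> \
    #       --scenario-file <scenario_file> --seed <seed>
    #
    # i.e. the CLI records the output and performance data locations, then invokes
    # the `smac` executable with the scenario file and seed.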


    @staticmethod
    def organise_output(output_source: Path,
                        output_target: Path,
                        scenario: SMAC2Scenario,
                        run_id: int) -> None | dict:
        """Retrieve the best found configuration from the SMAC2 output file
        and store it in the output target."""
        from filelock import FileLock
        call_key = SMAC2.configurator_target.name
        # The last line describing a call is the best found configuration
        for line in reversed(output_source.open("r").readlines()):
            if call_key in line:
                call_str = line.split(call_key, maxsplit=1)[1].strip()
                # The configuration appears after the first 7 arguments
                configuration = call_str.split(" ", 8)[-1]
                break
        configuration = Solver.config_str_to_dict(configuration)
        if output_target is None or not output_target.exists():
            return configuration
        time_stamp = scenario.scenario_file_path.stat().st_mtime
        configuration["configuration_id"] =\
            f"{SMAC2.__name__}_{time_stamp}_{run_id}"
        instance_names = scenario.instance_set.instance_names
        lock = FileLock(f"{output_target}.lock")
        with lock.acquire(timeout=60):
            performance_data = PerformanceDataFrame(output_target)
            # Resolve absolute path to Solver column
            solver = [s for s in performance_data.solvers
                      if Path(s).name == scenario.solver.name][0]
            # Instance paths are absolute, so match performance data instances by name
            instances = [instance for instance in performance_data.instances
                         if Path(instance).name in instance_names]
            # We do not set the seed in the dataframe,
            # as that should be part of the configuration
            performance_data.set_value(
                value=[str(configuration)],
                solver=solver,
                instance=instances,
                objective=None,
                run=run_id,
                solver_fields=[PerformanceDataFrame.column_configuration]
            )
            performance_data.save_csv()
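
    # Illustrative sketch (hypothetical values): SMAC2's log contains calls to
    # smac2_target_algorithm.py followed by a fixed set of positional arguments
    # (instance, cutoff, seed, ...) and then the parameter configuration, e.g.
    #
    #   ... smac2_target_algorithm.py <args ...> -init_solution '1' -p_swt '0.3'
    #
    # The code above takes the last such line, strips the leading arguments and
    # passes the remaining "-param 'value'" string to Solver.config_str_to_dict,
    # which is assumed to yield something like {"init_solution": "1", "p_swt": "0.3"}.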


    @staticmethod
    def get_smac_run_obj(objective: SparkleObjective) -> str:
        """Return the SMAC2 run objective for the given SparkleObjective.

        Returns:
            A string that represents the run objective set in the settings.
        """
        if objective.time:
            return "RUNTIME"
        return "QUALITY"

    def get_status_from_logs(self: SMAC2) -> None:
        """Scan the log files of the configurator for warnings."""
        base_dir = self.output_path / "scenarios"
        if not base_dir.exists():
            return
        print(f"Checking the log files of configurator {type(self).__name__} for "
              "warnings...")
        scenarios = [f for f in base_dir.iterdir() if f.is_dir()]
        for scenario in scenarios:
            log_dir = scenario / "outdir_train_configuration" \
                / (scenario.name + "_scenario")
            warn_files = glob.glob(str(log_dir) + "/log-warn*")
            non_empty = [log_file for log_file in warn_files
                         if Path(log_file).stat().st_size > 0]
            if len(non_empty) > 0:
                print(f"Scenario {scenario.name} has {len(non_empty)} warning(s), see "
                      "the following log file(s) for more information:")
                for log_file in non_empty:
                    print(f"\t-{log_file}")
            else:
                print(f"Scenario {scenario.name} has no warnings.")



class SMAC2Scenario(ConfigurationScenario):
    """Class to handle SMAC2 configuration scenarios."""
    def __init__(self: SMAC2Scenario,
                 solver: Solver,
                 instance_set: InstanceSet,
                 sparkle_objectives: list[SparkleObjective],
                 parent_directory: Path,
                 number_of_runs: int = None,
                 solver_calls: int = None,
                 max_iterations: int = None,
                 cpu_time: int = None,
                 wallclock_time: int = None,
                 cutoff_time: int = None,
                 target_cutoff_length: str = None,
                 cli_cores: int = None,
                 use_cpu_time_in_tunertime: bool = None,
                 feature_data: FeatureDataFrame | Path = None)\
            -> None:
        """Initialize scenario paths and names.

        Args:
            solver: Solver that should be configured.
            instance_set: Instances object for the scenario.
            sparkle_objectives: SparkleObjectives used for each run of the
                configuration. Will be simplified to the first objective.
            parent_directory: Directory in which the scenario should be created.
            number_of_runs: The number of configurator runs to perform
                for configuring the solver.
            solver_calls: The number of times the solver is called for each
                configuration run.
            max_iterations: The maximum number of iterations allowed for each
                configuration run. [iteration-limit, numIterations, numberOfIterations]
            cpu_time: The CPU time budget allocated for each configuration run.
            wallclock_time: The wallclock time budget allocated for each
                configuration run.
            cutoff_time: The maximum time allowed for each individual run during
                configuration.
            target_cutoff_length: A domain specific measure of when the algorithm
                should consider itself done.
            cli_cores: The number of cores to use to execute runs. Defaults in
                SMAC2 to 1.
            use_cpu_time_in_tunertime: Whether to count SMAC2's own CPU time
                towards the budget. Defaults in SMAC2 to True.
            feature_data: If features are used, this contains the feature data.
                If it is a FeatureDataFrame, will convert values to SMAC2 format.
                If it is a Path, will pass the path to SMAC2.
                Defaults to None.
        """
        super().__init__(solver, instance_set, sparkle_objectives, parent_directory)
        self.solver = solver
        self.instance_set = instance_set
        self.name = f"{self.solver.name}_{self.instance_set.name}"

        if sparkle_objectives is not None:
            self.sparkle_objective = sparkle_objectives[0]
            if len(sparkle_objectives) > 1:
                print("WARNING: SMAC2 does not have multi-objective support. Only the "
                      f"first objective ({self.sparkle_objective}) will be optimised.")
        else:
            self.sparkle_objective = None

        self.number_of_runs = number_of_runs
        self.solver_calls = solver_calls
        self.cpu_time = cpu_time
        self.wallclock_time = wallclock_time
        self.cutoff_time = cutoff_time
        self.cutoff_length = target_cutoff_length
        self.max_iterations = max_iterations
        self.cli_cores = cli_cores
        self.use_cpu_time_in_tunertime = use_cpu_time_in_tunertime

        self.feature_data = feature_data
        self.feature_file_path = None
        if self.feature_data is not None:
            if isinstance(self.feature_data, FeatureDataFrame):
                # Convert feature data to SMAC2 format
                data_dict = {}
                for instance in self.instance_set.instance_paths:
                    data_dict[str(instance)] = feature_data.get_instance(str(instance))

                self.feature_data = pd.DataFrame.from_dict(
                    data_dict, orient="index",
                    columns=[f"Feature{index+1}"
                             for index in range(feature_data.num_features)])

                def map_nan(x: float) -> float:
                    """Map non-numeric values to -512 (pre-defined by SMAC2)."""
                    try:
                        value = float(x)
                    except (TypeError, ValueError):
                        return -512.0
                    return -512.0 if math.isnan(value) else value

                self.feature_data = self.feature_data.map(map_nan)
                self.feature_file_path =\
                    self.directory / f"{self.instance_set.name}_features.csv"
            elif isinstance(self.feature_data, Path):  # Read from Path
                self.feature_file_path = feature_data
                self.feature_data = pd.read_csv(self.feature_file_path,
                                                index_col=0)
            else:
                print(f"WARNING: Feature data is of type {type(feature_data)}. "
                      "Expected FeatureDataFrame or Path.")

        # Scenario Paths
        self.instance_file_path = self.directory / f"{self.instance_set.name}.txt"

        # SMAC2 Specific
        self.outdir_train = self.directory / "outdir_train_configuration"

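    # A minimal sketch of the feature table SMAC2 ends up reading (instance
    # paths and feature values are hypothetical). Missing or non-numeric entries
    # are replaced by SMAC2's sentinel value -512.0 by map_nan above:
    #
    #                             Feature1  Feature2
    #   Instances/SetA/p1.cnf         12.0       0.4
    #   Instances/SetA/p2.cnf       -512.0       1.7
    #
    # _create_feature_file later writes this DataFrame to CSV with the index
    # labelled INSTANCE_NAME.
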

    def create_scenario(self: SMAC2Scenario) -> None:
        """Create scenario with solver and instances in the parent directory.

        This prepares all the necessary subdirectories related to configuration.
        """
        # Prepare scenario directory
        shutil.rmtree(self.directory, ignore_errors=True)
        self.directory.mkdir(parents=True)
        # Create empty directories as needed
        self.outdir_train.mkdir()
        self.tmp.mkdir()
        self.validation.mkdir()
        self.results_directory.mkdir(parents=True)  # Prepare results directory

        self._prepare_instances()

        if self.feature_data is not None:
            self._create_feature_file()

        self.create_scenario_file()


    def create_scenario_file(self: SMAC2Scenario) -> Path:
        """Create a file with the configuration scenario.

        Writes supplementary information to the target algorithm (algo =) as:
        algo = {configurator_target} {solver_directory} {tmp_directory} {sparkle_objective}
        """
        with self.scenario_file_path.open("w") as file:
            file.write(f"algo = {SMAC2.configurator_target.absolute()} "
                       f"{self.solver.directory} {self.tmp} {self.sparkle_objective} \n"
                       f"deterministic = {1 if self.solver.deterministic else 0}\n"
                       f"run_obj = {self._get_performance_measure()}\n"
                       f"cutoffTime = {self.cutoff_time}\n"
                       f"cutoff_length = {self.cutoff_length}\n"
                       f"paramfile = {self.solver.get_pcs_file()}\n"
                       f"outdir = {self.outdir_train}\n"
                       f"instance_file = {self.instance_file_path}\n"
                       f"test_instance_file = {self.instance_file_path}\n")
            if self.max_iterations is not None:
                file.write(f"iteration-limit = {self.max_iterations}\n")
            if self.wallclock_time is not None:
                file.write(f"wallclock-limit = {self.wallclock_time}\n")
            if self.cpu_time is not None:
                file.write(f"cputime-limit = {self.cpu_time}\n")
            if self.solver_calls is not None:
                file.write(f"runcount-limit = {self.solver_calls}\n")
            if self.cli_cores is not None:
                file.write(f"cli-cores = {self.cli_cores}\n")
            if self.feature_data is not None:
                file.write(f"feature_file = {self.feature_file_path}\n")
            if self.use_cpu_time_in_tunertime is not None:
                file.write("use-cpu-time-in-tunertime = "
                           f"{self.use_cpu_time_in_tunertime}\n")
            # We do not let SMAC2 do the validation
            file.write("validation = false" + "\n")
        return self.scenario_file_path
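
    # A hypothetical example of the resulting scenario file (paths shortened,
    # solver and instance set names invented, values illustrative), assuming a
    # runtime objective, a 60 second cutoff and a 600 second wallclock budget:
    #
    #   algo = .../smac2_target_algorithm.py .../Solvers/MySolver .../tmp PAR10 
    #   deterministic = 0
    #   run_obj = RUNTIME
    #   cutoffTime = 60
    #   cutoff_length = None
    #   paramfile = .../Solvers/MySolver/params.pcs
    #   outdir = .../outdir_train_configuration
    #   instance_file = .../MyInstances.txt
    #   test_instance_file = .../MyInstances.txt
    #   wallclock-limit = 600
    #   validation = false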


    def _prepare_instances(self: SMAC2Scenario) -> None:
        """Create instance list file without instance specifics."""
        self.instance_file_path.parent.mkdir(exist_ok=True, parents=True)
        with self.instance_file_path.open("w+") as file:
            for instance_path in self.instance_set._instance_paths:
                file.write(f"{instance_path}\n")

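    # The instance file is a plain list with one instance path per line, e.g.
    # (hypothetical paths):
    #
    #   /abs/path/Instances/MyInstances/problem_01.cnf
    #   /abs/path/Instances/MyInstances/problem_02.cnf
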

    def _create_feature_file(self: SMAC2Scenario) -> None:
        """Create CSV file from feature data."""
        self.feature_data.to_csv(self.feature_file_path,
                                 index_label="INSTANCE_NAME")

    def _get_performance_measure(self: SMAC2Scenario) -> str:
        """Retrieve the performance measure of the SparkleObjective.

        Returns:
            Performance measure of the sparkle objective.
        """
        if self.sparkle_objective.time:
            return "RUNTIME"
        return "QUALITY"


    def serialize_scenario(self: SMAC2Scenario) -> dict:
        """Transform ConfigurationScenario to dictionary format."""
        return {
            "number_of_runs": self.number_of_runs,
            "solver_calls": self.solver_calls,
            "cpu_time": self.cpu_time,
            "wallclock_time": self.wallclock_time,
            "cutoff_time": self.cutoff_time,
            "cutoff_length": self.cutoff_length,
            "max_iterations": self.max_iterations,
            "sparkle_objective": self.sparkle_objective.name,
            "feature_data": self.feature_file_path,
            "use_cpu_time_in_tunertime": self.use_cpu_time_in_tunertime
        }

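    # A sketch of what serialize_scenario might return for a scenario with a
    # 600 second wallclock budget and 25 runs (all values hypothetical):
    #
    #   {"number_of_runs": 25, "solver_calls": None, "cpu_time": None,
    #    "wallclock_time": 600, "cutoff_time": 60, "cutoff_length": None,
    #    "max_iterations": None, "sparkle_objective": "PAR10",
    #    "feature_data": None, "use_cpu_time_in_tunertime": None}
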

    @staticmethod
    def from_file(scenario_file: Path) -> SMAC2Scenario:
        """Reads a scenario file and initialises a SMAC2Scenario from it."""
        config = {keyvalue[0]: keyvalue[1]
                  for keyvalue in (line.strip().split(" = ", maxsplit=1)
                                   for line in scenario_file.open().readlines()
                                   if line.strip() != "")}

        # Collect relevant settings
        cpu_time = int(config["cputime-limit"]) if "cputime-limit" in config else None
        wallclock_limit = int(config["wallclock-limit"]) if "wallclock-limit" in config \
            else None
        solver_calls = int(config["runcount-limit"]) if "runcount-limit" in config \
            else None
        max_iterations = int(config["iteration-limit"]) if "iteration-limit" in config \
            else None
        use_cpu_time_in_tunertime = config["use-cpu-time-in-tunertime"]\
            if "use-cpu-time-in-tunertime" in config else None
        cli_cores = config["cli-cores"] if "cli-cores" in config else None

        _, solver_path, _, objective_str = config["algo"].split(" ")
        objective = resolve_objective(objective_str)
        solver = Solver(Path(solver_path.strip()))
        # Extract the instance set from the instance file
        instance_file_path = Path(config["instance_file"])
        instance_set_path = Path(instance_file_path.open().readline().strip()).parent
        instance_set = Instance_Set(Path(instance_set_path))
        results_folder = scenario_file.parent / "results"
        run_result_files = [p for p in results_folder.iterdir() if p.is_file()]
        number_of_runs = len(run_result_files)
        feature_data_path = None
        if "feature_file" in config:
            feature_data_path = Path(config["feature_file"])
        return SMAC2Scenario(solver,
                             instance_set,
                             [objective],
                             instance_file_path.parent.parent,
                             number_of_runs,
                             solver_calls,
                             max_iterations,
                             cpu_time,
                             wallclock_limit,
                             int(config["cutoffTime"]),
                             config["cutoff_length"],
                             cli_cores,
                             use_cpu_time_in_tunertime,
                             feature_data_path)
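

# A rough end-to-end sketch of how these classes are typically used together
# (directory names and settings are hypothetical, and `performance_data` is
# assumed to be a PerformanceDataFrame that already contains the solver,
# instances and run ids):
#
#   scenario = SMAC2Scenario(solver, instance_set, [objective],
#                            parent_directory=Path("Output/Configuration"),
#                            number_of_runs=5, wallclock_time=600,
#                            cutoff_time=60)
#   configurator = SMAC2(base_dir=Path("Tmp"), output_path=Path("Output"))
#   runs = configurator.configure(scenario, data_target=performance_data)
#
# Each run writes its SMAC2 log under scenario.results_directory, and
# organise_output later copies the best found configuration into the
# PerformanceDataFrame.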