Coverage for sparkle/configurator/implementations/smac2.py: 73%

206 statements  

coverage.py v7.10.7, created at 2025-09-29 10:17 +0000

1"""Configurator classes to implement SMAC2 in Sparkle.""" 

2 

3from __future__ import annotations 

4from pathlib import Path 

5import glob 

6import shutil 

7import math 

8import random 

9 

10import pandas as pd 

11 

12from runrunner import Runner, Run 

13 

14from sparkle.tools.parameters import PCSConvention 

15from sparkle.configurator.configurator import Configurator, ConfigurationScenario 

16from sparkle.solver import Solver 

17from sparkle.structures import PerformanceDataFrame, FeatureDataFrame 

18from sparkle.instance import InstanceSet, Instance_Set 

19from sparkle.types import SparkleObjective, resolve_objective 

20 

21 

class SMAC2(Configurator):
    """Class for SMAC2 (Java) configurator."""

    configurator_path = Path(__file__).parent.resolve() / "SMAC2"
    configurator_executable = configurator_path / "smac"
    configurator_target = configurator_path / "smac2_target_algorithm.py"

    full_name = "Sequential Model-based Algorithm Configuration"
    version = "2.10.03"

    def __init__(self: SMAC2) -> None:
        """Initialise the SMAC2 configurator, Java SMAC v2.10.03."""
        super().__init__(multi_objective_support=False)

    @property
    def name(self: SMAC2) -> str:
        """Returns the name of the configurator."""
        return SMAC2.__name__

    @staticmethod
    def scenario_class() -> ConfigurationScenario:
        """Returns the SMAC2 scenario class."""
        return SMAC2Scenario

    @staticmethod
    def check_requirements(verbose: bool = False) -> bool:
        """Check that SMAC2 is installed."""
        import warnings
        if no_java := shutil.which("java") is None:
            if verbose:
                warnings.warn(
                    "SMAC2 requires Java 1.8.0_402, but Java is not installed. "
                    "Please ensure Java is installed."
                )
        if no_smac := not SMAC2.configurator_executable.exists():
            if verbose:
                warnings.warn(
                    "SMAC2 executable not found. Please ensure SMAC2 is installed "
                    f"in the expected Path ({SMAC2.configurator_path})."
                )
        return not (no_java or no_smac)

    @staticmethod
    def download_requirements(
        smac2_zip_url: str = "https://github.com/ADA-research/Sparkle/raw/refs/heads/"
                             "development/Resources/Configurators/SMAC2-v2.10.03.zip",
    ) -> None:
        """Download SMAC2."""
        if SMAC2.configurator_executable.exists():
            return  # Already installed
        from urllib.request import urlopen
        import zipfile
        import io
        r = urlopen(smac2_zip_url, timeout=60)
        z = zipfile.ZipFile(io.BytesIO(r.read()))
        z.extractall(SMAC2.configurator_path)
        # Ensure execution rights
        SMAC2.configurator_executable.chmod(0o755)
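
    # Usage sketch (illustrative, kept as a comment so the module has no import-time
    # side effects): a caller would typically pair the two static methods above to
    # make sure SMAC2 is available before configuring.
    #
    #   if not SMAC2.check_requirements(verbose=True):
    #       SMAC2.download_requirements()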

    def configure(
        self: SMAC2,
        scenario: SMAC2Scenario,
        data_target: PerformanceDataFrame,
        validate_after: bool = True,
        sbatch_options: list[str] = [],
        slurm_prepend: str | list[str] | Path = None,
        num_parallel_jobs: int = None,
        base_dir: Path = None,
        run_on: Runner = Runner.SLURM,
    ) -> list[Run]:
        """Start configuration job.

        Args:
            scenario: ConfigurationScenario object.
            data_target: PerformanceDataFrame where to store the found configurations.
            validate_after: Whether the configurations should be validated on the
                train set afterwards.
            sbatch_options: List of Slurm batch options to use.
            slurm_prepend: Slurm script to prepend to the sbatch.
            num_parallel_jobs: The maximum number of jobs to run in parallel.
            base_dir: The path where the sbatch scripts will be created for Slurm.
            run_on: On which platform to run the jobs. Default: Slurm.

        Returns:
            A list of RunRunner Run objects.
        """
        scenario.create_scenario()
        configuration_ids = scenario.configuration_ids

        # The maximum seed size for SMAC2 is 999 999 999
        seeds = [random.randint(0, 10**9 - 1) for _ in range(scenario.number_of_runs)]
        output = [
            f"{(scenario.results_directory).absolute()}/"
            f"{scenario.name}_{config_id}_smac.txt"
            for config_id in configuration_ids
        ]
        cmds = [
            f"python3 {Configurator.configurator_cli_path.absolute()} "
            f"{SMAC2.__name__} {output_file} {data_target.csv_filepath} "
            f"{scenario.scenario_file_path} {configuration_id} "
            f"{SMAC2.configurator_executable.absolute()} "
            f"--scenario-file {scenario.scenario_file_path} "
            f"--seed {seed} "
            for output_file, configuration_id, seed in zip(
                output, configuration_ids, seeds
            )
        ]
        if num_parallel_jobs is not None:
            num_parallel_jobs = max(num_parallel_jobs, len(cmds))
        return super().configure(
            configuration_commands=cmds,
            data_target=data_target,
            output=output,
            num_parallel_jobs=num_parallel_jobs,
            scenario=scenario,
            configuration_ids=configuration_ids,
            validate_after=validate_after,
            sbatch_options=sbatch_options,
            slurm_prepend=slurm_prepend,
            base_dir=base_dir,
            run_on=run_on,
        )
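
    # Usage sketch (illustrative only): starting a configuration job. `scenario` and
    # `performance_data` are placeholders for objects the caller constructs beforehand
    # (an SMAC2Scenario and a PerformanceDataFrame); running locally via Runner.LOCAL
    # instead of Slurm is an assumption for the example.
    #
    #   configurator = SMAC2()
    #   runs = configurator.configure(
    #       scenario=scenario,
    #       data_target=performance_data,
    #       validate_after=False,
    #       run_on=Runner.LOCAL,
    #   )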

    @staticmethod
    def organise_output(
        output_source: Path,
        output_target: Path,
        scenario: SMAC2Scenario,
        configuration_id: str,
    ) -> None | dict:
        """Retrieves the configuration from the SMAC output file and places it in the output target."""
        call_key = SMAC2.configurator_target.name
        # Last line describing a call is the best found configuration
        for line in reversed(output_source.open("r").readlines()):
            if call_key in line:
                call_str = line.split(call_key, maxsplit=1)[1].strip()
                # The Configuration appears after the first 7 arguments
                configuration = call_str.split(" ", 8)[-1]
                break
        configuration = Solver.config_str_to_dict(configuration)
        configuration["configuration_id"] = configuration_id
        return Configurator.save_configuration(
            scenario, configuration_id, configuration, output_target
        )
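
    # Note on the parsing above (an assumption based on SMAC2's target-algorithm call
    # convention, not verified against the actual log format): a matching line is
    # expected to end with the solver parameters as "-name 'value'" pairs, preceded by
    # positional arguments (solver directory, tmp directory, objective, instance,
    # instance specifics, cutoff time, run length, seed). Splitting those leading
    # tokens off leaves the configuration string that Solver.config_str_to_dict parses.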

    @staticmethod
    def get_smac_run_obj(objective: SparkleObjective) -> str:
        """Return the SMAC run objective based on the Performance Measure.

        Returns:
            A string that represents the run objective set in the settings.
        """
        if objective.time:
            return "RUNTIME"
        return "QUALITY"

    def get_status_from_logs(self: SMAC2, base_dir: Path) -> None:
        """Method to scan the log files of the configurator for warnings."""
        if not base_dir.exists():
            return
        print(
            f"Checking the log files of configurator {type(self).__name__} for "
            "warnings..."
        )
        scenarios = [f for f in base_dir.iterdir() if f.is_dir()]
        for scenario in scenarios:
            log_dir = (
                scenario / "outdir_train_configuration" / (scenario.name + "_scenario")
            )
            warn_files = glob.glob(str(log_dir) + "/log-warn*")
            non_empty = [
                log_file for log_file in warn_files if Path(log_file).stat().st_size > 0
            ]
            if len(non_empty) > 0:
                print(
                    f"Scenario {scenario.name} has {len(non_empty)} warning(s), see "
                    "the following log file(s) for more information:"
                )
                for log_file in non_empty:
                    print(f"\t-{log_file}")
            else:
                print(f"Scenario {scenario.name} has no warnings.")


class SMAC2Scenario(ConfigurationScenario):
    """Class to handle SMAC2 configuration scenarios."""

    def __init__(
        self: SMAC2Scenario,
        solver: Solver,
        instance_set: InstanceSet,
        sparkle_objectives: list[SparkleObjective],
        number_of_runs: int,
        parent_directory: Path,
        solver_calls: int = None,
        max_iterations: int = None,
        cpu_time: int = None,
        wallclock_time: int = None,
        solver_cutoff_time: int = None,
        target_cutoff_length: str = None,
        cli_cores: int = None,
        use_cpu_time_in_tunertime: bool = None,
        feature_data: FeatureDataFrame | Path = None,
        timestamp: str = None,
    ) -> None:
        """Initialise scenario paths and names.

        Args:
            solver: Solver that should be configured.
            instance_set: Instances object for the scenario.
            sparkle_objectives: SparkleObjectives used for each run of the
                configuration. Will be simplified to the first objective.
            number_of_runs: The number of configurator runs to perform
                for configuring the solver.
            parent_directory: Directory in which the scenario should be created.
            solver_calls: The number of times the solver is called for each
                configuration run.
            max_iterations: The maximum number of iterations allowed for each
                configuration run. [iteration-limit, numIterations, numberOfIterations]
            cpu_time: The CPU time budget allocated for each configuration run.
            wallclock_time: The wallclock time budget allocated for each
                configuration run.
            solver_cutoff_time: The maximum time allowed for each solver call during
                configuration.
            target_cutoff_length: A domain-specific measure of when the algorithm
                should consider itself done.
            cli_cores: The number of cores to use to execute runs.
                Defaults in SMAC2 to 1.
            use_cpu_time_in_tunertime: Whether SMAC2's own used CPU time is deducted
                from the budget. Defaults in SMAC2 to True.
            feature_data: If features are used, this contains the feature data.
                If it is a FeatureDataFrame, the values are converted to SMAC2 format.
                If it is a Path, the path is passed to SMAC2 as-is.
                Defaults to None.
            timestamp: An optional timestamp for the directory name.
        """
        super().__init__(
            solver,
            instance_set,
            sparkle_objectives,
            number_of_runs,
            parent_directory,
            timestamp,
        )
        self.solver = solver
        self.instance_set = instance_set

        self.sparkle_objective = sparkle_objectives[0]
        self.solver_calls = solver_calls
        self.cpu_time = cpu_time
        self.wallclock_time = wallclock_time
        self.solver_cutoff_time = solver_cutoff_time
        self.cutoff_length = target_cutoff_length
        self.max_iterations = max_iterations
        self.cli_cores = cli_cores
        self.use_cpu_time_in_tunertime = use_cpu_time_in_tunertime

        self.feature_data = feature_data
        self._feature_file_path = None
        if self.feature_data:
            if isinstance(self.feature_data, FeatureDataFrame):
                # Convert feature data to SMAC2 format
                data_dict = {}
                for instance in self.instance_set.instance_paths:
                    data_dict[str(instance)] = feature_data.get_instance(str(instance))

                self.feature_data = pd.DataFrame.from_dict(
                    data_dict,
                    orient="index",
                    columns=[
                        f"Feature{index + 1}"
                        for index in range(feature_data.num_features)
                    ],
                )

                def map_nan(x: object) -> float:
                    """Map non-numeric or NaN values to -512 (pre-defined by SMAC2)."""
                    try:
                        value = float(x)
                    except (TypeError, ValueError):
                        return -512.0
                    return -512.0 if math.isnan(value) else value

                self.feature_data = self.feature_data.map(map_nan)
            elif isinstance(self.feature_data, Path):  # Read from Path
                self._feature_file_path = feature_data
                self.feature_data = pd.read_csv(self.feature_file_path, index_col=0)
            else:
                print(
                    f"WARNING: Feature data is of type {type(feature_data)}. "
                    "Expected FeatureDataFrame or Path."
                )

    @property
    def instance_file_path(self: SMAC2Scenario) -> Path:
        """Return the path of the instance file."""
        if self.directory:
            return self.directory / f"{self.instance_set.name}.txt"
        return None

    @property
    def outdir_train(self: SMAC2Scenario) -> Path:
        """Return the path of the train out directory."""
        # SMAC2 Specific directory
        if self.directory:
            return self.directory / "outdir_train_configuration"
        return None

    @property
    def feature_file_path(self: SMAC2Scenario) -> Path:
        """Return the path of the feature file."""
        if self._feature_file_path:
            return self._feature_file_path
        elif self.directory:
            return self.directory / f"{self.instance_set.name}_features.csv"
        else:
            return None

    @property
    def configurator(self: SMAC2Scenario) -> SMAC2:
        """Return the type of configurator the scenario belongs to."""
        return SMAC2

    def create_scenario(self: SMAC2Scenario) -> None:
        """Create scenario with solver and instances in the parent directory.

        This prepares all the necessary subdirectories related to configuration.
        """
        super().create_scenario()
        self.outdir_train.mkdir()
        self._prepare_instances()

        if self.feature_data is not None:
            self._create_feature_file()

        self.create_scenario_file()

    def create_scenario_file(
        self: SMAC2Scenario,
        configurator_target: Path = SMAC2.configurator_target,
        pcs_port: PCSConvention = PCSConvention.SMAC,
    ) -> Path:
        """Create a file with the configuration scenario.

        Writes supplementary information for the target algorithm (algo =) as:
        algo = {configurator_target} {solver_directory} {tmp} {sparkle_objective}
        """
        with self.scenario_file_path.open("w") as file:
            file.write(
                f"algo = {configurator_target.absolute()} "
                f"{self.solver.directory} {self.tmp} {self.sparkle_objective} \n"
                f"deterministic = {1 if self.solver.deterministic else 0}\n"
                f"run_obj = {self._get_performance_measure()}\n"
                f"cutoffTime = {self.solver_cutoff_time}\n"
                f"paramfile = {self.solver.get_pcs_file(pcs_port)}\n"
                f"outdir = {self.outdir_train}\n"
                f"instance_file = {self.instance_file_path}\n"
                f"test_instance_file = {self.instance_file_path}\n"
            )
            if self.cutoff_length is not None:
                file.write(f"cutoff_length = {self.cutoff_length}\n")
            if self.max_iterations is not None:
                file.write(f"iteration-limit = {self.max_iterations}\n")
            if self.wallclock_time is not None:
                file.write(f"wallclock-limit = {self.wallclock_time}\n")
            if self.cpu_time is not None:
                file.write(f"cputime-limit = {self.cpu_time}\n")
            if self.solver_calls is not None:
                file.write(f"runcount-limit = {self.solver_calls}\n")
            if self.cli_cores is not None:
                file.write(f"cli-cores = {self.cli_cores}\n")
            if self.feature_data is not None:
                file.write(f"feature_file = {self.feature_file_path}\n")
            if self.use_cpu_time_in_tunertime is not None:
                file.write(
                    f"use-cpu-time-in-tunertime = {self.use_cpu_time_in_tunertime}\n"
                )
            # We don't let SMAC do the validation
            file.write("validation = false" + "\n")
        return self.scenario_file_path
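
    # Illustrative shape of a generated scenario file (angle-bracket entries and the
    # numeric values are placeholders; the optional lines only appear when the
    # corresponding setting is not None):
    #
    #   algo = <configurator_target> <solver_directory> <tmp_directory> <objective>
    #   deterministic = 0
    #   run_obj = RUNTIME
    #   cutoffTime = 60
    #   paramfile = <solver_pcs_file>
    #   outdir = <scenario_directory>/outdir_train_configuration
    #   instance_file = <scenario_directory>/<instance_set_name>.txt
    #   test_instance_file = <scenario_directory>/<instance_set_name>.txt
    #   wallclock-limit = 600
    #   validation = false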

    def _prepare_instances(self: SMAC2Scenario) -> None:
        """Create instance list file without instance specifics."""
        self.instance_file_path.parent.mkdir(exist_ok=True, parents=True)
        with self.instance_file_path.open("w+") as file:
            for instance_path in self.instance_set._instance_paths:
                file.write(f"{instance_path}\n")

    def _create_feature_file(self: SMAC2Scenario) -> None:
        """Create CSV file from feature data."""
        self.feature_data.to_csv(self.feature_file_path, index_label="INSTANCE_NAME")
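
    # Illustrative layout of the resulting feature file (instance names and values are
    # placeholders; non-numeric feature values were mapped to -512 when the data was
    # converted in __init__):
    #
    #   INSTANCE_NAME,Feature1,Feature2,Feature3
    #   <instance_path_1>,0.25,13.0,-512.0
    #   <instance_path_2>,0.75,42.0,7.5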

    def _get_performance_measure(self: SMAC2Scenario) -> str:
        """Retrieve the performance measure of the SparkleObjective.

        Returns:
            Performance measure of the sparkle objective.
        """
        if self.sparkle_objective.time:
            return "RUNTIME"
        return "QUALITY"

    def serialise(self: SMAC2Scenario) -> dict:
        """Transform ConfigurationScenario to dictionary format."""
        return {
            "number_of_runs": self.number_of_runs,
            "solver_calls": self.solver_calls,
            "cpu_time": self.cpu_time,
            "wallclock_time": self.wallclock_time,
            "solver_cutoff_time": self.solver_cutoff_time,
            "cutoff_length": self.cutoff_length,
            "max_iterations": self.max_iterations,
            "sparkle_objective": self.sparkle_objective.name,
            "feature_data": str(self.feature_file_path),
            "use_cpu_time_in_tunertime": self.use_cpu_time_in_tunertime,
        }

    @staticmethod
    def from_file(scenario_file: Path) -> SMAC2Scenario:
        """Reads a scenario file and initialises an SMAC2Scenario."""
        config = {
            keyvalue[0]: keyvalue[1]
            for keyvalue in (
                line.strip().split(" = ", maxsplit=1)
                for line in scenario_file.open().readlines()
                if line.strip() != ""
            )
        }

        # Collect relevant settings (keys match those written by create_scenario_file)
        cpu_time = int(config["cputime-limit"]) if "cputime-limit" in config else None
        wallclock_limit = (
            int(config["wallclock-limit"]) if "wallclock-limit" in config else None
        )
        solver_calls = (
            int(config["runcount-limit"]) if "runcount-limit" in config else None
        )
        max_iterations = (
            int(config["iteration-limit"]) if "iteration-limit" in config else None
        )
        use_cpu_time_in_tunertime = (
            config["use-cpu-time-in-tunertime"]
            if "use-cpu-time-in-tunertime" in config
            else None
        )
        cli_cores = config["cli-cores"] if "cli-cores" in config else None

        _, solver_path, _, objective_str = config["algo"].split(" ")
        objective = resolve_objective(objective_str)
        solver = Solver(Path(solver_path.strip()))
        # Extract the instance set from the instance file
        instance_file_path = Path(config["instance_file"])
        instance_set_path = Path(instance_file_path.open().readline().strip()).parent
        instance_set = Instance_Set(Path(instance_set_path))
        results_folder = scenario_file.parent / "results"
        run_result_files = [p for p in results_folder.iterdir() if p.is_file()]
        number_of_runs = len(run_result_files)
        feature_data_path = None
        if "feature_file" in config:
            feature_data_path = Path(config["feature_file"])
        # Get the timestamp from the scenario directory name
        timestamp = scenario_file.parent.name.split("_")[-1]
        return SMAC2Scenario(
            solver,
            instance_set,
            [objective],
            number_of_runs,
            instance_file_path.parent.parent,
            solver_calls,
            max_iterations,
            cpu_time,
            wallclock_limit,
            int(config["cutoffTime"]),
            config.get("cutoff_length"),
            cli_cores,
            use_cpu_time_in_tunertime,
            feature_data_path,
            timestamp,
        )
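
# Usage sketch (illustrative only): reconstructing a scenario from a previously
# written scenario file and inspecting it; the path is a placeholder.
#
#   scenario_file = Path("Output/Configuration/<scenario_dir>/scenario.txt")
#   scenario = SMAC2Scenario.from_file(scenario_file)
#   print(scenario.serialise())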