Coverage for src/sparkle/configurator/implementations/smac2.py: 73%

1"""Configurator classes to implement SMAC2 in Sparkle."""

3from __future__ import annotations

4from pathlib import Path

5import shutil

6import math

7import random

9import pandas as pd

11from runrunner import Runner, Run

13from sparkle.tools.parameters import PCSConvention

14from sparkle.configurator.configurator import Configurator, ConfigurationScenario

15from sparkle.solver import Solver

16from sparkle.structures import PerformanceDataFrame, FeatureDataFrame

17from sparkle.instance import InstanceSet, Instance_Set

18from sparkle.types import SparkleObjective, resolve_objective

class SMAC2(Configurator):
    """Class for SMAC2 (Java) configurator."""

    # All SMAC2 resources are expected in a "SMAC2" directory next to this module.
    configurator_path = Path(__file__).parent.resolve() / "SMAC2"
    configurator_executable = configurator_path / "smac"
    configurator_target = configurator_path / "smac2_target_algorithm.py"

    full_name = "Sequential Model-based Algorithm Configuration"
    version = "2.10.03"

    def __init__(self: SMAC2) -> None:
        """Initialise the SMAC2 configurator, Java SMAC V2.10.03."""
        super().__init__(multi_objective_support=False)

    @property
    def name(self: SMAC2) -> str:
        """Returns the name of the configurator."""
        return SMAC2.__name__

    @staticmethod
    def scenario_class() -> type[ConfigurationScenario]:
        """Returns the SMAC2 scenario class (the class itself, not an instance)."""
        return SMAC2Scenario

    @staticmethod
    def check_requirements(verbose: bool = False) -> bool:
        """Check that SMAC2 is installed.

        Args:
            verbose: If True, emit a warning for every missing requirement.

        Returns:
            True if both a ``java`` binary is on the PATH and the SMAC2
            executable exists, False otherwise.
        """
        import warnings

        # Both checks are always evaluated so that, in verbose mode, every
        # missing requirement is reported rather than only the first one.
        if no_java := shutil.which("java") is None:
            if verbose:
                warnings.warn(
                    "SMAC2 requires Java 1.8.0_402, but Java is not installed. "
                    "Please ensure Java is installed."
                )
        if no_smac := not SMAC2.configurator_executable.exists():
            if verbose:
                warnings.warn(
                    "SMAC2 executable not found. Please ensure SMAC2 is installed "
                    f"in the expected Path ({SMAC2.configurator_path})."
                )
        return not (no_java or no_smac)

    @staticmethod
    def download_requirements(
        smac2_zip_url: str = "https://github.com/ADA-research/Sparkle/raw/refs/heads/"
        "development/Resources/Configurators/SMAC2-v2.10.03.zip",
    ) -> None:
        """Download SMAC2 and unpack it into the configurator directory.

        Does nothing if the SMAC2 executable already exists.

        Args:
            smac2_zip_url: URL of the zip archive containing SMAC2.
        """
        if SMAC2.configurator_executable.exists():
            return  # Already installed
        from urllib.request import urlopen
        import zipfile
        import io

        # Context managers guarantee the HTTP response and the archive are
        # closed, also when reading or extracting fails.
        with urlopen(smac2_zip_url, timeout=60) as response:
            archive_bytes = response.read()
        with zipfile.ZipFile(io.BytesIO(archive_bytes)) as archive:
            archive.extractall(SMAC2.configurator_path)
        # Ensure execution rights
        SMAC2.configurator_executable.chmod(0o755)

    def configure(
        self: SMAC2,
        scenario: SMAC2Scenario,
        data_target: PerformanceDataFrame,
        validate_after: bool = True,
        sbatch_options: list[str] = None,
        slurm_prepend: str | list[str] | Path = None,
        num_parallel_jobs: int = None,
        base_dir: Path = None,
        run_on: Runner = Runner.SLURM,
    ) -> list[Run]:
        """Start configuration job.

        Args:
            scenario: ConfigurationScenario object
            data_target: PerformanceDataFrame where to store the found configurations
            validate_after: Whether the configurations should be validated on the
                train set afterwards.
            sbatch_options: List of slurm batch options to use.
                Defaults to an empty list.
            slurm_prepend: Slurm script to prepend to the sbatch.
            num_parallel_jobs: The maximum number of jobs to run parallel.
            base_dir: The path where the sbatch scripts will be created for Slurm.
            run_on: On which platform to run the jobs. Default: Slurm.

        Returns:
            A list of RunRunner Run objects, as returned by the parent class.
        """
        # A fresh list per call: avoids sharing one mutable default between calls.
        sbatch_options = [] if sbatch_options is None else list(sbatch_options)

        scenario.create_scenario()
        configuration_ids = scenario.configuration_ids

        # The maximum seed size for SMAC2 is 999 999 999
        seeds = [random.randint(0, 10**9 - 1) for _ in range(scenario.number_of_runs)]
        output = [
            f"{(scenario.results_directory).absolute()}/"
            f"{scenario.name}_{config_id}_smac.txt"
            for config_id in configuration_ids
        ]
        # One command per configurator run: the Sparkle configurator CLI wraps
        # the actual SMAC2 call (everything from the executable onwards).
        cmds = [
            f"python3 {Configurator.configurator_cli_path.absolute()} "
            f"{SMAC2.__name__} {output_file} {data_target.csv_filepath} "
            f"{scenario.scenario_file_path} {configuration_id} "
            f"{SMAC2.configurator_executable.absolute()} "
            f"--scenario-file {scenario.scenario_file_path} "
            f"--seed {seed} "
            for output_file, configuration_id, seed in zip(
                output, configuration_ids, seeds
            )
        ]
        if num_parallel_jobs is not None:
            # num_parallel_jobs is an upper bound; requesting more parallel
            # jobs than there are commands is pointless, so cap it.
            # NOTE(review): this previously used max(), which silently raised
            # the user's limit to len(cmds) and made the cap ineffective.
            num_parallel_jobs = min(num_parallel_jobs, len(cmds))
        return super().configure(
            configuration_commands=cmds,
            data_target=data_target,
            output=output,
            num_parallel_jobs=num_parallel_jobs,
            scenario=scenario,
            configuration_ids=configuration_ids,
            validate_after=validate_after,
            sbatch_options=sbatch_options,
            slurm_prepend=slurm_prepend,
            base_dir=base_dir,
            run_on=run_on,
        )

    @staticmethod
    def organise_output(
        output_source: Path,
        output_target: Path,
        scenario: SMAC2Scenario,
        configuration_id: str,
    ) -> None | dict:
        """Retrieves configuration from SMAC file and places them in output.

        Args:
            output_source: The SMAC2 output file to read.
            output_target: Passed on to Configurator.save_configuration.
            scenario: The scenario the configuration belongs to.
            configuration_id: Identifier stored with the configuration.

        Returns:
            The result of Configurator.save_configuration.

        Raises:
            ValueError: If the output file contains no call to the target
                algorithm, so no configuration can be extracted.
        """
        call_key = SMAC2.configurator_target.name
        with output_source.open("r") as output_file:
            lines = output_file.readlines()

        configuration_str = None
        # Last line describing a call is the best found configuration
        for line in reversed(lines):
            if call_key in line:
                call_str = line.split(call_key, maxsplit=1)[1].strip()
                # The configuration appears after the first 8 arguments. Three
                # of those come from the scenario file's "algo" line; the other
                # five are presumably the per-call arguments added by SMAC2
                # (TODO confirm against the SMAC2 manual).
                configuration_str = call_str.split(" ", 8)[-1]
                break
        if configuration_str is None:
            raise ValueError(
                f"No call to '{call_key}' found in SMAC2 output '{output_source}', "
                "cannot extract a configuration."
            )
        configuration = Solver.config_str_to_dict(configuration_str)
        configuration["configuration_id"] = configuration_id
        return Configurator.save_configuration(
            scenario, configuration_id, configuration, output_target
        )

    @staticmethod
    def get_smac_run_obj(objective: SparkleObjective) -> str:
        """Return the SMAC run objective based on the Performance Measure.

        Args:
            objective: The objective to translate.

        Returns:
            "RUNTIME" if the objective's ``time`` attribute is truthy,
            "QUALITY" otherwise.
        """
        if objective.time:
            return "RUNTIME"
        return "QUALITY"

    def get_status_from_logs(self: SMAC2, base_dir: Path) -> None:
        """Method to scan the log files of the configurator for warnings.

        Prints, per scenario directory in base_dir, whether non-empty
        ``log-warn*`` files exist. Does nothing if base_dir does not exist.

        Args:
            base_dir: Directory containing one sub directory per scenario.
        """
        if not base_dir.exists():
            return
        print(
            f"Checking the log files of configurator {type(self).__name__} for "
            "warnings..."
        )
        scenarios = [f for f in base_dir.iterdir() if f.is_dir()]
        for scenario in scenarios:
            log_dir = (
                scenario / "outdir_train_configuration" / (scenario.name + "_scenario")
            )
            # Collect all non empty log files paths
            warn_files = [f for f in log_dir.glob("log-warn*") if f.stat().st_size > 0]
            if len(warn_files) > 0:
                print(
                    f"Scenario {scenario.name} has {len(warn_files)} warning(s), see "
                    "the following log file(s) for more information:"
                )
                for log_file in warn_files:
                    print(f"\t-{log_file}")
            else:
                print(f"Scenario {scenario.name} has no warnings.")

203

204

205class SMAC2Scenario(ConfigurationScenario):

206 """Class to handle SMAC2 configuration scenarios."""

207

208 def __init__(

209 self: SMAC2Scenario,

210 solver: Solver,

211 instance_set: InstanceSet,

212 sparkle_objectives: list[SparkleObjective],

213 number_of_runs: int,

214 parent_directory: Path,

215 solver_calls: int = None,

216 max_iterations: int = None,

217 cpu_time: int = None,

218 wallclock_time: int = None,

219 solver_cutoff_time: int = None,

220 target_cutoff_length: str = None,

221 cli_cores: int = None,

222 use_cpu_time_in_tunertime: bool = None,

223 feature_data: FeatureDataFrame | Path = None,

224 timestamp: str = None,

225 ) -> None:

226 """Initialize scenario paths and names.

227

228 Args:

229 solver: Solver that should be configured.

230 instance_set: Instances object for the scenario.

231 sparkle_objectives: SparkleObjectives used for each run of the configuration.

232 Will be simplified to the first objective.

233 number_of_runs: The number of configurator runs to perform

234 for configuring the solver.

235 parent_directory: Directory in which the scenario should be created.

236 solver_calls: The number of times the solver is called for each

237 configuration run

238 max_iterations: The maximum number of iterations allowed for each

239 configuration run. [iteration-limit, numIterations, numberOfIterations]

240 cpu_time: The time budget allocated for each configuration run. (cpu)

241 wallclock_time: The time budget allocated for each configuration run.

242 (wallclock)

243 solver_cutoff_time: The maximum time allowed for each solver call run during

244 configuration.

245 target_cutoff_length: A domain specific measure of when the algorithm

246 should consider itself done.

247 cli_cores: int

248 The number of cores to use to execute runs. Defaults in SMAC2 to 1.

249 use_cpu_time_in_tunertime: Whether to calculate SMAC2's own used time for

250 budget deduction. Defaults in SMAC2 to True.

251 feature_data: If features are used, this contains the feature data.

252 If it is a FeatureDataFrame, will convert values to SMAC2 format.

253 If it is a Path, will pass the path to SMAC2.

254 Defaults to None.

255 timestamp: An optional timestamp for the directory name.

256 """

257 super().__init__(

258 solver,

259 instance_set,

260 sparkle_objectives,

261 number_of_runs,

262 parent_directory,

263 timestamp,

264 )

265 self.solver = solver

266 self.instance_set = instance_set

267

268 self.sparkle_objective = sparkle_objectives[0]

269 self.solver_calls = solver_calls

270 self.cpu_time = cpu_time

271 self.wallclock_time = wallclock_time

272 self.solver_cutoff_time = solver_cutoff_time

273 self.cutoff_length = target_cutoff_length

274 self.max_iterations = max_iterations

275 self.cli_cores = cli_cores

276 self.use_cpu_time_in_tunertime = use_cpu_time_in_tunertime

277

278 self.feature_data = feature_data

279 self._feature_file_path = None

280 if self.feature_data:

281 if isinstance(self.feature_data, FeatureDataFrame):

282 # Convert feature data to SMAC2 format

283 data_dict = {}

284 for instance in self.instance_set.instance_paths:

285 data_dict[str(instance)] = feature_data.get_instance(str(instance))

286

287 self.feature_data = pd.DataFrame.from_dict(

288 data_dict,

289 orient="index",

290 columns=[

291 f"Feature{index + 1}"

292 for index in range(feature_data.num_features)

293 ],

294 )

295

296 def map_nan(x: str) -> int:

297 """Map non-numeric values with -512 (Pre-defined by SMAC2)."""

298 if math.isnan(x):

299 return -512.0

300 try:

301 return float(x)

302 except Exception:

303 return -512.0

304

305 self.feature_data = self.feature_data.map(map_nan)

306 elif isinstance(self.feature_data, Path): # Read from Path

307 self._feature_file_path = feature_data

308 self.feature_data = pd.read_csv(self.feature_file_path, index_col=0)

309 else:

310 print(

311 f"WARNING: Feature data is of type {type(feature_data)}. "

312 "Expected FeatureDataFrame or Path."

313 )

314

315 @property

316 def instance_file_path(self: SMAC2Scenario) -> Path:

317 """Return the path of the instance file."""

318 if self.directory:

319 return self.directory / f"{self.instance_set.name}.txt"

320 return None

321

322 @property

323 def outdir_train(self: SMAC2Scenario) -> Path:

324 """Return the path of the train out directory."""

325 # SMAC2 Specific directory

326 if self.directory:

327 return self.directory / "outdir_train_configuration"

328 return None

329

330 @property

331 def feature_file_path(self: SMAC2Scenario) -> Path:

332 """Return the path of the feature file."""

333 if self._feature_file_path:

334 return self._feature_file_path

335 elif self.directory:

336 return self.directory / f"{self.instance_set.name}_features.csv"

337 else:

338 return None

339

340 @property

341 def configurator(self: SMAC2Scenario) -> SMAC2:

342 """Return the type of configurator the scenario belongs to."""

343 return SMAC2

344

345 def create_scenario(self: SMAC2Scenario) -> None:

346 """Create scenario with solver and instances in the parent directory.

347

348 This prepares all the necessary subdirectories related to configuration.

349

350 Args:

351 parent_directory: Directory in which the scenario should be created.

352 """

353 super().create_scenario()

354 self.outdir_train.mkdir()

355 self._prepare_instances()

356

357 if self.feature_data is not None:

358 self._create_feature_file()

359

360 self.create_scenario_file()

361

362 def create_scenario_file(

363 self: SMAC2Scenario,

364 configurator_target: Path = SMAC2.configurator_target,

365 pcs_port: PCSConvention = PCSConvention.SMAC,

366 ) -> Path:

367 """Create a file with the configuration scenario.

368

369 Writes supplementary information to the target algorithm (algo =) as:

370 algo = {configurator_target} {solver_directory} {sparkle_objective}

371 """

372 with self.scenario_file_path.open("w") as file:

373 file.write(

374 f"algo = {configurator_target.absolute()} "

375 f"{self.solver.directory} {self.tmp} {self.sparkle_objective} \n"

376 f"deterministic = {1 if self.solver.deterministic else 0}\n"

377 f"run_obj = {self._get_performance_measure()}\n"

378 f"cutoffTime = {self.solver_cutoff_time}\n"

379 f"paramfile = {self.solver.get_pcs_file(pcs_port)}\n"

380 f"outdir = {self.outdir_train}\n"

381 f"instance_file = {self.instance_file_path}\n"

382 f"test_instance_file = {self.instance_file_path}\n"

383 )

384 if self.cutoff_length is not None:

385 file.write(f"cutoff_length = {self.cutoff_length}\n")

386 if self.max_iterations is not None:

387 file.write(f"iteration-limit = {self.max_iterations}\n")

388 if self.wallclock_time is not None:

389 file.write(f"wallclock-limit = {self.wallclock_time}\n")

390 if self.cpu_time is not None:

391 file.write(f"cputime-limit = {self.cpu_time}\n")

392 if self.solver_calls is not None:

393 file.write(f"runcount-limit = {self.solver_calls}\n")

394 if self.cli_cores is not None:

395 file.write(f"cli-cores = {self.cli_cores}")

396 if self.feature_data is not None:

397 file.write(f"feature_file = {self.feature_file_path}\n")

398 if self.use_cpu_time_in_tunertime is not None:

399 file.write(

400 f"use-cpu-time-in-tunertime = {self.use_cpu_time_in_tunertime}\n"

401 )

402 # We don't let SMAC do the validation

403 file.write("validation = false" + "\n")

404 return self.scenario_file_path

405

406 def _prepare_instances(self: SMAC2Scenario) -> None:

407 """Create instance list file without instance specifics."""

408 self.instance_file_path.parent.mkdir(exist_ok=True, parents=True)

409 with self.instance_file_path.open("w+") as file:

410 for instance_path in self.instance_set._instance_paths:

411 file.write(f"{instance_path}\n")

412

413 def _create_feature_file(self: SMAC2Scenario) -> None:

414 """Create CSV file from feature data."""

415 self.feature_data.to_csv(self.feature_file_path, index_label="INSTANCE_NAME")

416

417 def _get_performance_measure(self: SMAC2Scenario) -> str:

418 """Retrieve the performance measure of the SparkleObjective.

419

420 Returns:

421 Performance measure of the sparkle objective

422 """

423 if self.sparkle_objective.time:

424 return "RUNTIME"

425 return "QUALITY"

426

427 def serialise(self: SMAC2Scenario) -> dict:

428 """Transform ConfigurationScenario to dictionary format."""

429 return {

430 "number_of_runs": self.number_of_runs,

431 "solver_calls": self.solver_calls,

432 "cpu_time": self.cpu_time,

433 "wallclock_time": self.wallclock_time,

434 "solver_cutoff_time": self.solver_cutoff_time,

435 "cutoff_length": self.cutoff_length,

436 "max_iterations": self.max_iterations,

437 "sparkle_objective": self.sparkle_objective.name,

438 "feature_data": str(self.feature_file_path),

439 "use_cpu_time_in_tunertime": self.use_cpu_time_in_tunertime,

440 }

441

442 @staticmethod

443 def from_file(scenario_file: Path) -> SMAC2Scenario:

444 """Reads scenario file and initalises SMAC2Scenario."""

445 config = {

446 keyvalue[0]: keyvalue[1]

447 for keyvalue in (

448 line.strip().split(" = ", maxsplit=1)

449 for line in scenario_file.open().readlines()

450 if line.strip() != ""

451 )

452 }

453

454 # Collect relevant settings

455 cpu_time = int(config["cpu_time"]) if "cpu_time" in config else None

456 wallclock_limit = (

457 int(config["wallclock-limit"]) if "wallclock-limit" in config else None

458 )

459 solver_calls = (

460 int(config["runcount-limit"]) if "runcount-limit" in config else None

461 )

462 max_iterations = (

463 int(config["iteration-limit"]) if "iteration-limit" in config else None

464 )

465 use_cpu_time_in_tunertime = (

466 config["use-cputime-in-tunertime"]

467 if "use-cputime-in-tunertime" in config

468 else None

469 )

470 cli_cores = config["cli-cores"] if "cli-cores" in config else None

471

472 _, solver_path, _, objective_str = config["algo"].split(" ")

473 objective = resolve_objective(objective_str)

474 solver = Solver(Path(solver_path.strip()))

475 # Extract the instance set from the instance file

476 instance_file_path = Path(config["instance_file"])

477 instance_set_path = Path(instance_file_path.open().readline().strip()).parent

478 instance_set = Instance_Set(Path(instance_set_path))

479 results_folder = scenario_file.parent / "results"

480 state_run_dirs = [p for p in results_folder.iterdir() if p.is_file()]

481 number_of_runs = len(state_run_dirs)

482 feature_data_path = None

483 if "feature_file" in config:

484 feature_data_path = Path(config["feature_file"])

485 # Get the timestamp from the scenario dir name

486 timestamp = scenario_file.parent.name.split("_")[-1]

487 return SMAC2Scenario(

488 solver,

489 instance_set,

490 [objective],

491 number_of_runs,

492 instance_file_path.parent.parent,

493 solver_calls,

494 max_iterations,

495 cpu_time,

496 wallclock_limit,

497 int(config["cutoffTime"]),

498 config["cutoff_length"],

499 cli_cores,

500 use_cpu_time_in_tunertime,

501 feature_data_path,

502 timestamp,

503 )