Coverage for sparkle/configurator/implementations/irace.py: 50% (170 statements)

"""Configurator classes to implement IRACE in Sparkle."""
from __future__ import annotations
import shutil
import subprocess
from pathlib import Path

from sparkle.configurator.configurator import Configurator, ConfigurationScenario
from sparkle.solver import Solver
from sparkle.structures import PerformanceDataFrame, FeatureDataFrame
from sparkle.instance import InstanceSet, Instance_Set
from sparkle.types import SparkleObjective, resolve_objective

from runrunner import Runner, Run


class IRACE(Configurator):
    """Class for IRACE configurator."""
    configurator_path = Path(__file__).parent.parent.parent.resolve() /\
        "Components/irace-v4.2.0"
    configurator_package = configurator_path / "irace_4.2.0.tar"
    # NOTE: There are possible dependencies that we do not install here.
    # TODO: Determine if we should add them or not.
    package_dependencies = ["codetools_0.2-20.tar", "data.table_1.16.4.tar",
                            "matrixStats_1.5.0.tar", "spacefillr_0.3.3.tar"]
    configurator_executable = configurator_path / "irace" / "bin" / "irace"
    configurator_ablation_executable = configurator_path / "irace" / "bin" / "ablation"
    configurator_target = configurator_path / "irace_target_algorithm.py"

    version = "4.2.0"
    full_name = "Iterated Racing for Automatic Algorithm Configuration"

    def __init__(self: Configurator,
                 output_path: Path,
                 base_dir: Path,
                 ) -> None:
        """Initialize IRACE configurator."""
        output_path = output_path / IRACE.__name__
        output_path.mkdir(parents=True, exist_ok=True)
        super().__init__(output_path=output_path,
                         base_dir=base_dir,
                         tmp_path=output_path / "tmp",
                         multi_objective_support=False)

    @property
    def name(self: IRACE) -> str:
        """Returns the name of the configurator."""
        return IRACE.__name__

    @staticmethod
    def scenario_class() -> ConfigurationScenario:
        """Returns the IRACE scenario class."""
        return IRACEScenario

    def configure(self: IRACE,
                  scenario: ConfigurationScenario,
                  data_target: PerformanceDataFrame,
                  validate_after: bool = True,
                  sbatch_options: list[str] = [],
                  slurm_prepend: str | list[str] | Path = None,
                  num_parallel_jobs: int = None,
                  base_dir: Path = None,
                  run_on: Runner = Runner.SLURM) -> Run:
        """Start configuration job.

        Args:
            scenario: ConfigurationScenario to execute.
            data_target: PerformanceDataFrame in which to store the found
                configurations.
            validate_after: Whether to validate the configuration on the training set
                afterwards or not.
            sbatch_options: List of Slurm batch options to use.
            slurm_prepend: Slurm script to prepend to the sbatch script.
            num_parallel_jobs: The maximum number of jobs to run in parallel.
            base_dir: The base_dir of RunRunner where the sbatch scripts will be
                placed.
            run_on: On which platform to run the jobs. Default: Slurm.

        Returns:
            A RunRunner Run object.
        """
        scenario.create_scenario()
        output_csv = scenario.validation / "configurations.csv"
        output_csv.parent.mkdir(exist_ok=True, parents=True)

        # Create one IRACE command per configurator run.
        # We set the seed over the last n run ids in the dataframe.
        seeds = data_target.run_ids[data_target.num_runs - scenario.number_of_runs:]
        output_files = [
            scenario.results_directory.absolute() / f"output_{job_idx}.Rdata"
            for job_idx in seeds]
        cmds = [f"python3 {Configurator.configurator_cli_path.absolute()} "
                f"{IRACE.__name__} {output_path} {data_target.csv_filepath} "
                f"{scenario.scenario_file_path} {seed} "
                f"{IRACE.configurator_executable.absolute()} "
                f"--scenario {scenario.scenario_file_path} "
                f"--log-file {output_path} "
                f"--seed {seed}" for seed, output_path in zip(seeds, output_files)]
        return super().configure(
            configuration_commands=cmds,
            data_target=data_target,
            output=output_files,
            scenario=scenario,
            sbatch_options=sbatch_options,
            slurm_prepend=slurm_prepend,
            validation_ids=seeds if validate_after else None,
            num_parallel_jobs=num_parallel_jobs,
            base_dir=base_dir,
            run_on=run_on
        )
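
    # Illustrative usage sketch (not part of the original module; `solver`,
    # `instance_set`, `performance_data` and the "PAR10" objective are assumed
    # placeholders, and the usual entry point is Sparkle's CLI rather than a
    # direct call):
    #
    #   irace = IRACE(output_path=Path("Output"), base_dir=Path("Tmp"))
    #   scenario = IRACEScenario(solver, instance_set,
    #                            sparkle_objectives=[resolve_objective("PAR10")],
    #                            parent_directory=irace.output_path,
    #                            number_of_runs=2, solver_calls=100,
    #                            cutoff_time=60)
    #   run = irace.configure(scenario, data_target=performance_data,
    #                         run_on=Runner.LOCAL)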

    @staticmethod
    def organise_output(output_source: Path,
                        output_target: Path,
                        scenario: IRACEScenario,
                        run_id: int) -> None | dict:
        """Method to restructure and clean up after a single configurator call."""
        from filelock import FileLock
        get_config = subprocess.run(
            ["Rscript", "-e",
             'library("irace"); '
             f'load("{output_source}"); '
             "last <- length(iraceResults$iterationElites); "
             "id <- iraceResults$iterationElites[last]; "
             "print(getConfigurationById(iraceResults, ids = id))"],
            capture_output=True)
        r_table = get_config.stdout.decode()
        if get_config.returncode != 0 or r_table.strip() == "":
            raise RuntimeError("Failed to get configuration from IRACE file "
                               f"{output_source}:\n"
                               f"{get_config.stdout.decode()}\n"
                               f"{get_config.stderr.decode()}")

        # Join the table header and content together
        header = ""
        content = ""
        for i, line in enumerate(r_table.splitlines()):
            if i & 1 == 0:  # Even lines are headers
                header += line
            else:  # Odd lines are parameter values
                # First element is the ID
                line = " ".join(line.split(" ")[1:])
                content += line
        # First header item is the ID
        header = [x for x in header.split(" ") if x != ""][1:]
        content = [x for x in content.split(" ") if x != ""][1:]
        configuration = ""
        for parameter, value in zip(header, content):
            if parameter != ".PARENT." and value != "NA" and value != "<NA>":
                configuration += f"--{parameter} {value} "
        configuration = Solver.config_str_to_dict(configuration)
        if output_target is None or not output_target.exists():
            return configuration

        time_stamp = scenario.scenario_file_path.stat().st_mtime
        configuration["configuration_id"] =\
            f"{IRACE.__name__}_{time_stamp}_{run_id}"
        instance_names = scenario.instance_set.instance_names
        lock = FileLock(f"{output_target}.lock")
        with lock.acquire(timeout=60):
            performance_data = PerformanceDataFrame(output_target)
            # Resolve absolute path to Solver column
            solver = [s for s in performance_data.solvers
                      if Path(s).name == scenario.solver.name][0]
            # The instance paths in the instance set are absolute,
            # so match the performance data instances by name only.
            instances = [instance for instance in performance_data.instances
                         if Path(instance).name in instance_names]
            # We do not set the seed in the dataframe,
            # as that should be part of the configuration.
            performance_data.set_value(
                value=[str(configuration)],
                solver=solver,
                instance=instances,
                objective=None,
                run=run_id,
                solver_fields=[PerformanceDataFrame.column_configuration]
            )
            performance_data.save_csv()
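
    # Note on the parsing above (an illustration inferred from the code, not a
    # verified IRACE transcript): the R print() of the configuration is assumed
    # to interleave header and value lines when the data.frame wraps, e.g.:
    #
    #      .ID. param1 param2 .PARENT.
    #   1    42    0.5     on       NA
    #
    # Even lines provide the column names, odd lines the values (prefixed by the
    # row label); the ID/.PARENT. columns and NA entries are discarded and the
    # rest is rebuilt as a "--param value" string for Solver.config_str_to_dict.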

    def get_status_from_logs(self: Configurator) -> None:
        """Method to scan the log files of the configurator for warnings."""
        raise NotImplementedError


class IRACEScenario(ConfigurationScenario):
    """Class for IRACE scenario."""

    def __init__(self: ConfigurationScenario,
                 solver: Solver,
                 instance_set: InstanceSet,
                 sparkle_objectives: list[SparkleObjective],
                 parent_directory: Path,
                 number_of_runs: int = None, solver_calls: int = None,
                 cutoff_time: int = None,
                 max_time: int = None,
                 budget_estimation: float = None,
                 first_test: int = None,
                 mu: int = None,
                 max_iterations: int = None,
                 feature_data: FeatureDataFrame = None,
                 ) -> None:
        """Initialize scenario paths and names.

        Args:
            solver: Solver that should be configured.
            instance_set: Instances object for the scenario.
            sparkle_objectives: SparkleObjectives used for each run of the
                configuration. Will be simplified to the first objective.
            parent_directory: Path where the scenario files will be placed.
            number_of_runs: The number of configurator runs to perform
                for configuring the solver.
            solver_calls: The number of times the solver is called for each
                configuration run. [maxExperiments]
            cutoff_time: The maximum time allowed for each individual run during
                configuration.
            max_time: The time budget (CPU) allocated for the sum of solver calls
                done by the configurator in seconds. [maxTime]
            budget_estimation: Fraction (smaller than 1) of the budget used to
                estimate the mean computation time of a configuration. Only used
                when maxTime > 0. Default: Computed as cutoff_time / max_time.
                [budgetEstimation]
            first_test: Specifies how many instances are evaluated before the first
                elimination test. IRACE default: 5. [firstTest]
            mu: Parameter used to define the number of configurations sampled and
                evaluated at each iteration. IRACE default: 5. [mu]
            max_iterations: Maximum number of iterations to be executed. Each
                iteration involves the generation of new configurations and the use
                of racing to select the best configurations. By default (with 0),
                irace calculates a minimum number of iterations as
                N^iter = ⌊2 + log2 N^param⌋, where N^param is the number of
                non-fixed parameters to be tuned. Setting this parameter may make
                irace stop sooner than it should without using all the available
                budget. We recommend using the default value. [nbIterations]
            feature_data: FeatureDataFrame object with the feature data.
                Currently not supported by IRACE.
        """
        """
        Other possible arguments that are not yet added to Sparkle:
        --test-num-elites         Number of elite configurations returned by irace
                                  that will be tested if test instances are
                                  provided. Default: 1.
        --test-iteration-elites   Enable/disable testing the elite configurations
                                  found at each iteration. Default: 0.
        --test-type               Statistical test used for elimination. The
                                  default value selects t-test if capping is
                                  enabled or F-test, otherwise. Valid values are:
                                  F-test (Friedman test), t-test (pairwise t-tests
                                  with no correction), t-test-bonferroni (t-test
                                  with Bonferroni's correction for multiple
                                  comparisons), t-test-holm (t-test with Holm's
                                  correction for multiple comparisons).
        --each-test               Number of instances evaluated between
                                  elimination tests. Default: 1.
        --load-balancing          Enable/disable load-balancing when executing
                                  experiments in parallel. Load-balancing makes
                                  better use of computing resources, but increases
                                  communication overhead. If this overhead is
                                  large, disabling load-balancing may be faster.
                                  Default: 1.
        --mpi                     Enable/disable MPI. Use Rmpi to execute
                                  targetRunner in parallel (parameter parallel is
                                  the number of slaves). Default: 0.
        --batchmode               Specify how irace waits for jobs to finish when
                                  targetRunner submits jobs to a batch cluster:
                                  sge, pbs, torque, slurm or htcondor.
                                  targetRunner must submit jobs to the cluster
                                  using, for example, qsub. Default: 0.
        --digits                  Maximum number of decimal places that are
                                  significant for numerical (real) parameters.
                                  Default: 4.
        --soft-restart            Enable/disable the soft restart strategy that
                                  avoids premature convergence of the
                                  probabilistic model. Default: 1.
        --soft-restart-threshold  Soft restart threshold value for numerical
                                  parameters. If NA, NULL or "", it is computed
                                  as 10^-digits.
        -e,--elitist              Enable/disable elitist irace. Default: 1.
        --elitist-new-instances   Number of instances added to the execution list
                                  before previous instances in elitist irace.
                                  Default: 1.
        --elitist-limit           In elitist irace, maximum number per race of
                                  elimination tests that do not eliminate a
                                  configuration. Use 0 for no limit. Default: 2.
        --capping                 Enable the use of adaptive capping, a technique
                                  designed for minimizing the computation time of
                                  configurations. This is only available when
                                  elitist is active. Default: 0.
        --capping-type            Measure used to obtain the execution bound from
                                  the performance of the elite configurations:
                                  median, mean, worst, best. Default: median.
        --bound-type              Method to calculate the mean performance of
                                  elite configurations: candidate or instance.
                                  Default: candidate.
        --bound-max               Maximum execution bound for targetRunner. It
                                  must be specified when capping is enabled.
                                  Default: 0.
        --bound-digits            Precision used for calculating the execution
                                  time. It must be specified when capping is
                                  enabled. Default: 0.
        --bound-par               Penalization constant for timed out executions
                                  (executions that reach boundMax execution time).
                                  Default: 1.
        --bound-as-timeout        Replace the configuration cost of bounded
                                  executions with boundMax. Default: 1.
        --postselection           Percentage of the configuration budget used to
                                  perform a postselection race of the best
                                  configurations of each iteration after the
                                  execution of irace. Default: 0.
        --iterations              Maximum number of iterations. Default: 0.
        --experiments-per-iteration  Number of runs of the target algorithm per
                                  iteration. Default: 0.
        --min-survival            Minimum number of configurations needed to
                                  continue the execution of each race (iteration).
                                  Default: 0.
        --num-configurations      Number of configurations to be sampled and
                                  evaluated at each iteration. Default: 0.
        --confidence              Confidence level for the elimination test.
                                  Default: 0.95.
        """
        super().__init__(solver, instance_set, sparkle_objectives, parent_directory)
        self.solver = solver
        self.instance_set = instance_set
        if sparkle_objectives is not None:
            self.sparkle_objective = sparkle_objectives[0]
        else:
            self.sparkle_objective = None

        if feature_data is not None:
            print("WARNING: Instance features currently not supported by IRACE.")

        self.number_of_runs = number_of_runs
        self.solver_calls = solver_calls if solver_calls and solver_calls > 0 else None
        self.max_time = max_time if max_time and max_time > 0 else None
        self.cutoff_time = cutoff_time
        self.budget_estimation = budget_estimation
        self.first_test = first_test
        self.mu = mu
        self.max_iterations = max_iterations

        # Pathing
        self.instance_file_path = self.directory / f"{self.instance_set.name}.txt"
        self.tmp = self.directory / "tmp"
        self.validation = self.directory / "validation"
        self.results_directory = self.directory / "results"
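
    # Illustrative sketch (assumption, not part of the original module; `solver`,
    # `instance_set`, `objective` and `parent_dir` are placeholders): a scenario
    # is typically given exactly one budget, either `solver_calls`
    # (maxExperiments) or `max_time` (maxTime), e.g.:
    #
    #   # Run-count budget: at most 500 target algorithm runs per configurator run
    #   IRACEScenario(solver, instance_set, [objective], parent_dir,
    #                 number_of_runs=5, solver_calls=500, cutoff_time=60)
    #   # Time budget: 3600s CPU; budgetEstimation is then derived as 60 / 3600
    #   IRACEScenario(solver, instance_set, [objective], parent_dir,
    #                 number_of_runs=5, max_time=3600, cutoff_time=60)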

    def create_scenario(self: IRACEScenario) -> None:
        """Create scenario with solver and instances in the parent directory.

        This prepares all the necessary subdirectories related to configuration.
        Removes any existing directory if it overlaps with the scenario name.
        """
        # Set up directories
        shutil.rmtree(self.directory, ignore_errors=True)  # Clear directory
        self.directory.mkdir(exist_ok=True, parents=True)
        self.tmp.mkdir(exist_ok=True)
        self.validation.mkdir(exist_ok=True)
        self.results_directory.mkdir(exist_ok=True)

        with self.instance_file_path.open("w+") as file:
            for instance_path in self.instance_set._instance_paths:
                file.write(f"{instance_path.name}\n")
        self.create_scenario_file()
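
    # Illustrative note: the instance file written above lists one instance file
    # name per line (the names below are placeholders), e.g.:
    #
    #   instance_001.cnf
    #   instance_002.cnf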

    def create_scenario_file(self: ConfigurationScenario) -> Path:
        """Create a file from the IRACE scenario.

        Returns:
            Path to the created file.
        """
        from sparkle.tools.parameters import PCSConvention
        solver_path = self.solver.directory.absolute()
        pcs_path = self.solver.get_pcs_file(port_type=PCSConvention.IRACE).absolute()
        with self.scenario_file_path.open("w") as file:
            file.write(
                f'execDir = "{self.directory.absolute()}"\n'
                'targetRunnerLauncher = "python3"\n'
                f'targetRunner = "{IRACE.configurator_target.absolute()}"\n'
                'targetCmdline = "{targetRunner} '
                f"{solver_path} {self.sparkle_objective} {self.cutoff_time} "
                '{configurationID} {instanceID} {seed} {instance} {targetRunnerArgs}"\n'
                f"deterministic = {1 if self.solver.deterministic else 0}\n"
                f'parameterFile = "{pcs_path.absolute()}"\n'
                f'trainInstancesDir = "{self.instance_set.directory.absolute()}"\n'
                f'trainInstancesFile = "{self.instance_file_path.absolute()}"\n'
                "debugLevel = 1\n"  # The verbosity level of IRACE
            )
            if self.solver_calls is not None:
                file.write(f"maxExperiments = {self.solver_calls}\n")
            elif self.max_time is not None:
                file.write(f"maxTime = {self.max_time}\n")
            if self.solver_calls is not None and self.max_time is not None:
                print("WARNING: Both solver calls and max time specified for "
                      "scenario. This is not supported by IRACE, defaulting to "
                      "solver calls.")
            elif self.solver_calls is None and self.max_time is None:
                print("WARNING: Neither solver calls nor max time specified. "
                      "Either budget is required for the IRACE scenario.")
            if self.max_time is not None and self.budget_estimation is None:
                # Auto estimate the budget fraction
                if self.cutoff_time < self.max_time:
                    self.budget_estimation = self.cutoff_time / self.max_time
                    file.write(f"budgetEstimation = {self.budget_estimation}\n")
            if self.first_test is not None:
                file.write(f"firstTest = {self.first_test}\n")
            if self.mu is not None:
                file.write(f"mu = {self.mu}\n")
            if self.max_iterations is not None:
                file.write(f"nbIterations = {self.max_iterations}\n")
        print("Verifying contents of IRACE scenario file and testing solver call...")
        check_file = subprocess.run(
            [f"{IRACE.configurator_executable.absolute()}",
             "-s", f"{self.scenario_file_path.absolute()}", "--check"],
            capture_output=True)
        if check_file.returncode != 0:
            stdout_msg = "\n".join([
                line for line in check_file.stdout.decode().splitlines()
                if not line.startswith("#")])
            print("An error occurred in the IRACE scenario file:\n",
                  self.scenario_file_path.open("r").read(),
                  stdout_msg, "\n",
                  check_file.stderr.decode())
            return None
        print("IRACE scenario file is valid.")
        return self.scenario_file_path
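
    # Illustrative sketch of a generated scenario file (the paths, the PAR10
    # objective, the 60s cutoff, the PCS file name and the budget value are
    # placeholders; the targetCmdline entry is a single line in the actual file):
    #
    #   execDir = "/.../Output/IRACE/scenario_name"
    #   targetRunnerLauncher = "python3"
    #   targetRunner = "/.../Components/irace-v4.2.0/irace_target_algorithm.py"
    #   targetCmdline = "{targetRunner} /.../solver PAR10 60 {configurationID}
    #                    {instanceID} {seed} {instance} {targetRunnerArgs}"
    #   deterministic = 0
    #   parameterFile = "/.../solver/parameters.pcs"
    #   trainInstancesDir = "/.../Instances/instance_set"
    #   trainInstancesFile = "/.../instance_set.txt"
    #   debugLevel = 1
    #   maxExperiments = 500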

    def serialize(self: IRACEScenario) -> dict:
        """Serialize the IRACE scenario."""
        return {
            "number_of_runs": self.number_of_runs,
            "solver_calls": self.solver_calls,
            "max_time": self.max_time,
            "cutoff_time": self.cutoff_time,
            "budget_estimation": self.budget_estimation,
            "first_test": self.first_test,
            "mu": self.mu,
            "max_iterations": self.max_iterations,
        }

    @staticmethod
    def from_file(scenario_file: Path) -> IRACEScenario:
        """Reads scenario file and initialises IRACEScenario."""
        scenario_dict = {keyvalue[0]: keyvalue[1]
                         for keyvalue in (line.split(" = ", maxsplit=1)
                                          for line in scenario_file.open().readlines()
                                          if line.strip() != "")}
        _, solver_path, objective, cutoff, _, _, _, _, _ =\
            scenario_dict.pop("targetCmdline").split(" ")
        scenario_dict["sparkle_objectives"] = [resolve_objective(objective)]
        scenario_dict["cutoff_time"] = int(cutoff)
        scenario_dict["parent_directory"] = scenario_file.parent.parent
        scenario_dict["number_of_runs"] =\
            len([p for p in (scenario_file.parent / "results").iterdir()])
        scenario_dict.pop("targetRunner")
        scenario_dict.pop("execDir")
        scenario_dict.pop("targetRunnerLauncher")
        scenario_dict.pop("deterministic")
        scenario_dict.pop("parameterFile")
        scenario_dict.pop("debugLevel")
        instance_set_path =\
            Path(scenario_dict.pop("trainInstancesDir").strip().strip('"'))
        instance_set = Instance_Set(instance_set_path)
        solver = Solver(Path(solver_path.strip()))
        scenario_dict.pop("trainInstancesFile")
        # Replace IRACE option names with the scenario's keyword argument names
        if "budgetEstimation" in scenario_dict:
            scenario_dict["budget_estimation"] =\
                float(scenario_dict.pop("budgetEstimation"))
        if "firstTest" in scenario_dict:
            scenario_dict["first_test"] = int(scenario_dict.pop("firstTest"))
        if "mu" in scenario_dict:
            scenario_dict["mu"] = int(scenario_dict.pop("mu"))
        if "nbIterations" in scenario_dict:
            scenario_dict["max_iterations"] = int(scenario_dict.pop("nbIterations"))
        if "maxExperiments" in scenario_dict:
            scenario_dict["solver_calls"] = int(scenario_dict.pop("maxExperiments"))
        if "maxTime" in scenario_dict:
            scenario_dict["max_time"] = int(scenario_dict.pop("maxTime"))

        return IRACEScenario(solver, instance_set, **scenario_dict)
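
# Illustrative round-trip sketch (assumption, not part of the original module):
# once `scenario.create_scenario()` has written the scenario file, an equivalent
# scenario object can be rebuilt from disk:
#
#   restored = IRACEScenario.from_file(scenario.scenario_file_path)
#   assert restored.cutoff_time == scenario.cutoff_time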