Coverage for sparkle/structures/performance

1"""Module to manage performance data files and common operations on them."""

2from __future__ import annotations

3import ast

4import copy

5from typing import Any

6import itertools

7from pathlib import Path

8import math

9import numpy as np

10import pandas as pd

12from sparkle.types import SparkleObjective, resolve_objective

15class PerformanceDataFrame(pd.DataFrame):

16 """Class to manage performance data and common operations on them."""

# Placeholder for cells (and for the instance/solver labels) that have no value
missing_value = math.nan

# Objective label used by the constructor when no objectives are supplied
missing_objective = "UNKNOWN"
# Configuration key used when a solver is added without explicit configurations
default_configuration = "Default"

# Names of the three row-index levels, in order
index_objective = "Objective"
index_instance = "Instance"
index_run = "Run"
multi_index_names = [index_objective, index_instance, index_run]

# Names of the three column-index levels, in order
column_solver = "Solver"
column_configuration = "Configuration"
column_meta = "Meta"
# Labels found on the "Meta" column level: one Value and one Seed column
# exist per (solver, configuration) pair
column_value = "Value"
column_seed = "Seed"
multi_column_names = [column_solver, column_configuration, column_meta]
multi_column_value = [column_value, column_seed]
# NOTE(review): presumably the dtypes of multi_column_value, position by
# position (Value -> str, Seed -> int) -- confirm against the users of this list
multi_column_dtypes = [str, int]

def __init__(self: PerformanceDataFrame,
             csv_filepath: Path,
             solvers: list[str] = None,
             configurations: dict[str, dict[str, dict]] = None,
             objectives: list[str | SparkleObjective] = None,
             instances: list[str] = None,
             n_runs: int = 1) -> None:
    """Initialise a PerformanceDataFrame.

    Consists of:
        - Columns indexed by (Solver, Configuration, Meta), where Meta is
          either "Seed" or "Value"
        - Rows representing the result by multi-index in order of:
            * Objective (Static, given in constructor or read from file)
            * Instance
            * Runs (Static, given in constructor or read from file)

    If ``csv_filepath`` exists, the data and the ``$``-prefixed configuration
    lines are loaded from it and all other arguments are ignored. Otherwise
    an empty frame filled with ``missing_value`` is built from the arguments
    and, if a path was given, written to it via ``save_csv``.

    Args:
        csv_filepath: If path exists, load from Path.
            Otherwise create new and save to this path.
        solvers: List of solver names to be added into the Dataframe
        configurations: The configuration keys per solver to add, structured as
            configurations[solver][config_key] = {"parameter": "value", ..}
        objectives: List of SparkleObjectives or objective names. By default None,
            in which case the single placeholder ``missing_objective`` is used.
        instances: List of instance names to be added into the Dataframe
        n_runs: The number of runs to consider per Solver/Objective/Instance comb.
    """
    if csv_filepath and csv_filepath.exists():  # Read from file
        df = pd.read_csv(csv_filepath,
                         header=[0, 1, 2], index_col=[0, 1, 2],
                         dtype={"Value": str, "Seed": int},
                         on_bad_lines="skip",
                         comment="$")  # $ For extra data lines
        super().__init__(df)
        self.csv_filepath = csv_filepath
        # Load configuration mapping: lines starting with "$" have the form
        # "$solver,config_key,<python literal of the configuration dict>"
        with self.csv_filepath.open() as f:
            configuration_lines = [line.strip().strip("$").split(",", maxsplit=2)
                                   for line in f.readlines()
                                   if line.startswith("$")]
        configurations = {s: {} for s in self.solvers}
        for solver, config_key, config in configuration_lines[1:]:  # Skip header
            # literal_eval only accepts Python literals, it does not run code
            configurations[solver][config_key] = ast.literal_eval(config.strip('"'))
    else:  # New PerformanceDataFrame
        # Initialize empty DataFrame
        run_ids = list(range(1, n_runs + 1))  # We count runs from 1
        # We always need objectives to maintain the dimensions
        if objectives is None:
            objectives = [PerformanceDataFrame.missing_objective]
        else:
            objectives = [str(o) for o in objectives]
        # We always need an instance to maintain the dimensions
        if instances is None:
            instances = [PerformanceDataFrame.missing_value]
        # We always need a solver to maintain the dimensions
        if solvers is None:
            solvers = [PerformanceDataFrame.missing_value]
        midx = pd.MultiIndex.from_product(
            [objectives, instances, run_ids],
            names=PerformanceDataFrame.multi_index_names)
        # Create the multi index tuples
        if configurations is None:
            configurations = \
                {solver: {PerformanceDataFrame.default_configuration: {}}
                 for solver in solvers}
        column_tuples = []
        # We cannot do .from_product here as config ids are per solver
        for solver in configurations.keys():
            for config_id in configurations[solver].keys():
                column_tuples.extend([
                    (solver, config_id, PerformanceDataFrame.column_seed),
                    (solver, config_id, PerformanceDataFrame.column_value)])
        mcolumns = pd.MultiIndex.from_tuples(
            column_tuples,
            names=[PerformanceDataFrame.column_solver,
                   PerformanceDataFrame.column_configuration,
                   PerformanceDataFrame.column_meta])
        # Set dtype object to avoid inferring float for categorical objectives
        super().__init__(PerformanceDataFrame.missing_value,
                         index=midx, columns=mcolumns, dtype="object")
        self.csv_filepath = csv_filepath

    # Store configuration in global attributes dictionary, see Pandas Docs
    self.attrs = configurations

    if self.index.duplicated().any():  # Combine duplicate indices
        # groupby(...).first() takes, per column, the first non-null value
        # among the rows sharing an index
        combined = self.groupby(level=[0, 1, 2]).first()
        # We keep the last to allow overwriting existing values
        duplicates = self.index[self.index.duplicated(keep="last")]
        # Remove all duplicate entries from self
        self.drop(duplicates, inplace=True)
        for d in duplicates:  # Place combined duplicates in self
            self.loc[d, :] = combined.loc[d, :]

    # Sort the index to optimize lookup speed
    self.sort_index(axis=0, inplace=True)
    self.sort_index(axis=1, inplace=True)

    if csv_filepath and not self.csv_filepath.exists():  # New Performance DataFrame
        self.save_csv()

136

137 # Properties

138

@property
def num_objectives(self: PerformanceDataFrame) -> int:
    """Count the distinct labels on the first (objective) index level."""
    objective_labels = self.index.get_level_values(0).unique()
    return objective_labels.size

143

@property
def num_instances(self: PerformanceDataFrame) -> int:
    """Count the distinct labels on the second (instance) index level."""
    instance_labels = self.index.get_level_values(1).unique()
    return instance_labels.size

148

@property
def num_runs(self: PerformanceDataFrame) -> int:
    """Count the distinct run ids on the third index level.

    Because run ids are pooled over all instances, this is the number of
    different run ids present anywhere in the frame.
    """
    run_labels = self.index.get_level_values(2).unique()
    return run_labels.size

153

@property
def num_solvers(self: PerformanceDataFrame) -> int:
    """Count the distinct labels on the first (solver) column level."""
    solver_labels = self.columns.get_level_values(0).unique()
    return solver_labels.size

158

@property
def num_solver_configurations(self: PerformanceDataFrame) -> int:
    """Return the number of (solver, configuration) pairs in the columns."""
    configuration_labels = self.columns.get_level_values(
        PerformanceDataFrame.column_configuration)
    # Each configuration owns two columns: a seed and a value
    return configuration_labels.size // 2

164

@property
def multi_objective(self: PerformanceDataFrame) -> bool:
    """Tell whether the frame holds more than one objective (MO data)."""
    objective_count = self.num_objectives
    return objective_count > 1

169

@property
def solvers(self: PerformanceDataFrame) -> list[str]:
    """Return the distinct solver labels of the columns as a list."""
    solver_level = self.columns.get_level_values(
        PerformanceDataFrame.column_solver)
    # The nan placeholder is not an actual solver, so it is filtered out
    real_solvers = solver_level.dropna()
    return real_solvers.unique().to_list()

176

@property
def configuration_ids(self: PerformanceDataFrame) -> list[str]:
    """Return the distinct configuration keys over all solvers as a list."""
    configuration_level = self.columns.get_level_values(
        PerformanceDataFrame.column_configuration)
    return configuration_level.unique().to_list()

182

@property
def configurations(self: PerformanceDataFrame) -> dict[str, dict[str, dict]]:
    """Return a deep copy of the per-solver configuration mapping in ``attrs``."""
    # A deep copy keeps callers from mutating the stored attribute
    snapshot = copy.deepcopy(self.attrs)
    return snapshot

187

@property
def objective_names(self: PerformanceDataFrame) -> list[str]:
    """Return the distinct objective labels (first index level) as a list."""
    objective_level = self.index.get_level_values(0)
    return objective_level.unique().to_list()

192

@property
def objectives(self: PerformanceDataFrame) -> list[SparkleObjective]:
    """Return each objective name passed through ``resolve_objective``."""
    return list(map(resolve_objective, self.objective_names))

197

@property
def instances(self: PerformanceDataFrame) -> list[str]:
    """Return the distinct instance labels (second index level) as a list."""
    instance_level = self.index.get_level_values(1)
    return instance_level.unique().to_list()

202

@property
def run_ids(self: PerformanceDataFrame) -> list[int]:
    """Return the distinct run ids (third index level) as a list."""
    run_level = self.index.get_level_values(2)
    return run_level.unique().to_list()

207

@property
def has_missing_values(self: PerformanceDataFrame) -> bool:
    """Tell whether any non-seed cell of the dataframe is null."""
    # Seeds are dropped first: only the remaining columns are inspected
    without_seeds = self.drop(PerformanceDataFrame.column_seed,
                              level=PerformanceDataFrame.column_meta,
                              axis=1)
    null_per_column = without_seeds.isnull().any()
    return null_per_column.any()

214

def is_missing(self: PerformanceDataFrame,
               solver: str,
               instance: str,) -> bool:
    """Check whether a solver/instance combination has any missing value.

    Args:
        solver: The solver whose columns are inspected.
        instance: The instance whose rows are inspected.

    Returns:
        True if at least one non-seed cell of the solver on the instance
        (over all objectives, runs and configurations) is null.
    """
    solver_columns = self.xs(solver, axis=1)
    instance_rows = solver_columns.xs(instance, axis=0,
                                      level=self.index_instance)
    # Seeds are not results, so they do not count as missing values
    values_only = instance_rows.drop(self.column_seed,
                                     level=self.column_meta,
                                     axis=1)
    # Convert the numpy boolean to a plain bool to match the annotation
    return bool(values_only.isnull().any().any())

225

def verify_objective(self: PerformanceDataFrame,
                     objective: str) -> str:
    """Resolve the objective to use when the caller may have omitted it.

    An explicitly given objective is returned unchanged. When it is omitted
    (None), the single objective of the frame is used; omitting it is an
    error for multi-objective data.

    Args:
        objective: The objective given by the user, or None.

    Returns:
        The given objective, the only objective of the frame, or the
        ``missing_objective`` placeholder when the frame has no objectives.

    Raises:
        ValueError: If objective is None and the frame is multi-objective.
    """
    if objective is not None:
        return objective
    if self.multi_objective:
        raise ValueError("Error: MO Data, but objective not specified.")
    if self.num_objectives == 1:
        return self.objective_names[0]
    return PerformanceDataFrame.missing_objective

246

def verify_run_id(self: PerformanceDataFrame,
                  run_id: int) -> int:
    """Resolve the run id to use when the caller may have omitted it.

    Mirrors verify_objective for the run dimension.

    Args:
        run_id: The run as specified by the user, or None.

    Returns:
        The given run id, or the first run id of the frame when none was
        given and the frame holds at most one run id.

    Raises:
        ValueError: If run_id is None and the frame holds multiple runs.
    """
    if run_id is not None:
        return run_id
    if self.num_runs > 1:
        raise ValueError("Error: Multiple run performance data, "
                         "but run not specified")
    return self.run_ids[0]

263

def verify_indexing(self: PerformanceDataFrame,
                    objective: str,
                    run_id: int) -> tuple[str, int]:
    """Resolve both the objective and the run id of an indexing request.

    Callers may leave out the Objective and Run dimensions when the frame
    only has one objective or one run. The omitted values are filled in by
    verify_objective and verify_run_id respectively.

    Args:
        objective: The given objective name
        run_id: The given run index

    Returns:
        A tuple representing the (possibly adjusted) Objective and Run index.
    """
    return self.verify_objective(objective), self.verify_run_id(run_id)

284

285 # Getters and Setters

286

def add_solver(self: PerformanceDataFrame,
               solver_name: str,
               configurations: list[(str, dict)] = None,
               initial_value: float | list[str | float] = None) -> None:
    """Add the columns of a new solver to the dataframe.

    A solver that is already present is left untouched and a warning is
    printed instead.

    Args:
        solver_name: The name of the solver to be added.
        configurations: A list of (configuration key, configuration) pairs
            for the solver. Defaults to the single default configuration.
        initial_value: The value assigned for each index of the new solver.
            A non-list value is used for both the Value and the Seed column.
            NOTE(review): a list is iterated as (value, seed) pairs, which
            does not match a flat list of length n_obj * n_inst * n_runs.
    """
    if solver_name in self.solvers:
        print(f"WARNING: Tried adding already existing solver {solver_name} to "
              f"Performance DataFrame: {self.csv_filepath}")
        return
    if not isinstance(initial_value, list):  # Single value
        initial_value = [[initial_value, initial_value]]
    if configurations is None:
        configurations = [(PerformanceDataFrame.default_configuration, {})]
    self.attrs[solver_name] = {}
    for combination in itertools.product(configurations, initial_value):
        (config_key, config), (value, seed) = combination
        seed_column = (solver_name, config_key, PerformanceDataFrame.column_seed)
        value_column = (solver_name, config_key, PerformanceDataFrame.column_value)
        self[seed_column] = seed
        self[value_column] = value
        self.attrs[solver_name][config_key] = config
    if self.num_solvers == 2:  # Remove nan solver
        nan_label = str(PerformanceDataFrame.missing_value)
        placeholder = next(
            (name for name in self.solvers if str(name) == nan_label), None)
        if placeholder is not None:
            self.remove_solver(placeholder)

318

319 def add_configuration(

320 self: PerformanceDataFrame,

321 solver: str,

322 configuration_id: str | list[str],

323 configuration: dict[str, Any] | list[dict[str, Any]] = None) -> None:

324 """Add new configurations for a solver to the dataframe.

325

326 If the key already exists, update the value.

327

328 Args:

329 solver: The name of the solver to be added.

330 configuration_id: The name of the configuration to be added.

331 configuration: The configuration to be added.

332 """

333 if not isinstance(configuration_id, list):

334 configuration_id = [configuration_id]

335 if not isinstance(configuration, list):

336 configuration = [configuration]

337 for config_id, config in zip(configuration_id, configuration):

338 if config_id not in self.get_configurations(solver):

339 self[(solver, config_id, PerformanceDataFrame.column_value)] = None

340 self[(solver, config_id, PerformanceDataFrame.column_seed)] = None

341 self.attrs[solver][config_id] = config

342 # Sort the index to optimize lookup speed

343 self.sort_index(axis=1, inplace=True)

344

def add_objective(self: PerformanceDataFrame,
                  objective_name: str,
                  initial_value: float = None) -> None:
    """Add an objective to the DataFrame.

    A row is created for every existing (instance, run) combination. An
    objective that is already present is left untouched and a warning is
    printed instead.

    Args:
        objective_name: The name of the objective to be added.
        initial_value: The value assigned to every cell of the new rows.
            Defaults to the missing value (nan).
    """
    if objective_name in self.objective_names:
        print(f"WARNING: Tried adding already existing objective {objective_name} "
              f"to Performance DataFrame: {self.csv_filepath}")
        return
    # Only substitute the default for None: 0 is a legitimate initial value
    if initial_value is None:
        initial_value = self.missing_value
    for instance, run in itertools.product(self.instances, self.run_ids):
        self.loc[(objective_name, instance, run)] = initial_value
    self.sort_index(axis=0, inplace=True)

357

def add_instance(self: PerformanceDataFrame,
                 instance_name: str,
                 initial_values: Any | list[Any] = None) -> None:
    """Add an instance to the DataFrame.

    A row is created for every existing (objective, run) combination. An
    instance that is already present is left untouched and a warning is
    printed instead.

    Args:
        instance_name: The name of the instance to be added.
        initial_values: The values assigned for each index of the new instance.
            A single value is repeated for every column. A list is used as
            the row, so it must match the number of columns.
            Defaults to the missing value (nan).
    """
    # Only substitute the default for None: 0 is a legitimate initial value
    if initial_values is None:
        initial_values = self.missing_value
    if not isinstance(initial_values, list):
        initial_values = ([initial_values]
                          * 2  # Value and Seed per target column
                          * self.num_solver_configurations)
    elif len(initial_values) == len(PerformanceDataFrame.multi_column_names):
        # NOTE(review): a list as long as the number of column levels is
        # repeated once per solver; kept as in the original, confirm intent
        initial_values = initial_values * self.num_solvers

    if instance_name in self.instances:
        print(f"WARNING: Tried adding already existing instance {instance_name} "
              f"to Performance DataFrame: {self.csv_filepath}")
        return
    # Add rows for all combinations
    for objective, run in itertools.product(self.objective_names, self.run_ids):
        self.loc[(objective, instance_name, run)] = initial_values
    if self.num_instances == 2:  # Remove nan instance
        for instance in self.instances:
            if not isinstance(instance, str) and math.isnan(instance):
                self.remove_instances(instance)
                break
    # Sort the index to optimize lookup speed
    self.sort_index(axis=0, inplace=True)

390

def add_runs(self: PerformanceDataFrame,
             num_extra_runs: int,
             instance_names: list[str] = None,
             initial_values: Any | list[Any] = None) -> None:
    """Add runs to the DataFrame.

    New run ids continue after the current number of runs of each
    (objective, instance) combination.

    Args:
        num_extra_runs: The number of runs to be added.
        instance_names: The instances for which runs are to be added.
            By default None, which means runs are added to all instances.
        initial_values: The initial value for each cell of each new run.
            A single value is repeated for every column. A two-element list
            is repeated once per (solver, configuration) pair. Any other
            list is used as the row, so it must match the number of columns.
            Defaults to the missing value (nan).
    """
    # Only substitute the default for None: 0 is a legitimate initial value
    if initial_values is None:
        initial_values = self.missing_value
    # Every (solver, configuration) pair owns two columns, so the row length
    # is set by the number of configurations, not the number of solvers
    if not isinstance(initial_values, list):
        initial_values = [initial_values] * self.num_solver_configurations * 2
    elif len(initial_values) == 2:  # One pair provided
        initial_values = initial_values * self.num_solver_configurations
    instance_names = self.instances if instance_names is None else instance_names
    for objective, instance in itertools.product(self.objective_names,
                                                 instance_names):
        index_runs_start = len(self.loc[(objective, instance)]) + 1
        for run in range(index_runs_start, index_runs_start + num_extra_runs):
            self.loc[(objective, instance, run)] = initial_values
        # Sort the index to optimize lookup speed
        # NOTE: It would be better to do this at the end, but that results in
        # PerformanceWarning: indexing past lexsort depth may impact performance.
        self.sort_index(axis=0, inplace=True)

420

def get_configurations(self: PerformanceDataFrame,
                       solver_name: str) -> list[str]:
    """Return the distinct configuration keys found in a solver's columns."""
    solver_columns = self[solver_name].columns
    configuration_level = solver_columns.get_level_values(
        PerformanceDataFrame.column_configuration)
    return list(configuration_level.unique())

426

def get_full_configuration(self: PerformanceDataFrame,
                           solver: str,
                           configuration_id: str | list[str]) -> dict | list[dict]:
    """Look up the stored configuration(s) behind one or more keys.

    Args:
        solver: The solver the configuration belongs to.
        configuration_id: A single key, or an iterable of keys.

    Returns:
        The configuration for a single key, otherwise a list of
        configurations in the order of the given keys.
    """
    if not isinstance(configuration_id, str):
        return [self.attrs[solver][key] for key in configuration_id]
    return self.attrs[solver][configuration_id]

434

def remove_solver(self: PerformanceDataFrame, solvers: str | list[str]) -> None:
    """Drop one or more solvers, and their configurations, from the Dataframe.

    Args:
        solvers: A solver name or a list of solver names. An empty value is
            a no-op.
    """
    if not solvers:  # An empty list must not lead to adding the nan solver
        return
    if isinstance(solvers, str):
        solvers = [solvers]
    # To make sure objectives / runs are saved when no solvers are present,
    # nan placeholder columns are added before the last solver is dropped
    if self.num_solvers == 1:  # This would preferrably be done after removing
        placeholder = PerformanceDataFrame.missing_value
        for field in PerformanceDataFrame.multi_column_value:
            self[placeholder, placeholder, field] = placeholder
    self.drop(columns=solvers, level=0, axis=1, inplace=True)
    for name in solvers:
        del self.attrs[name]

449

def remove_configuration(self: PerformanceDataFrame,
                         solver: str, configuration: str | list[str]) -> None:
    """Drop one or more configurations of a solver from the Dataframe.

    Args:
        solver: The solver owning the configurations.
        configuration: A configuration key or a list of configuration keys.
    """
    config_keys = ([configuration] if isinstance(configuration, str)
                   else configuration)
    for key in config_keys:
        # Removes the columns of the configuration, then its stored settings
        self.drop((solver, key), axis=1, inplace=True)
        del self.attrs[solver][key]
    # Sort the index to optimize lookup speed
    self.sort_index(axis=1, inplace=True)

460

def remove_objective(self: PerformanceDataFrame,
                     objectives: str | list[str]) -> None:
    """Remove one or more objectives from the Dataframe.

    Args:
        objectives: An objective name or a list of objective names.

    Raises:
        Exception: If the removal would leave the frame without objectives.
    """
    targets = [objectives] if isinstance(objectives, str) else list(objectives)
    remaining = [name for name in self.objective_names if name not in targets]
    # Counting the index labels avoids resolving every objective object, and
    # checking what remains also guards a list that covers all objectives
    if self.num_objectives < 2 or not remaining:
        raise Exception("Cannot remove last objective from PerformanceDataFrame")
    self.drop(objectives,
              axis=0, level=PerformanceDataFrame.index_objective, inplace=True)

468

def remove_instances(self: PerformanceDataFrame, instances: str | list[str]) -> None:
    """Drop one or more instances from the Dataframe.

    Args:
        instances: An instance name or a list of instance names.
    """
    removal_count = len(instances) if isinstance(instances, list) else 1
    # To make sure objectives / runs are saved when no instances are present,
    # nan placeholder rows are added before the last instances are dropped
    if self.num_instances == removal_count:
        placeholder = PerformanceDataFrame.missing_value
        for objective, run in itertools.product(self.objective_names, self.run_ids):
            self.loc[(objective, placeholder, run)] = placeholder
    self.drop(instances,
              axis=0,
              level=PerformanceDataFrame.index_instance,
              inplace=True)
    # Sort the index to optimize lookup speed
    self.sort_index(axis=0, inplace=True)

482

def remove_runs(self: PerformanceDataFrame,
                runs: int | list[int],
                instance_names: list[str] = None) -> None:
    """Drop one or more runs from the Dataframe.

    Args:
        runs: The run indices to be removed. If its an int,
            the last n runs are removed. NOTE: If each instance has a different
            number of runs, the amount of removed runs is not uniform.
        instance_names: The instances for which runs are to be removed.
            By default None, which means runs are removed from all instances.
    """
    if isinstance(runs, int):
        # Translate "the last n runs" into explicit run ids
        runs = list(range((self.num_runs + 1) - runs, self.num_runs + 1))
    if instance_names is None:
        self.drop(runs,
                  axis=0,
                  level=self.index_run,
                  inplace=True)
    else:
        # Restrict the removal to the requested instances
        index = self.index
        selected = (
            index.get_level_values(self.index_instance).isin(instance_names)
            & index.get_level_values(self.index_run).isin(runs))
        if selected.any():
            self.drop(index[selected], inplace=True)
    # Sort the index to optimize lookup speed
    self.sort_index(axis=0, inplace=True)

504

def remove_empty_runs(self: PerformanceDataFrame) -> None:
    """Remove runs that contain no data, except for the first.

    A row is removed when every one of its cells is null and its run id
    (third index level) is not 1.
    """
    if len(self.index) == 0:
        return
    # One vectorised pass instead of a lookup and a drop per row
    is_empty = self.isna().all(axis=1).to_numpy()
    not_first_run = self.index.get_level_values(2).to_numpy() != 1
    removable = self.index[is_empty & not_first_run]
    if len(removable) > 0:
        self.drop(removable, inplace=True)

512

def filter_objective(self: PerformanceDataFrame,
                     objective: str | list[str]) -> None:
    """Reduce the Dataframe to a subset of objectives, in place.

    Args:
        objective: The objective name(s) to keep. All others are dropped.
    """
    keep = {objective} if isinstance(objective, str) else set(objective)
    unwanted = list(set(self.objective_names) - keep)
    self.drop(unwanted,
              axis=0,
              level=PerformanceDataFrame.index_objective,
              inplace=True)

520

def reset_value(self: PerformanceDataFrame,
                solver: str,
                instance: str,
                objective: str = None,
                run: int = None) -> None:
    """Reset a value in the dataframe to the missing value.

    Args:
        solver: The solver for which the value is reset.
        instance: The instance for which the value is reset.
        objective: The objective to reset. If None, all objectives are reset.
        run: The run to reset. If None, all runs are reset.
    """
    # Pass objective and run by keyword: the fourth positional parameter of
    # set_value is the configuration, not the objective
    self.set_value(self.missing_value, solver, instance,
                   objective=objective, run=run)

529

def set_value(self: PerformanceDataFrame,
              value: float | str | list[float | str] | list[list[float | str]],
              solver: str | list[str],
              instance: str | list[str],
              configuration: str = None,
              objective: str | list[str] = None,
              run: int | list[int] = None,
              solver_fields: list[str] = ["Value"],
              append_write_csv: bool = False) -> None:
    """Assign one or more values to cells of the Dataframe.

    The same value can be written to several indices at once: every
    selector that is None selects its whole dimension.

    Args:
        value: Value(s) to be assigned. If value is a list, the first
            dimension pairs with solver_fields, the second dimension holds
            multiple different values. Must be the same shape as target.
        solver: The solver(s) to set. If None, all solvers are set.
        instance: The instance(s) to set. If None, all instances are set.
        configuration: The configuration(s) to set.
            When left None, set for all configurations
        objective: The objective(s) to set.
            When left None, set for all objectives
        run: The run index to set. If left None, set for all runs.
        solver_fields: The Meta column(s) each value is assigned to.
            Defaults to ["Value"].
        append_write_csv: For concurrent writing to the PerformanceDataFrame.
            If True, the affected rows are appended to the CSV file.
            This will create duplicate entries in the file, but these are
            combined when loading the file.
    """
    def select_all_if_none(key: Any) -> Any:
        """Turn a None selector into a slice covering the whole level."""
        return slice(None) if key is None else key

    rows = (select_all_if_none(objective),
            select_all_if_none(instance),
            select_all_if_none(run))
    solver_key = select_all_if_none(solver)
    configuration_key = select_all_if_none(configuration)
    values = value if isinstance(value, list) else [value]
    # NOTE: The fields are looped over, as that allows the same sequence of
    # values to be set to the indices
    for item, field in zip(values, solver_fields):
        self.loc[rows, (solver_key, configuration_key, field)] = item

    if not append_write_csv:
        return
    writeable = self.loc[rows, :]
    if isinstance(writeable, pd.Series):  # Single row, convert to pd.DataFrame
        writeable = self.loc[[rows], :]
    # Append the new rows to the dataframe csv file
    writeable.to_csv(self.csv_filepath, mode="a", header=False)

585

def get_value(self: PerformanceDataFrame,
              solver: str | list[str] = None,
              instance: str | list[str] = None,
              configuration: str = None,
              objective: str = None,
              run: int = None,
              solver_fields: list[str] = ["Value"]
              ) -> float | str | list[Any]:
    """Look up one or more values of the DataFrame.

    Every selector that is None selects its whole dimension.

    Returns:
        The single selected value, or a (possibly nested) list when the
        selection covers more than one cell.
    """
    def select_all_if_none(key: Any) -> Any:
        """Turn a None selector into a slice covering the whole level."""
        return slice(None) if key is None else key

    rows = (select_all_if_none(objective),
            select_all_if_none(instance),
            select_all_if_none(run))
    columns = (select_all_if_none(solver),
               select_all_if_none(configuration),
               select_all_if_none(solver_fields))
    target = self.loc[rows, columns].values
    # Reduce dimensions when relevant: rows holding one value are flattened
    single_column = (len(target) > 0
                     and isinstance(target[0], np.ndarray)
                     and len(target[0]) == 1)
    if single_column:
        target = target.flatten()
    result = target.tolist()
    return result[0] if len(result) == 1 else result

611

def get_instance_num_runs(self: PerformanceDataFrame,
                          instance: str) -> int:
    """Return the number of runs stored for an instance."""
    # Each objective is assumed to have the same index for Instance/Runs,
    # so the first objective is used as the reference
    reference_objective = self.objective_names[0]
    instance_rows = self.loc[(reference_objective, instance)]
    return len(instance_rows.index)

617

618 # Calculables

619

def mean(self: PerformanceDataFrame,
         objective: str = None,
         solver: str = None,
         instance: str = None) -> float:
    """Return the mean value of a slice of the dataframe.

    Args:
        objective: The objective to average. May be None for
            single-objective data.
        solver: If given, only the columns of this solver are used.
        instance: If given, only the rows of this instance are used.

    Returns:
        The mean over the selected cells: first per column, then over the
        column means.
    """
    selection = self.xs(self.verify_objective(objective), level=0)
    if solver is not None:
        selection = selection.xs(solver, axis=1, drop_level=False)
    if instance is not None:
        selection = selection.xs(instance, axis=0, drop_level=False)
    column_means = selection.astype(float).mean()
    if not isinstance(column_means, pd.Series):
        return column_means
    return column_means.mean()

635

def get_job_list(self: PerformanceDataFrame, rerun: bool = False) \
        -> list[tuple[str, str]]:
    """Return the list of performance computation jobs that are to be done.

    If rerun is False (default), only the combinations whose value is None
    or nan are returned, else (True) all combinations are returned.

    Args:
        rerun: Boolean indicating if we want to rerun all jobs

    Returns:
        A list of unique (solver, config, instance, run) tuples, in no
        particular order.
    """
    # Drop the seed as we are looking for nan values, not seeds
    values = self.drop(PerformanceDataFrame.column_seed, axis=1,
                       level=PerformanceDataFrame.column_meta)
    values = values.droplevel(PerformanceDataFrame.column_meta, axis=1)
    if rerun:  # Return all combinations
        # Drop objective, not needed
        values = values.droplevel(PerformanceDataFrame.index_objective, axis=0)
        jobs = [tuple(column) + tuple(index)
                for column, index in itertools.product(values.columns,
                                                       values.index)]
    else:
        jobs = []
        cells = itertools.product(values.columns, values.index)
        for (solver, config), (objective, instance, run) in cells:
            cell = values.loc[(objective, instance, run), (solver, config)]
            is_unset = cell is None or (
                isinstance(cell, (int, float)) and math.isnan(cell))
            if is_unset:
                jobs.append((solver, config, instance, run))
    # Filter duplicates
    return list(set(jobs))

670

def configuration_performance(
        self: PerformanceDataFrame,
        solver: str,
        configuration: str | list[str] = None,
        objective: str | SparkleObjective = None,
        instances: list[str] = None,
        per_instance: bool = False) -> tuple[str, float]:
    """Return the (best) configuration performance for objective over the instances.

    Runs are first aggregated per instance with the objective's run
    aggregator, then instances are aggregated with its instance aggregator.
    The best configuration is the one with the lowest (minimise) or highest
    aggregated value.

    Args:
        solver: The solver whose configurations are evaluated
        configuration: The configuration (id or ids) to evaluate. If empty
            or None, all configurations of the solver are considered.
        objective: The objective for which the best value is determined
        instances: The instances which should be selected for the evaluation
        per_instance: Whether to return the performance per instance,
            or aggregated.

    Returns:
        The (best) configuration id and its aggregated performance, or the
        list of its per-instance performances if per_instance is set.
    """
    objective = self.verify_objective(objective)
    if isinstance(objective, str):
        objective = resolve_objective(objective)
    meta_level = PerformanceDataFrame.column_meta
    # Select the objective and solver, then strip the seeds and the meta
    # level so that only one numeric value column per configuration remains
    scores = (self.xs(objective.name, level=0, drop_level=True)
              .xs(solver, axis=1, drop_level=True)
              .drop(PerformanceDataFrame.column_seed, axis=1, level=meta_level)
              .droplevel(meta_level, axis=1)
              .astype(float))

    if instances:  # Filter instances
        scores = scores.loc[instances, :]
    if configuration:  # Filter configuration
        wanted = (configuration if isinstance(configuration, list)
                  else [configuration])
        scores = scores.filter(wanted, axis=1)
    # Aggregate the runs
    instance_scores = scores.groupby(PerformanceDataFrame.index_instance).agg(
        func=objective.run_aggregator.__name__)
    # Aggregate the instances
    aggregated = instance_scores.agg(func=objective.instance_aggregator.__name__)
    # Select the best configuration
    if objective.minimise:
        best_conf = aggregated.idxmin()
    else:
        best_conf = aggregated.idxmax()
    if per_instance:  # Return a list of instance results
        return best_conf, instance_scores[best_conf].to_list()
    return best_conf, aggregated[best_conf]

721

def best_configuration(self: PerformanceDataFrame,
                       solver: str,
                       objective: SparkleObjective = None,
                       instances: list[str] = None) -> tuple[str, float]:
    """Return the best configuration for the given objective over the instances.

    Shorthand for configuration_performance without a configuration filter.

    Args:
        solver: The solver for which we determine the best configuration
        objective: The objective for which we calculate the best configuration
        instances: The instances which should be selected for the evaluation

    Returns:
        The best configuration id and its aggregated performance.
    """
    return self.configuration_performance(solver,
                                          configuration=None,
                                          objective=objective,
                                          instances=instances)

737

def best_instance_performance(
        self: PerformanceDataFrame,
        objective: str | SparkleObjective = None,
        instances: list[str] = None,
        run_id: int = None,
        exclude_solvers: list[(str, str)] = None) -> pd.Series:
    """Return the best performance for each instance in the portfolio.

    Args:
        objective: The objective for which we calculate the best performance
        instances: The instances which should be selected for the evaluation
        run_id: The run for which we calculate the best performance. If None,
            we consider all runs.
        exclude_solvers: List of (solver, config_id) to exclude in the calculation.

    Returns:
        The best performance for each instance in the portfolio.
    """
    objective = self.verify_objective(objective)
    if isinstance(objective, str):
        objective = resolve_objective(objective)
    # Seeds are not performance values, and the objective level is no
    # longer needed once its rows are selected
    values = self.drop([PerformanceDataFrame.column_seed],
                       axis=1, level=PerformanceDataFrame.column_meta)
    values = values.xs(objective.name, level=0)
    if exclude_solvers is not None:
        values = values.drop(exclude_solvers, axis=1)
    if instances is not None:
        values = values.loc[instances, :]
    if run_id is None:
        # All runs are considered: drop the run level
        values = values.droplevel(level=1)
    else:
        run_id = self.verify_run_id(run_id)
        values = values.xs(run_id, level=1)
    # Ensure the objective is numeric
    values = values.astype(float)
    if objective.minimise:
        best_per_row = values.min(axis=1)
    else:
        best_per_row = values.max(axis=1)
    # Sorting best-first makes first() pick the best run of each instance
    best_per_row = best_per_row.sort_values(ascending=objective.minimise)
    return best_per_row.groupby(best_per_row.index).first().astype(float)

779

def best_performance(
        self: PerformanceDataFrame,
        exclude_solvers: list[tuple[str, str]] = None,
        instances: list[str] = None,
        objective: str | SparkleObjective = None) -> float:
    """Return the overall best performance of the portfolio.

    The best value per instance is obtained from best_instance_performance
    and then reduced to one number with the objective's instance aggregator.

    Args:
        exclude_solvers: List of (solver, config_id) to exclude in the calculation.
            Defaults to None, in which case nothing is excluded.
        instances: The instances which should be selected for the evaluation
            If None, use all instances.
        objective: The objective for which we calculate the best performance

    Returns:
        The aggregated best performance of the portfolio over all instances.
    """
    # NOTE: The default of exclude_solvers is None rather than a shared
    # mutable list; best_instance_performance skips the exclusion for None.
    objective = self.verify_objective(objective)
    if isinstance(objective, str):
        objective = resolve_objective(objective)
    instance_best = self.best_instance_performance(
        objective, instances=instances,
        exclude_solvers=exclude_solvers).to_numpy(dtype=float)
    return objective.instance_aggregator(instance_best)

804

def schedule_performance(
        self: PerformanceDataFrame,
        schedule: dict[str, list[tuple[str, str, int | None]]],
        target_solver: str | tuple[str, str] = None,
        objective: str | SparkleObjective = None) -> list[float]:
    """Return the performance of a selection schedule on the portfolio.

    Args:
        schedule: Compute the best performance according to a selection schedule.
            A schedule is a dictionary of instances, with a schedule per instance,
            consisting of triples of solver, config_id and maximum runtime.
            When the maximum runtime is not None, the values of the scheduled
            solvers are summed until one of them stays below its maximum
            runtime. When it is None, the best value of all scheduled solvers
            is taken (minimum or maximum, depending on the objective).
        target_solver: If not None, store the found values in this solver of the DF.
            Either a solver name, in which case the default configuration is
            used, or a tuple of (solver, config_id). A solver that is not yet
            present is added first.
        objective: The objective for which we calculate the best performance

    Returns:
        A list with the performance of the schedule for each instance,
        in the order of the instances in the dictionary.
    """
    objective = self.verify_objective(objective)
    if isinstance(objective, str):
        objective = resolve_objective(objective)
    select = min if objective.minimise else max
    performances = [0.0] * len(schedule)
    if not isinstance(target_solver, tuple):
        target_conf = PerformanceDataFrame.default_configuration
    else:
        target_solver, target_conf = target_solver
    if target_solver and target_solver not in self.solvers:
        self.add_solver(target_solver)
    for ix, (instance, instance_schedule) in enumerate(schedule.items()):
        for iy, (solver, config, max_runtime) in enumerate(instance_schedule):
            performance = float(self.get_value(
                solver, instance, config, objective.name))
            if max_runtime is not None:  # We are dealing with runtime
                performances[ix] += performance
                if performance < max_runtime:
                    break  # Solver finished in time
            elif iy == 0:  # Quality, first solver sets the initial value
                performances[ix] = performance
            else:  # Quality, we take the best found performance
                performances[ix] = select(performances[ix], performance)
        if target_solver is not None:
            self.set_value(performances[ix], target_solver,
                           instance, target_conf, objective.name)
    return performances

850

def marginal_contribution(
        self: PerformanceDataFrame,
        objective: str | SparkleObjective = None,
        instances: list[str] = None,
        sort: bool = False) -> list[tuple[str, str, float, float]]:
    """Return the marginal contribution of the solver configuration on the instances.

    Args:
        objective: The objective for which we calculate the marginal contribution.
        instances: The instances which should be selected for the evaluation
        sort: Whether to sort the results afterwards

    Returns:
        A list with one tuple per solver configuration, consisting of the
        solver, the configuration id, its marginal contribution and the best
        performance of the portfolio without this solver configuration.
        The marginal contribution is the ratio between the best performance
        without the solver configuration and the best performance with it,
        or 0.0 when both are equal.
    """
    output = []
    objective = self.verify_objective(objective)
    if isinstance(objective, str):
        objective = resolve_objective(objective)
    best_performance = self.best_performance(objective=objective,
                                             instances=instances)
    for solver in self.solvers:
        for config_id in self.get_configurations(solver):
            # By calculating the best performance excluding this Solver,
            # we can determine its relative impact on the portfolio.
            missing_solver_config_best = self.best_performance(
                exclude_solvers=[(solver, config_id)],
                instances=instances,
                objective=objective)
            if missing_solver_config_best == best_performance:
                # No change, no contribution. Checked before dividing, so a
                # best performance of zero does not cause a division by zero
                # for a solver configuration that contributes nothing.
                contribution = 0.0
            else:
                # Now we need to see how much the portfolio's best performance
                # changes without this solver.
                contribution = missing_solver_config_best / best_performance
            output.append((solver, config_id,
                           contribution, missing_solver_config_best))
    if sort:
        output.sort(key=lambda x: x[2], reverse=objective.minimise)
    return output

890

def get_solver_ranking(self: PerformanceDataFrame,
                       objective: str | SparkleObjective = None,
                       instances: list[str] = None,
                       ) -> list[tuple[str, dict, float]]:
    """Rank all solver configurations by their aggregated performance.

    Args:
        objective: The objective to rank on. Passed through verify_objective
            and resolved to a SparkleObjective when it is a string.
        instances: The instances to take into account. If None, all are used.

    Returns:
        A list of tuples, best first, holding the first two column levels
        (solver and configuration) and the aggregated value of that column.
    """
    objective = self.verify_objective(objective)
    if isinstance(objective, str):
        objective = resolve_objective(objective)
    # Remove the seed columns, only the values are ranked
    values = self.drop(
        [PerformanceDataFrame.column_seed],
        axis=1, level=PerformanceDataFrame.column_meta)
    # Keep the rows of the requested objective only
    values: pd.DataFrame = values.loc(axis=0)[objective.name, :, :]
    # The objective (rows) and meta (columns) levels are now redundant
    values = values.droplevel(PerformanceDataFrame.index_objective)
    values = values.droplevel(PerformanceDataFrame.column_meta, axis=1)
    if instances is not None:
        values = values.loc(axis=0)[instances, ]
    # Aggregation requires numeric data
    values = values.astype(float)
    # First reduce the runs of every instance to a single value ...
    per_instance = values.groupby(PerformanceDataFrame.index_instance).agg(
        func=objective.run_aggregator.__name__)
    # ... then reduce the instances to a single value per column
    ranking = per_instance.aggregate(func=objective.instance_aggregator.__name__)
    # Best value first
    ranking = ranking.sort_values(ascending=objective.minimise)
    return [(key[0], key[1], ranking[key]) for key in ranking.index]

920

def save_csv(self: PerformanceDataFrame, csv_filepath: Path = None) -> None:
    """Store the data frame and its configurations in a CSV file.

    The table is written with to_csv, after which one line per solver
    configuration is appended to the same file. These lines start with "$".

    Args:
        csv_filepath: Path of the file to write. Defaults to self.csv_filepath.
    """
    if csv_filepath is None:
        csv_filepath = self.csv_filepath
    self.to_csv(csv_filepath)
    # The configurations are stored below the table in the same file
    with csv_filepath.open("a") as fout:
        fout.write("\n$Solver,configuration_id,Configuration\n")
        for solver in self.solvers:
            for config_id, configuration in self.attrs[solver].items():
                fout.write(f"${solver},{config_id},{configuration!s}\n")

936

def clone(self: PerformanceDataFrame,
          csv_filepath: Path = None) -> PerformanceDataFrame:
    """Build a new PerformanceDataFrame holding the same data as this one.

    The new object is constructed from the solvers, configurations,
    objectives, instances and number of runs of this object, after which
    every cell value is copied over.

    Args:
        csv_filepath: The filepath the copy should use for saving.
            If None, will not be saved.
            Warning: If the original path is used, it could lead to dataloss!

    Returns:
        The newly created copy.
    """
    duplicate = PerformanceDataFrame(
        csv_filepath=csv_filepath,
        solvers=self.solvers,
        configurations=self.configurations,
        objectives=self.objectives,
        instances=self.instances,
        n_runs=self.num_runs)
    # Transfer the values cell by cell, column after column
    for column_index, index in itertools.product(self.columns, self.index):
        duplicate.at[index, column_index] = self.loc[index, column_index]
    # NOTE(review): the original author wondered whether the copy needs to be
    # sorted here; no sorting is performed.
    return duplicate

959

def clean_csv(self: PerformanceDataFrame) -> None:
    """Reset every cell to the missing value and save the result with save_csv."""
    blank = PerformanceDataFrame.missing_value
    self[:] = blank
    self.save_csv()

Coverage for sparkle/structures/performance_dataframe.py: 89%

418 statements