Coverage for sparkle/structures/performance_dataframe.py: 84%

418 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-09-29 10:17 +0000

1"""Module to manage performance data files and common operations on them.""" 

2 

3from __future__ import annotations 

4import ast 

5import copy 

6from typing import Any 

7import itertools 

8from pathlib import Path 

9import math 

10import numpy as np 

11import pandas as pd 

12 

13from sparkle.types import SparkleObjective, resolve_objective 

14 

15 

class PerformanceDataFrame(pd.DataFrame):
    """Class to manage performance data and common operations on them."""

    # Sentinel for cells that have no recorded result yet
    missing_value = math.nan

    # Placeholder objective name used when no objectives are provided
    missing_objective = "UNKNOWN"
    # Configuration id used when a solver has no explicit configurations
    default_configuration = "Default"

    # Row MultiIndex level names, in order: (Objective, Instance, Run)
    index_objective = "Objective"
    index_instance = "Instance"
    index_run = "Run"
    multi_index_names = [index_objective, index_instance, index_run]

    # Column MultiIndex level names, in order: (Solver, Configuration, Meta)
    column_solver = "Solver"
    column_configuration = "Configuration"
    column_meta = "Meta"
    # Meta-level labels: each (solver, configuration) pair carries one
    # "Value" column and one "Seed" column
    column_value = "Value"
    column_seed = "Seed"
    multi_column_names = [column_solver, column_configuration, column_meta]
    multi_column_value = [column_value, column_seed]
    # Dtypes corresponding to multi_column_value (Value is str, Seed is int)
    multi_column_dtypes = [str, int]

37 

def __init__(
    self: PerformanceDataFrame,
    csv_filepath: Path,
    solvers: list[str] = None,
    configurations: dict[str, dict[str, dict]] = None,
    objectives: list[str | SparkleObjective] = None,
    instances: list[str] = None,
    n_runs: int = 1,
) -> None:
    """Initialise a PerformanceDataFrame.

    Consists of:
        - Columns representing the Solvers
        - Rows representing the result by multi-index in order of:
            * Objective (Static, given in constructor or read from file)
            * Instance
            * Runs (Static, given in constructor or read from file)

    Args:
        csv_filepath: If path exists, load from Path.
            Otherwise create new and save to this path.
        solvers: List of solver names to be added into the Dataframe
        configurations: The configuration keys per solver to add, structured as
            configurations[solver][config_key] = {"parameter": "value", ..}
        objectives: List of SparkleObjectives or objective names. By default None,
            then the objectives will be derived from Sparkle Settings if possible.
        instances: List of instance names to be added into the Dataframe
        n_runs: The number of runs to consider per Solver/Objective/Instance comb.
    """
    if csv_filepath and csv_filepath.exists():  # Read from file
        # Lines starting with "$" hold the configuration mapping, not table
        # data, so they are skipped by the csv parser here and parsed below
        df = pd.read_csv(
            csv_filepath,
            header=[0, 1, 2],
            index_col=[0, 1, 2],
            on_bad_lines="skip",
            dtype={"Value": str, "Seed": int},
            comment="$",
        )  # $ For extra data lines
        super().__init__(df)
        self.csv_filepath = csv_filepath
        # Load configuration mapping
        with self.csv_filepath.open() as f:
            configuration_lines = [
                line.strip().strip("$").split(",", maxsplit=2)
                for line in f.readlines()
                if line.startswith("$")
            ]
        configurations = {s: {} for s in self.solvers}
        for solver, config_key, config in configuration_lines[1:]:  # Skip header
            # Configurations are stored as Python dict literals
            configurations[solver][config_key] = ast.literal_eval(config.strip('"'))
    else:  # New PerformanceDataFrame
        # Initialize empty DataFrame
        run_ids = list(range(1, n_runs + 1))  # We count runs from 1
        # We always need objectives to maintain the dimensions
        if objectives is None:
            objectives = [PerformanceDataFrame.missing_objective]
        else:
            objectives = [str(o) for o in objectives]
        # We always need an instance to maintain the dimensions
        if instances is None:
            instances = [PerformanceDataFrame.missing_value]
        # We always need a solver to maintain the dimensions
        if solvers is None:
            solvers = [PerformanceDataFrame.missing_value]
        midx = pd.MultiIndex.from_product(
            [objectives, instances, run_ids],
            names=PerformanceDataFrame.multi_index_names,
        )
        # Create the multi index tuples
        if configurations is None:
            configurations = {
                solver: {PerformanceDataFrame.default_configuration: {}}
                for solver in solvers
            }
        column_tuples = []
        # We cannot do .from_product here as config ids are per solver
        for solver in configurations.keys():
            for config_id in configurations[solver].keys():
                column_tuples.extend(
                    [
                        (solver, config_id, PerformanceDataFrame.column_seed),
                        (solver, config_id, PerformanceDataFrame.column_value),
                    ]
                )
        mcolumns = pd.MultiIndex.from_tuples(
            column_tuples,
            names=[
                PerformanceDataFrame.column_solver,
                PerformanceDataFrame.column_configuration,
                PerformanceDataFrame.column_meta,
            ],
        )
        # Set dtype object to avoid inferring float for categorical objectives
        super().__init__(
            PerformanceDataFrame.missing_value,
            index=midx,
            columns=mcolumns,
            dtype="object",
        )
        self.csv_filepath = csv_filepath

    # Store configuration in global attributes dictionary, see Pandas Docs
    self.attrs = configurations

    # Duplicate rows can occur through append_write_csv in set_value;
    # the newest (last) entry wins
    if self.index.duplicated().any():  # Drop all duplicates except for last
        # NOTE: This is rather convoluted (but fast!) due to the fact we need
        # to do it inplace to maintain our type (PerformanceDataFrame)
        # Make the index levels into columns (in-place)
        self.reset_index(inplace=True)
        # The first nlevels columns are the index columns created by
        # reset_index, drop duplicates in those columns
        idx_cols = self.columns[
            : len(PerformanceDataFrame.multi_index_names)
        ].tolist()
        self.drop_duplicates(
            subset=idx_cols, keep="last", inplace=True
        )  # Drop duplicates
        self.set_index(idx_cols, inplace=True)  # Restore the MultiIndex (in-place)
        self.index.rename(
            self.multi_index_names, inplace=True
        )  # Restore level names
    # Sort the index to optimize lookup speed
    self.sort_index(axis=0, inplace=True)
    self.sort_index(axis=1, inplace=True)

    if csv_filepath and not self.csv_filepath.exists():  # New Performance DataFrame
        self.save_csv()

164 

165 # Properties 

166 

167 @property 

168 def num_objectives(self: PerformanceDataFrame) -> int: 

169 """Retrieve the number of objectives in the DataFrame.""" 

170 return self.index.get_level_values(0).unique().size 

171 

172 @property 

173 def num_instances(self: PerformanceDataFrame) -> int: 

174 """Return the number of instances.""" 

175 return self.index.get_level_values(1).unique().size 

176 

177 @property 

178 def num_runs(self: PerformanceDataFrame) -> int: 

179 """Return the maximum number of runs of each instance.""" 

180 return self.index.get_level_values(2).unique().size 

181 

182 @property 

183 def num_solvers(self: PerformanceDataFrame) -> int: 

184 """Return the number of solvers.""" 

185 return self.columns.get_level_values(0).unique().size 

186 

187 @property 

188 def num_solver_configurations(self: PerformanceDataFrame) -> int: 

189 """Return the number of solver configurations.""" 

190 return int( 

191 self.columns.get_level_values( # Config has a seed & value 

192 PerformanceDataFrame.column_configuration 

193 ).size 

194 / 2 

195 ) 

196 

197 @property 

198 def multi_objective(self: PerformanceDataFrame) -> bool: 

199 """Return whether the dataframe represent MO or not.""" 

200 return self.num_objectives > 1 

201 

202 @property 

203 def solvers(self: PerformanceDataFrame) -> list[str]: 

204 """Return the solver present as a list of strings.""" 

205 # Do not return the nan solver as its not an actual solver 

206 return ( 

207 self.columns.get_level_values(PerformanceDataFrame.column_solver) 

208 .dropna() 

209 .unique() 

210 .to_list() 

211 ) 

212 

213 @property 

214 def configuration_ids(self: PerformanceDataFrame) -> list[str]: 

215 """Return the list of configuration keys.""" 

216 return ( 

217 self.columns.get_level_values(PerformanceDataFrame.column_configuration) 

218 .unique() 

219 .to_list() 

220 ) 

221 

222 @property 

223 def configurations(self: PerformanceDataFrame) -> dict[str, dict[str, dict]]: 

224 """Return a dictionary (copy) containing the configurations for each solver.""" 

225 return copy.deepcopy(self.attrs) # Deepcopy to avoid mutation of attribute 

226 

227 @property 

228 def objective_names(self: PerformanceDataFrame) -> list[str]: 

229 """Return the objective names as a list of strings.""" 

230 return self.index.get_level_values(0).unique().to_list() 

231 

@property
def objectives(self: PerformanceDataFrame) -> list[SparkleObjective]:
    """Resolve each objective name into a SparkleObjective instance."""
    return list(map(resolve_objective, self.objective_names))

236 

@property
def instances(self: PerformanceDataFrame) -> list[str]:
    """Return the instance names as a list of strings."""
    # NOTE: despite the old docstring, this returns a list, not a pd.Index
    return self.index.get_level_values(1).unique().to_list()

241 

242 @property 

243 def run_ids(self: PerformanceDataFrame) -> list[int]: 

244 """Return the run ids as a list of integers.""" 

245 return self.index.get_level_values(2).unique().to_list() 

246 

247 @property 

248 def has_missing_values(self: PerformanceDataFrame) -> bool: 

249 """Returns True if there are any missing values in the dataframe.""" 

250 return ( 

251 self.drop( 

252 PerformanceDataFrame.column_seed, 

253 level=PerformanceDataFrame.column_meta, 

254 axis=1, 

255 ) 

256 .isnull() 

257 .any() 

258 .any() 

259 ) 

260 

def is_missing(
    self: PerformanceDataFrame,
    solver: str,
    instance: str,
) -> bool:
    """Checks if a solver/instance combination is missing any Value.

    Seed columns are excluded from the check, as only Value cells count
    as performance data.

    Args:
        solver: Name of the solver (first column level).
        instance: Name of the instance (second row index level).

    Returns:
        True if any Value cell for this solver/instance is null.
    """
    return (
        self.xs(solver, axis=1)
        .xs(instance, axis=0, level=PerformanceDataFrame.index_instance)
        .drop(
            PerformanceDataFrame.column_seed,
            level=PerformanceDataFrame.column_meta,
            axis=1,
        )
        .isnull()
        .any()
        .any()
    )

279 

280 def verify_objective(self: PerformanceDataFrame, objective: str) -> str: 

281 """Method to check whether the specified objective is valid. 

282 

283 Users are allowed to index the dataframe without specifying all dimensions. 

284 However, when dealing with multiple objectives this is not allowed and this 

285 is verified here. If we have only one objective this is returned. Otherwise, 

286 if an objective is specified by the user this is returned. 

287 

288 Args: 

289 objective: The objective given by the user 

290 """ 

291 if objective is None: 

292 if self.multi_objective: 

293 raise ValueError("Error: MO Data, but objective not specified.") 

294 elif self.num_objectives == 1: 

295 return self.objective_names[0] 

296 else: 

297 return PerformanceDataFrame.missing_objective 

298 return objective 

299 

300 def verify_run_id(self: PerformanceDataFrame, run_id: int) -> int: 

301 """Method to check whether run id is valid. 

302 

303 Similar to verify_objective but here we check the dimensionality of runs. 

304 

305 Args: 

306 run_id: the run as specified by the user. 

307 """ 

308 if run_id is None: 

309 if self.num_runs > 1: 

310 raise ValueError( 

311 "Error: Multiple run performance data, but run not specified" 

312 ) 

313 else: 

314 run_id = self.run_ids[0] 

315 return run_id 

316 

317 def verify_indexing( 

318 self: PerformanceDataFrame, objective: str, run_id: int 

319 ) -> tuple[str, int]: 

320 """Method to check whether data indexing is correct. 

321 

322 Users are allowed to use the Performance Dataframe without the second and 

323 fourth dimension (Objective and Run respectively) in the case they only 

324 have one objective or only do one run. This method adjusts the indexing for 

325 those cases accordingly. 

326 

327 Args: 

328 objective: The given objective name 

329 run_id: The given run index 

330 

331 Returns: 

332 A tuple representing the (possibly adjusted) Objective and Run index. 

333 """ 

334 objective = self.verify_objective(objective) 

335 run_id = self.verify_run_id(run_id) 

336 return objective, run_id 

337 

338 # Getters and Setters 

339 

def add_solver(
    self: PerformanceDataFrame,
    solver_name: str,
    configurations: list[tuple[str, dict]] = None,
    initial_value: float | list[str | float] = None,
) -> None:
    """Add a new solver to the dataframe. Initializes value to None by default.

    Args:
        solver_name: The name of the solver to be added.
        configurations: A list of (configuration key, configuration dict)
            pairs for the solver. Defaults to one empty "Default" config.
        initial_value: The value assigned for each index of the new solver.
            If not None, must match the index dimension (n_obj * n_inst * n_runs).
    """
    if solver_name in self.solvers:
        print(
            f"WARNING: Tried adding already existing solver {solver_name} to "
            f"Performance DataFrame: {self.csv_filepath}"
        )
        return
    if not isinstance(initial_value, list):  # Single value
        # Wrap into one [value, seed] pair; the scalar is used for both
        initial_value = [[initial_value, initial_value]]
    if configurations is None:
        configurations = [(PerformanceDataFrame.default_configuration, {})]
    self.attrs[solver_name] = {}
    # NOTE(review): each element of initial_value is assumed to be a
    # (value, seed) pair that is unpacked below — confirm against callers
    for (config_key, config), (value, seed) in itertools.product(
        configurations, initial_value
    ):
        self[(solver_name, config_key, PerformanceDataFrame.column_seed)] = seed
        self[(solver_name, config_key, PerformanceDataFrame.column_value)] = value
        self.attrs[solver_name][config_key] = config
    if self.num_solvers == 2:  # Remove nan solver
        # The nan solver only exists as a placeholder to keep the frame's
        # dimensions when no real solver is present
        for solver in self.solvers:
            if str(solver) == str(PerformanceDataFrame.missing_value):
                self.remove_solver(solver)
                break

376 

def add_configuration(
    self: PerformanceDataFrame,
    solver: str,
    configuration_id: str | list[str],
    configuration: dict[str, Any] | list[dict[str, Any]] = None,
) -> None:
    """Add new configurations for a solver to the dataframe.

    If the key already exists, update the value.

    Args:
        solver: The name of the solver to be added.
        configuration_id: The name of the configuration to be added.
        configuration: The configuration to be added.
    """
    if not isinstance(configuration_id, list):
        configuration_id = [configuration_id]
    if not isinstance(configuration, list):
        configuration = [configuration]
    # NOTE(review): zip truncates to the shorter argument; a list of ids with
    # a single (non-list) configuration only processes the first id — confirm
    for config_id, config in zip(configuration_id, configuration):
        if config_id not in self.get_configurations(solver):
            # New key: create empty Value/Seed columns for it
            self[(solver, config_id, PerformanceDataFrame.column_value)] = None
            self[(solver, config_id, PerformanceDataFrame.column_seed)] = None
        self.attrs[solver][config_id] = config
    # Sort the index to optimize lookup speed
    self.sort_index(axis=1, inplace=True)

403 

404 def add_objective( 

405 self: PerformanceDataFrame, objective_name: str, initial_value: float = None 

406 ) -> None: 

407 """Add an objective to the DataFrame.""" 

408 initial_value = initial_value or self.missing_value 

409 if objective_name in self.objective_names: 

410 print( 

411 f"WARNING: Tried adding already existing objective {objective_name} " 

412 f"to Performance DataFrame: {self.csv_filepath}" 

413 ) 

414 return 

415 for instance, run in itertools.product(self.instances, self.run_ids): 

416 self.loc[(objective_name, instance, run)] = initial_value 

417 self.sort_index(axis=0, inplace=True) 

418 

419 def add_instance( 

420 self: PerformanceDataFrame, 

421 instance_name: str, 

422 initial_values: Any | list[Any] = None, 

423 ) -> None: 

424 """Add and instance to the DataFrame. 

425 

426 Args: 

427 instance_name: The name of the instance to be added. 

428 initial_values: The values assigned for each index of the new instance. 

429 If list, must match the column dimension (Value, Seed, Configuration). 

430 """ 

431 initial_values = initial_values or self.missing_value 

432 if not isinstance(initial_values, list): 

433 initial_values = ( 

434 [initial_values] 

435 * 2 # Value and Seed per target column 

436 * self.num_solver_configurations 

437 ) 

438 elif len(initial_values) == len(PerformanceDataFrame.multi_column_names): 

439 initial_values = initial_values * self.num_solvers 

440 

441 if instance_name in self.instances: 

442 print( 

443 f"WARNING: Tried adding already existing instance {instance_name} " 

444 f"to Performance DataFrame: {self.csv_filepath}" 

445 ) 

446 return 

447 # Add rows for all combinations 

448 for objective, run in itertools.product(self.objective_names, self.run_ids): 

449 self.loc[(objective, instance_name, run)] = initial_values 

450 if self.num_instances == 2: # Remove nan instance 

451 for instance in self.instances: 

452 if not isinstance(instance, str) and math.isnan(instance): 

453 self.remove_instances(instance) 

454 break 

455 # Sort the index to optimize lookup speed 

456 self.sort_index(axis=0, inplace=True) 

457 

458 def add_runs( 

459 self: PerformanceDataFrame, 

460 num_extra_runs: int, 

461 instance_names: list[str] = None, 

462 initial_values: Any | list[Any] = None, 

463 ) -> None: 

464 """Add runs to the DataFrame. 

465 

466 Args: 

467 num_extra_runs: The number of runs to be added. 

468 instance_names: The instances for which runs are to be added. 

469 By default None, which means runs are added to all instances. 

470 initial_values: The initial value for each objective of each new run. 

471 If a list, needs to have a value for Value, Seed and Configuration. 

472 """ 

473 initial_values = initial_values or self.missing_value 

474 if not isinstance(initial_values, list): 

475 initial_values = [initial_values] * self.num_solvers * 2 # Value and Seed 

476 elif len(initial_values) == 2: # Value and seed provided 

477 initial_values = initial_values * self.num_solvers 

478 instance_names = self.instances if instance_names is None else instance_names 

479 for objective, instance in itertools.product( 

480 self.objective_names, instance_names 

481 ): 

482 index_runs_start = len(self.loc[(objective, instance)]) + 1 

483 for run in range(index_runs_start, index_runs_start + num_extra_runs): 

484 self.loc[(objective, instance, run)] = initial_values 

485 # Sort the index to optimize lookup speed 

486 # NOTE: It would be better to do this at the end, but that results in 

487 # PerformanceWarning: indexing past lexsort depth may impact performance. 

488 self.sort_index(axis=0, inplace=True) 

489 

490 def get_configurations(self: PerformanceDataFrame, solver_name: str) -> list[str]: 

491 """Return the list of configuration keys for a solver.""" 

492 return list( 

493 self[solver_name] 

494 .columns.get_level_values(PerformanceDataFrame.column_configuration) 

495 .unique() 

496 ) 

497 

498 def get_full_configuration( 

499 self: PerformanceDataFrame, solver: str, configuration_id: str | list[str] 

500 ) -> dict | list[dict]: 

501 """Return the actual configuration associated with the configuration key.""" 

502 if isinstance(configuration_id, str): 

503 return self.attrs[solver][configuration_id] 

504 return [self.attrs[solver][cid] for cid in configuration_id] 

505 

def remove_solver(self: PerformanceDataFrame, solvers: str | list[str]) -> None:
    """Drop one or more solvers from the Dataframe.

    When the last solver is removed, a nan placeholder column is inserted
    first so the objective/run index dimensions are preserved.

    Args:
        solvers: A single solver name or a list of solver names to drop.
    """
    if not solvers:  # Bugfix for when an empty list is passed to avoid nan adding
        return
    # To make sure objectives / runs are saved when no solvers are present
    solvers = [solvers] if isinstance(solvers, str) else solvers
    # NOTE(review): this checks the count before dropping, so it only
    # triggers when exactly one solver exists — confirm multi-drop behaviour
    if self.num_solvers == 1:  # This would preferrably be done after removing
        for field in PerformanceDataFrame.multi_column_value:
            self[
                PerformanceDataFrame.missing_value,
                PerformanceDataFrame.missing_value,
                field,
            ] = PerformanceDataFrame.missing_value
    self.drop(columns=solvers, level=0, axis=1, inplace=True)
    # Keep the configuration mapping in sync with the columns
    for solver in solvers:
        del self.attrs[solver]

522 

523 def remove_configuration( 

524 self: PerformanceDataFrame, solver: str, configuration: str | list[str] 

525 ) -> None: 

526 """Drop one or more configurations from the Dataframe.""" 

527 if isinstance(configuration, str): 

528 configuration = [configuration] 

529 for config in configuration: 

530 self.drop((solver, config), axis=1, inplace=True) 

531 del self.attrs[solver][config] 

532 # Sort the index to optimize lookup speed 

533 self.sort_index(axis=1, inplace=True) 

534 

def remove_objective(
    self: PerformanceDataFrame, objectives: str | list[str]
) -> None:
    """Remove one or more objectives from the Dataframe.

    Raises:
        Exception: When this would remove the last remaining objective.
    """
    if len(self.objectives) < 2:
        raise Exception("Cannot remove last objective from PerformanceDataFrame")
    self.drop(
        objectives,
        level=PerformanceDataFrame.index_objective,
        axis=0,
        inplace=True,
    )

547 

def remove_instances(self: PerformanceDataFrame, instances: str | list[str]) -> None:
    """Drop instances from the Dataframe.

    When the removal would leave no instances, a nan placeholder instance is
    inserted first so the objective/run index dimensions are preserved.

    Args:
        instances: A single instance name or a list of instance names to drop.
    """
    # To make sure objectives / runs are saved when no instances are present
    num_instances = len(instances) if isinstance(instances, list) else 1
    if self.num_instances - num_instances == 0:
        for objective, run in itertools.product(self.objective_names, self.run_ids):
            self.loc[(objective, PerformanceDataFrame.missing_value, run)] = (
                PerformanceDataFrame.missing_value
            )
    self.drop(
        instances, axis=0, level=PerformanceDataFrame.index_instance, inplace=True
    )
    # Sort the index to optimize lookup speed
    self.sort_index(axis=0, inplace=True)

562 

563 def remove_runs( 

564 self: PerformanceDataFrame, 

565 runs: int | list[int], 

566 instance_names: list[str] = None, 

567 ) -> None: 

568 """Drop one or more runs from the Dataframe. 

569 

570 Args: 

571 runs: The run indices to be removed. If its an int, 

572 the last n runs are removed. NOTE: If each instance has a different 

573 number of runs, the amount of removed runs is not uniform. 

574 instance_names: The instances for which runs are to be removed. 

575 By default None, which means runs are removed from all instances. 

576 """ 

577 instance_names = self.instances if instance_names is None else instance_names 

578 runs = ( 

579 list(range((self.num_runs + 1) - runs, (self.num_runs + 1))) 

580 if isinstance(runs, int) 

581 else runs 

582 ) 

583 self.drop(runs, axis=0, level=PerformanceDataFrame.index_run, inplace=True) 

584 # Sort the index to optimize lookup speed 

585 self.sort_index(axis=0, inplace=True) 

586 

587 def remove_empty_runs(self: PerformanceDataFrame) -> None: 

588 """Remove runs that contain no data, except for the first.""" 

589 for row_index in self.index: 

590 if row_index[2] == 1: # First run, never delete 

591 continue 

592 if self.loc[row_index].isna().all(): 

593 self.drop(row_index, inplace=True) 

594 

595 def filter_objective(self: PerformanceDataFrame, objective: str | list[str]) -> None: 

596 """Filter the Dataframe to a subset of objectives.""" 

597 if isinstance(objective, str): 

598 objective = [objective] 

599 self.drop( 

600 list(set(self.objective_names) - set(objective)), 

601 axis=0, 

602 level=PerformanceDataFrame.index_objective, 

603 inplace=True, 

604 ) 

605 

606 def reset_value( 

607 self: PerformanceDataFrame, 

608 solver: str, 

609 instance: str, 

610 objective: str = None, 

611 run: int = None, 

612 ) -> None: 

613 """Reset a value in the dataframe.""" 

614 self.set_value( 

615 PerformanceDataFrame.missing_value, solver, instance, objective, run 

616 ) 

617 

def set_value(
    self: PerformanceDataFrame,
    value: float | str | list[float | str] | list[list[float | str]],
    solver: str | list[str],
    instance: str | list[str],
    configuration: str = None,
    objective: str | list[str] = None,
    run: int | list[int] = None,
    solver_fields: list[str] = ["Value"],
    append_write_csv: bool = False,
) -> None:
    """Setter method to assign a value to the Dataframe.

    Allows for setting the same value to multiple indices.

    Args:
        value: Value(s) to be assigned. If value is a list, first dimension is
            the solver field, second dimension is if multiple different values are
            to be assigned. Must be the same shape as target.
        solver: The solver(s) for which the value should be set.
            If solver is a list, multiple solvers are set. If None, all
            solvers are set.
        instance: The instance(s) for which the value should be set.
            If instance is a list, multiple instances are set. If None, all
            instances are set.
        configuration: The configuration(s) for which the value should be set.
            When left None, set for all configurations
        objective: The objectives for which the value should be set.
            When left None, set for all objectives
        run: The run index for which the value should be set.
            If left None, set for all runs.
        solver_fields: The level to which each value should be assigned.
            Defaults to ["Value"]. NOTE: mutable default, but it is only
            read here, never mutated.
        append_write_csv: For concurrent writing to the PerformanceDataFrame.
            If True, the value is directly appended to the CSV file.
            This will create duplicate entries in the file, but these are combined
            when loading the file.
    """
    # Convert indices to slices for None values
    # (slice(None) selects everything on that level)
    solver = slice(solver) if solver is None else solver
    configuration = slice(configuration) if configuration is None else configuration
    instance = slice(instance) if instance is None else instance
    objective = slice(objective) if objective is None else objective
    run = slice(run) if run is None else run
    # Convert column indices to slices for setting multiple columns
    value = [value] if not isinstance(value, list) else value
    # NOTE: We currently forloop levels here, as it allows us to set the same
    # sequence of values to the indices
    for item, level in zip(value, solver_fields):
        self.loc[(objective, instance, run), (solver, configuration, level)] = item

    if append_write_csv:
        writeable = self.loc[(objective, instance, run), :]
        if isinstance(writeable, pd.Series):  # Single row, convert to pd.DataFrame
            writeable = self.loc[[(objective, instance, run)], :]
        # Append the new rows to the dataframe csv file; duplicates are
        # resolved (keep last) when the file is loaded again in __init__
        writeable.to_csv(self.csv_filepath, mode="a", header=False)

675 

def get_value(
    self: PerformanceDataFrame,
    solver: str | list[str] = None,
    instance: str | list[str] = None,
    configuration: str = None,
    objective: str = None,
    run: int = None,
    solver_fields: list[str] = ["Value"],
) -> float | str | list[Any]:
    """Index a value of the DataFrame and return it.

    Args:
        solver: Solver(s) to select; None selects all.
        instance: Instance(s) to select; None selects all.
        configuration: Configuration(s) to select; None selects all.
        objective: Objective(s) to select; None selects all.
        run: Run(s) to select; None selects all.
        solver_fields: Meta columns to read ("Value"/"Seed"); None selects all.
            NOTE: mutable default, but it is only read here, never mutated.

    Returns:
        The single value if exactly one cell matched, otherwise a list.
    """
    # Convert indices to slices for None values
    # (slice(None) selects everything on that level)
    solver = slice(solver) if solver is None else solver
    configuration = slice(configuration) if configuration is None else configuration
    instance = slice(instance) if instance is None else instance
    objective = slice(objective) if objective is None else objective
    solver_fields = slice(solver_fields) if solver_fields is None else solver_fields
    run = slice(run) if run is None else run
    target = self.loc[
        (objective, instance, run), (solver, configuration, solver_fields)
    ].values
    # Reduce dimensions when relevant
    if len(target) > 0 and isinstance(target[0], np.ndarray) and len(target[0]) == 1:
        target = target.flatten()
    target = target.tolist()
    if len(target) == 1:
        return target[0]
    return target

703 

704 def get_instance_num_runs(self: PerformanceDataFrame, instance: str) -> int: 

705 """Return the number of runs for an instance.""" 

706 # We assume each objective has the same index for Instance/Runs 

707 return len(self.loc[(self.objective_names[0], instance)].index) 

708 

709 # Calculables 

710 

def mean(
    self: PerformanceDataFrame,
    objective: str = None,
    solver: str = None,
    instance: str = None,
) -> float:
    """Return the mean value of a slice of the dataframe.

    Args:
        objective: Objective to average over; inferred when None and
            only one objective exists.
        solver: Restrict to one solver; None means all solvers.
        instance: Restrict to one instance; None means all instances.

    Returns:
        The mean as a float.
    """
    objective = self.verify_objective(objective)
    subset = self.xs(objective, level=0)
    if solver is not None:
        subset = subset.xs(solver, axis=1, drop_level=False)
    if instance is not None:
        subset = subset.xs(instance, axis=0, drop_level=False)
    # Cast to float: cells are stored as object/str
    value = subset.astype(float).mean()
    if isinstance(value, pd.Series):
        # NOTE(review): this is the mean of per-column means, which differs
        # from the global mean when columns have unequal NaN counts — confirm
        return value.mean()
    return value

728 

def get_job_list(
    self: PerformanceDataFrame, rerun: bool = False
) -> list[tuple[str, str]]:
    """Return a list of performance computation jobs there are to be done.

    Get a list of tuple[instance, solver] to run from the performance data.
    If rerun is False (default), get only the tuples that don't have a
    value, else (True) get all the tuples.

    Args:
        rerun: Boolean indicating if we want to rerun all jobs

    Returns:
        A tuple of (solver, config, instance, run) combinations
    """
    # Drop the seed as we are looking for nan values, not seeds
    df = self.drop(
        PerformanceDataFrame.column_seed,
        axis=1,
        level=PerformanceDataFrame.column_meta,
    )
    df = df.droplevel(PerformanceDataFrame.column_meta, axis=1)
    if rerun:  # Return all combinations
        # Drop objective, not needed
        df = df.droplevel(PerformanceDataFrame.index_objective, axis=0)
        result = [
            tuple(column) + tuple(index)
            for column, index in itertools.product(df.columns, df.index)
        ]
    else:
        result = []
        for (solver, config), (objective, instance, run) in itertools.product(
            df.columns, df.index
        ):
            value = df.loc[(objective, instance, run), (solver, config)]
            # Both None and NaN mark a cell without a recorded result
            if value is None or (
                isinstance(value, (int, float)) and math.isnan(value)
            ):
                result.append(tuple([solver, config, instance, run]))
    # Filter duplicates while keeping the order consistent
    result = list(dict.fromkeys(result))
    return result

771 

def configuration_performance(
    self: PerformanceDataFrame,
    solver: str,
    configuration: str | list[str] = None,
    objective: str | SparkleObjective = None,
    instances: list[str] = None,
    per_instance: bool = False,
) -> tuple[str, float]:
    """Return the (best) configuration performance for objective over the instances.

    Args:
        solver: The solver for which we evaluate the configuration(s)
        configuration: The configuration (id) to evaluate.
            None evaluates all configurations of the solver.
        objective: The objective for which we find the best value
        instances: The instances which should be selected for the evaluation
        per_instance: Whether to return the performance per instance,
            or aggregated.

    Returns:
        The (best) configuration id and its aggregated performance
        (or list of per-instance values when per_instance is True).
    """
    objective = self.verify_objective(objective)
    if isinstance(objective, str):
        objective = resolve_objective(objective)
    # Filter objective
    subdf = self.xs(objective.name, level=0, drop_level=True)
    # Filter solver
    subdf = subdf.xs(solver, axis=1, drop_level=True)
    # Drop the seed, then drop meta level as it is no longer needed
    subdf = subdf.drop(
        PerformanceDataFrame.column_seed,
        axis=1,
        level=PerformanceDataFrame.column_meta,
    )
    subdf = subdf.droplevel(PerformanceDataFrame.column_meta, axis=1)
    # Ensure the objective is numeric
    subdf = subdf.astype(float)

    if instances:  # Filter instances
        subdf = subdf.loc[instances, :]
    if configuration:  # Filter configuration
        if not isinstance(configuration, list):
            configuration = [configuration]
        subdf = subdf.filter(configuration, axis=1)
    # Aggregate the runs (e.g. mean/min per instance, objective-defined)
    subdf = subdf.groupby(PerformanceDataFrame.index_instance).agg(
        func=objective.run_aggregator.__name__
    )
    # Aggregate the instances (objective-defined, e.g. mean)
    sub_series = subdf.agg(func=objective.instance_aggregator.__name__)
    # Select the best configuration according to the objective's direction
    best_conf = sub_series.idxmin() if objective.minimise else sub_series.idxmax()
    if per_instance:  # Return a list of instance results
        return best_conf, subdf[best_conf].to_list()
    return best_conf, sub_series[best_conf]

827 

828 def best_configuration( 

829 self: PerformanceDataFrame, 

830 solver: str, 

831 objective: SparkleObjective = None, 

832 instances: list[str] = None, 

833 ) -> tuple[str, float]: 

834 """Return the best configuration for the given objective over the instances. 

835 

836 Args: 

837 solver: The solver for which we determine the best configuration 

838 objective: The objective for which we calculate the best configuration 

839 instances: The instances which should be selected for the evaluation 

840 

841 Returns: 

842 The best configuration id and its aggregated performance. 

843 """ 

844 return self.configuration_performance(solver, None, objective, instances) 

845 

846 def best_instance_performance( 

847 self: PerformanceDataFrame, 

848 objective: str | SparkleObjective = None, 

849 instances: list[str] = None, 

850 run_id: int = None, 

851 exclude_solvers: list[(str, str)] = None, 

852 ) -> pd.Series: 

853 """Return the best performance for each instance in the portfolio. 

854 

855 Args: 

856 objective: The objective for which we calculate the best performance 

857 instances: The instances which should be selected for the evaluation 

858 run_id: The run for which we calculate the best performance. If None, 

859 we consider all runs. 

860 exclude_solvers: List of (solver, config_id) to exclude in the calculation. 

861 

862 Returns: 

863 The best performance for each instance in the portfolio. 

864 """ 

865 objective = self.verify_objective(objective) 

866 if isinstance(objective, str): 

867 objective = resolve_objective(objective) 

868 subdf = self.drop( # Drop Seed, not needed 

869 [PerformanceDataFrame.column_seed], 

870 axis=1, 

871 level=PerformanceDataFrame.column_meta, 

872 ) 

873 subdf = subdf.xs(objective.name, level=0) # Drop objective 

874 if exclude_solvers is not None: 

875 subdf = subdf.drop(exclude_solvers, axis=1) 

876 if instances is not None: 

877 subdf = subdf.loc[instances, :] 

878 if run_id is not None: 

879 run_id = self.verify_run_id(run_id) 

880 subdf = subdf.xs(run_id, level=1) 

881 else: 

882 # Drop the run level 

883 subdf = subdf.droplevel(level=1) 

884 # Ensure the objective is numeric 

885 subdf = subdf.astype(float) 

886 series = subdf.min(axis=1) if objective.minimise else subdf.max(axis=1) 

887 # Ensure we always return the best for each run 

888 series = series.sort_values(ascending=objective.minimise) 

889 return series.groupby(series.index).first().astype(float) 

890 

891 def best_performance( 

892 self: PerformanceDataFrame, 

893 exclude_solvers: list[(str, str)] = [], 

894 instances: list[str] = None, 

895 objective: str | SparkleObjective = None, 

896 ) -> float: 

897 """Return the overall best performance of the portfolio. 

898 

899 Args: 

900 exclude_solvers: List of (solver, config_id) to exclude in the calculation. 

901 Defaults to none. 

902 instances: The instances which should be selected for the evaluation 

903 If None, use all instances. 

904 objective: The objective for which we calculate the best performance 

905 

906 Returns: 

907 The aggregated best performance of the portfolio over all instances. 

908 """ 

909 objective = self.verify_objective(objective) 

910 if isinstance(objective, str): 

911 objective = resolve_objective(objective) 

912 instance_best = self.best_instance_performance( 

913 objective, instances=instances, exclude_solvers=exclude_solvers 

914 ).to_numpy(dtype=float) 

915 return objective.instance_aggregator(instance_best) 

916 

917 def schedule_performance( 

918 self: PerformanceDataFrame, 

919 schedule: dict[str : dict[str : (str, str, int)]], 

920 target_solver: str | tuple[str, str] = None, 

921 objective: str | SparkleObjective = None, 

922 ) -> float: 

923 """Return the performance of a selection schedule on the portfolio. 

924 

925 Args: 

926 schedule: Compute the best performance according to a selection schedule. 

927 A schedule is a dictionary of instances, with a schedule per instance, 

928 consisting of a triple of solver, config_id and maximum runtime. 

929 target_solver: If not None, store the found values in this solver of the DF. 

930 objective: The objective for which we calculate the best performance 

931 

932 Returns: 

933 The performance of the schedule over the instances in the dictionary. 

934 """ 

935 objective = self.verify_objective(objective) 

936 if isinstance(objective, str): 

937 objective = resolve_objective(objective) 

938 select = min if objective.minimise else max 

939 performances = [0.0] * len(schedule.keys()) 

940 if not isinstance(target_solver, tuple): 

941 target_conf = PerformanceDataFrame.default_configuration 

942 else: 

943 target_solver, target_conf = target_solver 

944 if target_solver and target_solver not in self.solvers: 

945 self.add_solver(target_solver) 

946 for ix, instance in enumerate(schedule.keys()): 

947 for iy, (solver, config, max_runtime) in enumerate(schedule[instance]): 

948 performance = float( 

949 self.get_value(solver, instance, config, objective.name) 

950 ) 

951 if max_runtime is not None: # We are dealing with runtime 

952 performances[ix] += performance 

953 if performance < max_runtime: 

954 break # Solver finished in time 

955 else: # Quality, we take the best found performance 

956 if iy == 0: # First solver, set initial value 

957 performances[ix] = performance 

958 continue 

959 performances[ix] = select(performances[ix], performance) 

960 if target_solver is not None: 

961 self.set_value( 

962 performances[ix], 

963 target_solver, 

964 instance, 

965 target_conf, 

966 objective.name, 

967 ) 

968 return performances 

969 

970 def marginal_contribution( 

971 self: PerformanceDataFrame, 

972 objective: str | SparkleObjective = None, 

973 instances: list[str] = None, 

974 sort: bool = False, 

975 ) -> list[float]: 

976 """Return the marginal contribution of the solver configuration on the instances. 

977 

978 Args: 

979 objective: The objective for which we calculate the marginal contribution. 

980 instances: The instances which should be selected for the evaluation 

981 sort: Whether to sort the results afterwards 

982 Returns: 

983 The marginal contribution of each solver (configuration) as: 

984 [(solver, config_id, marginal_contribution, portfolio_best_performance_without_solver)] 

985 """ 

986 output = [] 

987 objective = self.verify_objective(objective) 

988 if isinstance(objective, str): 

989 objective = resolve_objective(objective) 

990 best_performance = self.best_performance( 

991 objective=objective, instances=instances 

992 ) 

993 for solver in self.solvers: 

994 for config_id in self.get_configurations(solver): 

995 # By calculating the best performance excluding this Solver, 

996 # we can determine its relative impact on the portfolio. 

997 missing_solver_config_best = self.best_performance( 

998 exclude_solvers=[(solver, config_id)], 

999 instances=instances, 

1000 objective=objective, 

1001 ) 

1002 # Now we need to see how much the portfolio's best performance 

1003 # decreases without this solver. 

1004 marginal_contribution = missing_solver_config_best / best_performance 

1005 if missing_solver_config_best == best_performance: 

1006 # No change, no contribution 

1007 marginal_contribution = 0.0 

1008 output.append( 

1009 ( 

1010 solver, 

1011 config_id, 

1012 marginal_contribution, 

1013 missing_solver_config_best, 

1014 ) 

1015 ) 

1016 if sort: 

1017 output.sort(key=lambda x: x[2], reverse=objective.minimise) 

1018 return output 

1019 

    def get_solver_ranking(
        self: PerformanceDataFrame,
        objective: str | SparkleObjective = None,
        instances: list[str] = None,
    ) -> list[tuple[str, dict, float]]:
        """Return a list with solvers ranked by average performance.

        Args:
            objective: The objective to rank the solvers by.
                If None, resolved through ``verify_objective``.
            instances: The instances to restrict the ranking to.
                If None, all instances are used.

        Returns:
            A list of (solver, configuration id, aggregated performance)
            tuples, sorted from best to worst for the objective.
        """
        objective = self.verify_objective(objective)
        if isinstance(objective, str):
            objective = resolve_objective(objective)
        # Drop Seed: the ranking only depends on the Value column
        sub_df = self.drop(
            [PerformanceDataFrame.column_seed],
            axis=1,
            level=PerformanceDataFrame.column_meta,
        )
        # Reduce objective: keep only the rows of the requested objective
        sub_df: pd.DataFrame = sub_df.loc(axis=0)[objective.name, :, :]
        # Drop Objective, Meta multi index levels; both are constant
        # after the reductions above
        sub_df = sub_df.droplevel(PerformanceDataFrame.index_objective).droplevel(
            PerformanceDataFrame.column_meta, axis=1
        )
        if instances is not None:  # Select instances
            sub_df = sub_df.loc(axis=0)[instances,]
        # Ensure data is numeric so aggregation and sorting behave correctly
        sub_df = sub_df.astype(float)
        # Aggregate runs per instance with the objective's run aggregator
        sub_df = sub_df.groupby(PerformanceDataFrame.index_instance).agg(
            func=objective.run_aggregator.__name__
        )
        # Aggregate instances into one value per (solver, configuration) column
        sub_series = sub_df.aggregate(func=objective.instance_aggregator.__name__)
        # Sort by objective: ascending when minimising, descending otherwise
        sub_series.sort_values(ascending=objective.minimise, inplace=True)
        # Each remaining index label is a (solver, configuration id) column pair
        return [(index[0], index[1], sub_series[index]) for index in sub_series.index]

1054 

1055 def save_csv(self: PerformanceDataFrame, csv_filepath: Path = None) -> None: 

1056 """Write a CSV to the given path. 

1057 

1058 Args: 

1059 csv_filepath: String path to the csv file. Defaults to self.csv_filepath. 

1060 """ 

1061 csv_filepath = self.csv_filepath if csv_filepath is None else csv_filepath 

1062 self.to_csv(csv_filepath) 

1063 # Append the configurations 

1064 with csv_filepath.open("a") as fout: 

1065 fout.write("\n$Solver,configuration_id,Configuration\n") 

1066 for solver in self.solvers: 

1067 for config_id in self.attrs[solver]: 

1068 configuration = self.attrs[solver][config_id] 

1069 fout.write(f"${solver},{config_id},{str(configuration)}\n") 

1070 

1071 def clone( 

1072 self: PerformanceDataFrame, csv_filepath: Path = None 

1073 ) -> PerformanceDataFrame: 

1074 """Create a copy of this object. 

1075 

1076 Args: 

1077 csv_filepath: The new filepath to use for saving the object to. 

1078 If None, will not be saved. 

1079 Warning: If the original path is used, it could lead to dataloss! 

1080 """ 

1081 pd_copy = PerformanceDataFrame( 

1082 csv_filepath=csv_filepath, 

1083 solvers=self.solvers, 

1084 configurations=self.configurations, 

1085 objectives=self.objectives, 

1086 instances=self.instances, 

1087 n_runs=self.num_runs, 

1088 ) 

1089 # Copy values 

1090 for column_index in self.columns: 

1091 for index in self.index: 

1092 pd_copy.at[index, column_index] = self.loc[index, column_index] 

1093 # Ensure everything is sorted? 

1094 return pd_copy 

1095 

1096 def clean_csv(self: PerformanceDataFrame) -> None: 

1097 """Set all values in Performance Data to None.""" 

1098 self[:] = PerformanceDataFrame.missing_value 

1099 self.save_csv()