Coverage for sparkle/structures/performance_dataframe.py: 89%
418 statements
coverage.py v7.9.1, created at 2025-07-01 13:21 +0000
1"""Module to manage performance data files and common operations on them."""
2from __future__ import annotations
3import ast
4import copy
5from typing import Any
6import itertools
7from pathlib import Path
8import math
9import numpy as np
10import pandas as pd
12from sparkle.types import SparkleObjective, resolve_objective


class PerformanceDataFrame(pd.DataFrame):
    """Class to manage performance data and common operations on them."""

    missing_value = math.nan
    missing_objective = "UNKNOWN"
    default_configuration = "Default"

    index_objective = "Objective"
    index_instance = "Instance"
    index_run = "Run"
    multi_index_names = [index_objective, index_instance, index_run]

    column_solver = "Solver"
    column_configuration = "Configuration"
    column_meta = "Meta"
    column_value = "Value"
    column_seed = "Seed"
    multi_column_names = [column_solver, column_configuration, column_meta]
    multi_column_value = [column_value, column_seed]
    multi_column_dtypes = [str, int]

    def __init__(self: PerformanceDataFrame,
                 csv_filepath: Path,
                 solvers: list[str] = None,
                 configurations: dict[str, dict[str, dict]] = None,
                 objectives: list[str | SparkleObjective] = None,
                 instances: list[str] = None,
                 n_runs: int = 1) -> None:
        """Initialise a PerformanceDataFrame.

        Consists of:
            - Columns representing the Solvers
            - Rows representing the result by multi-index in order of:
                * Objective (Static, given in constructor or read from file)
                * Instance
                * Runs (Static, given in constructor or read from file)

        Args:
            csv_filepath: If the path exists, load from this path.
                Otherwise, create a new DataFrame and save it to this path.
            solvers: List of solver names to be added to the DataFrame
            configurations: The configuration keys per solver to add, structured as
                configurations[solver][config_key] = {"parameter": "value", ..}
            objectives: List of SparkleObjectives or objective names. By default None,
                then the objectives will be derived from Sparkle Settings if possible.
            instances: List of instance names to be added to the DataFrame
            n_runs: The number of runs to consider per Solver/Objective/Instance
                combination.
        """
        if csv_filepath and csv_filepath.exists():  # Read from file
            df = pd.read_csv(csv_filepath,
                             header=[0, 1, 2], index_col=[0, 1, 2],
                             dtype={"Value": str, "Seed": int},
                             on_bad_lines="skip",
                             comment="$")  # $ marks extra data lines
            super().__init__(df)
            self.csv_filepath = csv_filepath
            # Load configuration mapping
            with self.csv_filepath.open() as f:
                configuration_lines = [line.strip().strip("$").split(",", maxsplit=2)
                                       for line in f.readlines()
                                       if line.startswith("$")]
            configurations = {s: {} for s in self.solvers}
            for solver, config_key, config in configuration_lines[1:]:  # Skip header
                configurations[solver][config_key] = ast.literal_eval(config.strip('"'))
        else:  # New PerformanceDataFrame
            # Initialize empty DataFrame
            run_ids = list(range(1, n_runs + 1))  # We count runs from 1
            # We always need objectives to maintain the dimensions
            if objectives is None:
                objectives = [PerformanceDataFrame.missing_objective]
            else:
                objectives = [str(o) for o in objectives]
            # We always need an instance to maintain the dimensions
            if instances is None:
                instances = [PerformanceDataFrame.missing_value]
            # We always need a solver to maintain the dimensions
            if solvers is None:
                solvers = [PerformanceDataFrame.missing_value]
            midx = pd.MultiIndex.from_product(
                [objectives, instances, run_ids],
                names=PerformanceDataFrame.multi_index_names)
            # Create the multi index tuples
            if configurations is None:
                configurations = \
                    {solver: {PerformanceDataFrame.default_configuration: {}}
                     for solver in solvers}
            column_tuples = []
            # We cannot do .from_product here as config ids are per solver
            for solver in configurations.keys():
                for config_id in configurations[solver].keys():
                    column_tuples.extend([
                        (solver, config_id, PerformanceDataFrame.column_seed),
                        (solver, config_id, PerformanceDataFrame.column_value)])
            mcolumns = pd.MultiIndex.from_tuples(
                column_tuples,
                names=[PerformanceDataFrame.column_solver,
                       PerformanceDataFrame.column_configuration,
                       PerformanceDataFrame.column_meta])
            # Set dtype object to avoid inferring float for categorical objectives
            super().__init__(PerformanceDataFrame.missing_value,
                             index=midx, columns=mcolumns, dtype="object")
            self.csv_filepath = csv_filepath

        # Store configurations in the global attributes dictionary, see Pandas docs
        self.attrs = configurations

        if self.index.duplicated().any():  # Combine duplicate indices
            combined = self.groupby(level=[0, 1, 2]).first()
            # We keep the last to allow overwriting existing values
            duplicates = self.index[self.index.duplicated(keep="last")]
            # Remove all duplicate entries from self
            self.drop(duplicates, inplace=True)
            for d in duplicates:  # Place combined duplicates in self
                self.loc[d, :] = combined.loc[d, :]

        # Sort the index to optimize lookup speed
        self.sort_index(axis=0, inplace=True)
        self.sort_index(axis=1, inplace=True)

        if csv_filepath and not self.csv_filepath.exists():  # New PerformanceDataFrame
            self.save_csv()
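
    # Example (illustrative sketch; the path, solver, objective and instance
    # names below are hypothetical):
    #
    #     pdf = PerformanceDataFrame(Path("Output/performance_data.csv"),
    #                                solvers=["SolverA", "SolverB"],
    #                                objectives=["PAR10"],
    #                                instances=["instance_1.cnf", "instance_2.cnf"],
    #                                n_runs=2)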

    # Properties

    @property
    def num_objectives(self: PerformanceDataFrame) -> int:
        """Retrieve the number of objectives in the DataFrame."""
        return self.index.get_level_values(0).unique().size

    @property
    def num_instances(self: PerformanceDataFrame) -> int:
        """Return the number of instances."""
        return self.index.get_level_values(1).unique().size

    @property
    def num_runs(self: PerformanceDataFrame) -> int:
        """Return the maximum number of runs of each instance."""
        return self.index.get_level_values(2).unique().size

    @property
    def num_solvers(self: PerformanceDataFrame) -> int:
        """Return the number of solvers."""
        return self.columns.get_level_values(0).unique().size

    @property
    def num_solver_configurations(self: PerformanceDataFrame) -> int:
        """Return the number of solver configurations."""
        return int(self.columns.get_level_values(  # Each config has a Seed & Value
            PerformanceDataFrame.column_configuration).size / 2)

    @property
    def multi_objective(self: PerformanceDataFrame) -> bool:
        """Return whether the DataFrame represents multiple objectives."""
        return self.num_objectives > 1

    @property
    def solvers(self: PerformanceDataFrame) -> list[str]:
        """Return the solvers present as a list of strings."""
        # Do not return the nan solver as it is not an actual solver
        return self.columns.get_level_values(
            PerformanceDataFrame.column_solver).dropna().unique().to_list()

    @property
    def configuration_ids(self: PerformanceDataFrame) -> list[str]:
        """Return the list of configuration keys."""
        return self.columns.get_level_values(
            PerformanceDataFrame.column_configuration).unique().to_list()

    @property
    def configurations(self: PerformanceDataFrame) -> dict[str, dict[str, dict]]:
        """Return a dictionary (copy) containing the configurations for each solver."""
        return copy.deepcopy(self.attrs)  # Deepcopy to avoid mutation of attribute

    @property
    def objective_names(self: PerformanceDataFrame) -> list[str]:
        """Return the objective names as a list of strings."""
        return self.index.get_level_values(0).unique().to_list()

    @property
    def objectives(self: PerformanceDataFrame) -> list[SparkleObjective]:
        """Return the objectives as a list of SparkleObjectives."""
        return [resolve_objective(o) for o in self.objective_names]

    @property
    def instances(self: PerformanceDataFrame) -> list[str]:
        """Return the instances as a list of strings."""
        return self.index.get_level_values(1).unique().to_list()

    @property
    def run_ids(self: PerformanceDataFrame) -> list[int]:
        """Return the run ids as a list of integers."""
        return self.index.get_level_values(2).unique().to_list()

    @property
    def has_missing_values(self: PerformanceDataFrame) -> bool:
        """Return True if there are any missing values in the DataFrame."""
        return self.drop(PerformanceDataFrame.column_seed,
                         level=PerformanceDataFrame.column_meta,
                         axis=1).isnull().any().any()
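
    # Example (illustrative; assumes a DataFrame like the constructor sketch above):
    #
    #     pdf.num_solvers         # 2
    #     pdf.objective_names     # ["PAR10"]
    #     pdf.has_missing_values  # True until results are filled in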

    def is_missing(self: PerformanceDataFrame,
                   solver: str,
                   instance: str) -> bool:
        """Check whether a solver/instance combination has missing values."""
        return self.xs(solver, axis=1).xs(
            instance, axis=0,
            level=PerformanceDataFrame.index_instance).drop(
            PerformanceDataFrame.column_seed,
            level=PerformanceDataFrame.column_meta,
            axis=1).isnull().any().any()

    def verify_objective(self: PerformanceDataFrame,
                         objective: str) -> str:
        """Check whether the specified objective is valid.

        Users are allowed to index the DataFrame without specifying all dimensions.
        However, when dealing with multiple objectives this is not allowed, and that
        is verified here. If we have only one objective, it is returned. Otherwise,
        if an objective is specified by the user, that objective is returned.

        Args:
            objective: The objective given by the user
        """
        if objective is None:
            if self.multi_objective:
                raise ValueError("Error: MO Data, but objective not specified.")
            elif self.num_objectives == 1:
                return self.objective_names[0]
            else:
                return PerformanceDataFrame.missing_objective
        return objective

    def verify_run_id(self: PerformanceDataFrame,
                      run_id: int) -> int:
        """Check whether the run id is valid.

        Similar to verify_objective, but here we check the dimensionality of runs.

        Args:
            run_id: The run as specified by the user.
        """
        if run_id is None:
            if self.num_runs > 1:
                raise ValueError("Error: Multiple run performance data, "
                                 "but run not specified")
            else:
                run_id = self.run_ids[0]
        return run_id

    def verify_indexing(self: PerformanceDataFrame,
                        objective: str,
                        run_id: int) -> tuple[str, int]:
        """Check whether data indexing is correct.

        Users are allowed to use the PerformanceDataFrame without the Objective
        and Run dimensions in case they only have one objective or only do one
        run. This method adjusts the indexing for those cases accordingly.

        Args:
            objective: The given objective name
            run_id: The given run index

        Returns:
            A tuple representing the (possibly adjusted) Objective and Run index.
        """
        objective = self.verify_objective(objective)
        run_id = self.verify_run_id(run_id)
        return objective, run_id
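
    # Example (illustrative): for a single-objective, single-run DataFrame the
    # omitted dimensions are filled in automatically:
    #
    #     objective, run = pdf.verify_indexing(None, None)  # e.g. ("PAR10", 1)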

    # Getters and Setters

    def add_solver(self: PerformanceDataFrame,
                   solver_name: str,
                   configurations: list[tuple[str, dict]] = None,
                   initial_value: float | list[str | float] = None) -> None:
        """Add a new solver to the DataFrame. Initializes value to None by default.

        Args:
            solver_name: The name of the solver to be added.
            configurations: A list of (configuration key, configuration) tuples
                for the solver.
            initial_value: The value assigned for each index of the new solver.
                If not None, must match the index dimension (n_obj * n_inst * n_runs).
        """
        if solver_name in self.solvers:
            print(f"WARNING: Tried adding already existing solver {solver_name} to "
                  f"Performance DataFrame: {self.csv_filepath}")
            return
        if not isinstance(initial_value, list):  # Single value
            initial_value = [[initial_value, initial_value]]
        if configurations is None:
            configurations = [(PerformanceDataFrame.default_configuration, {})]
        self.attrs[solver_name] = {}
        for (config_key, config), (value, seed) in itertools.product(configurations,
                                                                     initial_value):
            self[(solver_name, config_key, PerformanceDataFrame.column_seed)] = seed
            self[(solver_name, config_key, PerformanceDataFrame.column_value)] = value
            self.attrs[solver_name][config_key] = config
        if self.num_solvers == 2:  # Remove nan solver
            for solver in self.solvers:
                if str(solver) == str(PerformanceDataFrame.missing_value):
                    self.remove_solver(solver)
                    break
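
    # Example (illustrative; solver and parameter names are hypothetical):
    #
    #     pdf.add_solver("SolverC",
    #                    configurations=[("Default", {}),
    #                                    ("tuned", {"alpha": 0.5})])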

    def add_configuration(
            self: PerformanceDataFrame,
            solver: str,
            configuration_id: str | list[str],
            configuration: dict[str, Any] | list[dict[str, Any]] = None) -> None:
        """Add new configurations for a solver to the DataFrame.

        If the key already exists, update its value.

        Args:
            solver: The name of the solver for which to add configurations.
            configuration_id: The name(s) of the configuration(s) to be added.
            configuration: The configuration(s) to be added.
        """
        if not isinstance(configuration_id, list):
            configuration_id = [configuration_id]
        if not isinstance(configuration, list):
            configuration = [configuration]
        for config_id, config in zip(configuration_id, configuration):
            if config_id not in self.get_configurations(solver):
                self[(solver, config_id, PerformanceDataFrame.column_value)] = None
                self[(solver, config_id, PerformanceDataFrame.column_seed)] = None
            self.attrs[solver][config_id] = config
        # Sort the index to optimize lookup speed
        self.sort_index(axis=1, inplace=True)

    def add_objective(self: PerformanceDataFrame,
                      objective_name: str,
                      initial_value: float = None) -> None:
        """Add an objective to the DataFrame."""
        # Explicit None check so a falsy value such as 0.0 is kept
        initial_value = initial_value if initial_value is not None \
            else self.missing_value
        if objective_name in self.objective_names:
            print(f"WARNING: Tried adding already existing objective {objective_name} "
                  f"to Performance DataFrame: {self.csv_filepath}")
            return
        for instance, run in itertools.product(self.instances, self.run_ids):
            self.loc[(objective_name, instance, run)] = initial_value
        self.sort_index(axis=0, inplace=True)

    def add_instance(self: PerformanceDataFrame,
                     instance_name: str,
                     initial_values: Any | list[Any] = None) -> None:
        """Add an instance to the DataFrame.

        Args:
            instance_name: The name of the instance to be added.
            initial_values: The values assigned for each index of the new instance.
                If a list, must match the column dimension (Value, Seed, Configuration).
        """
        # Explicit None check so a falsy value such as 0.0 is kept
        initial_values = initial_values if initial_values is not None \
            else self.missing_value
        if not isinstance(initial_values, list):
            initial_values = ([initial_values]
                              * 2  # Value and Seed per target column
                              * self.num_solver_configurations)
        elif len(initial_values) == len(PerformanceDataFrame.multi_column_names):
            initial_values = initial_values * self.num_solvers

        if instance_name in self.instances:
            print(f"WARNING: Tried adding already existing instance {instance_name} "
                  f"to Performance DataFrame: {self.csv_filepath}")
            return
        # Add rows for all combinations
        for objective, run in itertools.product(self.objective_names, self.run_ids):
            self.loc[(objective, instance_name, run)] = initial_values
        if self.num_instances == 2:  # Remove nan instance
            for instance in self.instances:
                if not isinstance(instance, str) and math.isnan(instance):
                    self.remove_instances(instance)
                    break
        # Sort the index to optimize lookup speed
        self.sort_index(axis=0, inplace=True)

    def add_runs(self: PerformanceDataFrame,
                 num_extra_runs: int,
                 instance_names: list[str] = None,
                 initial_values: Any | list[Any] = None) -> None:
        """Add runs to the DataFrame.

        Args:
            num_extra_runs: The number of runs to be added.
            instance_names: The instances for which runs are to be added.
                By default None, which means runs are added to all instances.
            initial_values: The initial value for each objective of each new run.
                If a list, needs to have a value for Value, Seed and Configuration.
        """
        # Explicit None check so a falsy value such as 0.0 is kept
        initial_values = initial_values if initial_values is not None \
            else self.missing_value
        if not isinstance(initial_values, list):
            initial_values = \
                [initial_values] * self.num_solvers * 2  # Value and Seed
        elif len(initial_values) == 2:  # Value and seed provided
            initial_values = initial_values * self.num_solvers
        instance_names = self.instances if instance_names is None else instance_names
        for objective, instance in itertools.product(self.objective_names,
                                                     instance_names):
            index_runs_start = len(self.loc[(objective, instance)]) + 1
            for run in range(index_runs_start, index_runs_start + num_extra_runs):
                self.loc[(objective, instance, run)] = initial_values
            # Sort the index to optimize lookup speed
            # NOTE: It would be better to do this once at the end, but that results
            # in PerformanceWarning: indexing past lexsort depth may impact
            # performance.
            self.sort_index(axis=0, inplace=True)
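
    # Example (illustrative; the instance name is hypothetical): add a new
    # instance and one extra run for it:
    #
    #     pdf.add_instance("instance_3.cnf")
    #     pdf.add_runs(1, instance_names=["instance_3.cnf"])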

    def get_configurations(self: PerformanceDataFrame,
                           solver_name: str) -> list[str]:
        """Return the list of configuration keys for a solver."""
        return list(self[solver_name].columns.get_level_values(
            PerformanceDataFrame.column_configuration).unique())

    def get_full_configuration(self: PerformanceDataFrame,
                               solver: str,
                               configuration_id: str | list[str]) -> dict | list[dict]:
        """Return the actual configuration associated with the configuration key."""
        if isinstance(configuration_id, str):
            return self.attrs[solver][configuration_id]
        return [self.attrs[solver][cid] for cid in configuration_id]

    def remove_solver(self: PerformanceDataFrame, solvers: str | list[str]) -> None:
        """Drop one or more solvers from the DataFrame."""
        if not solvers:  # Bugfix for when an empty list is passed to avoid nan adding
            return
        solvers = [solvers] if isinstance(solvers, str) else solvers
        # To make sure objectives / runs are preserved when no solvers remain
        if self.num_solvers == 1:  # This would preferably be done after removing
            for field in PerformanceDataFrame.multi_column_value:
                self[PerformanceDataFrame.missing_value,
                     PerformanceDataFrame.missing_value, field] = \
                    PerformanceDataFrame.missing_value
        self.drop(columns=solvers, level=0, axis=1, inplace=True)
        for solver in solvers:
            del self.attrs[solver]

    def remove_configuration(self: PerformanceDataFrame,
                             solver: str, configuration: str | list[str]) -> None:
        """Drop one or more configurations from the DataFrame."""
        if isinstance(configuration, str):
            configuration = [configuration]
        for config in configuration:
            self.drop((solver, config), axis=1, inplace=True)
            del self.attrs[solver][config]
        # Sort the index to optimize lookup speed
        self.sort_index(axis=1, inplace=True)

    def remove_objective(self: PerformanceDataFrame,
                         objectives: str | list[str]) -> None:
        """Remove one or more objectives from the DataFrame."""
        if len(self.objectives) < 2:
            raise Exception("Cannot remove last objective from PerformanceDataFrame")
        self.drop(objectives,
                  axis=0, level=PerformanceDataFrame.index_objective, inplace=True)

    def remove_instances(self: PerformanceDataFrame,
                         instances: str | list[str]) -> None:
        """Drop instances from the DataFrame."""
        # To make sure objectives / runs are preserved when no instances remain
        num_instances = len(instances) if isinstance(instances, list) else 1
        if self.num_instances - num_instances == 0:
            for objective, run in itertools.product(self.objective_names,
                                                    self.run_ids):
                self.loc[(objective, PerformanceDataFrame.missing_value, run)] = \
                    PerformanceDataFrame.missing_value
        self.drop(instances,
                  axis=0,
                  level=PerformanceDataFrame.index_instance, inplace=True)
        # Sort the index to optimize lookup speed
        self.sort_index(axis=0, inplace=True)

    def remove_runs(self: PerformanceDataFrame,
                    runs: int | list[int],
                    instance_names: list[str] = None) -> None:
        """Drop one or more runs from the DataFrame.

        Args:
            runs: The run indices to be removed. If it is an int,
                the last n runs are removed. NOTE: If each instance has a different
                number of runs, the number of removed runs is not uniform.
            instance_names: The instances for which runs are to be removed.
                By default None, which means runs are removed from all instances.
        """
        instance_names = self.instances if instance_names is None else instance_names
        # NOTE: instance_names is currently not applied to the drop below,
        # so runs are removed across all instances.
        runs = list(range((self.num_runs + 1) - runs, (self.num_runs + 1))) \
            if isinstance(runs, int) else runs
        self.drop(runs,
                  axis=0,
                  level=PerformanceDataFrame.index_run,
                  inplace=True)
        # Sort the index to optimize lookup speed
        self.sort_index(axis=0, inplace=True)

    def remove_empty_runs(self: PerformanceDataFrame) -> None:
        """Remove runs that contain no data, except for the first."""
        for row_index in self.index:
            if row_index[2] == 1:  # First run, never delete
                continue
            if self.loc[row_index].isna().all():
                self.drop(row_index, inplace=True)

    def filter_objective(self: PerformanceDataFrame,
                         objective: str | list[str]) -> None:
        """Filter the DataFrame to a subset of objectives."""
        if isinstance(objective, str):
            objective = [objective]
        self.drop(list(set(self.objective_names) - set(objective)),
                  axis=0, level=PerformanceDataFrame.index_objective, inplace=True)

    def reset_value(self: PerformanceDataFrame,
                    solver: str,
                    instance: str,
                    objective: str = None,
                    run: int = None) -> None:
        """Reset a value in the DataFrame."""
        # Use keyword arguments: positionally, objective would be passed as the
        # configuration parameter of set_value
        self.set_value(PerformanceDataFrame.missing_value,
                       solver, instance, objective=objective, run=run)

    def set_value(self: PerformanceDataFrame,
                  value: float | str | list[float | str] | list[list[float | str]],
                  solver: str | list[str],
                  instance: str | list[str],
                  configuration: str = None,
                  objective: str | list[str] = None,
                  run: int | list[int] = None,
                  solver_fields: list[str] = ["Value"],
                  append_write_csv: bool = False) -> None:
        """Setter method to assign a value to the DataFrame.

        Allows for setting the same value to multiple indices.

        Args:
            value: Value(s) to be assigned. If value is a list, the first dimension
                is the solver field; the second dimension is used when multiple
                different values are to be assigned. Must be the same shape as the
                target.
            solver: The solver(s) for which the value should be set.
                If solver is a list, multiple solvers are set. If None, all
                solvers are set.
            instance: The instance(s) for which the value should be set.
                If instance is a list, multiple instances are set. If None, all
                instances are set.
            configuration: The configuration(s) for which the value should be set.
                When left None, set for all configurations.
            objective: The objectives for which the value should be set.
                When left None, set for all objectives.
            run: The run index for which the value should be set.
                If left None, set for all runs.
            solver_fields: The level to which each value should be assigned.
                Defaults to ["Value"].
            append_write_csv: For concurrent writing to the PerformanceDataFrame.
                If True, the value is directly appended to the CSV file.
                This will create duplicate entries in the file, but these are
                combined when loading the file.
        """
        # Convert indices to slices for None values
        solver = slice(solver) if solver is None else solver
        configuration = slice(configuration) if configuration is None else configuration
        instance = slice(instance) if instance is None else instance
        objective = slice(objective) if objective is None else objective
        run = slice(run) if run is None else run
        # Wrap single values in a list for setting multiple columns
        value = [value] if not isinstance(value, list) else value
        # NOTE: We currently loop over the levels here, as it allows us to set the
        # same sequence of values to the indices
        for item, level in zip(value, solver_fields):
            self.loc[(objective, instance, run), (solver, configuration, level)] = item

        if append_write_csv:
            writeable = self.loc[(objective, instance, run), :]
            if isinstance(writeable, pd.Series):  # Single row, convert to pd.DataFrame
                writeable = self.loc[[(objective, instance, run)], :]
            # Append the new rows to the DataFrame CSV file
            writeable.to_csv(self.csv_filepath, mode="a", header=False)
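
    # Example (illustrative; names are hypothetical): record a result and its
    # seed for one solver/instance/run combination:
    #
    #     pdf.set_value([42.0, 1234], "SolverA", "instance_1.cnf",
    #                   configuration="Default", objective="PAR10", run=1,
    #                   solver_fields=["Value", "Seed"])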

    def get_value(self: PerformanceDataFrame,
                  solver: str | list[str] = None,
                  instance: str | list[str] = None,
                  configuration: str = None,
                  objective: str = None,
                  run: int = None,
                  solver_fields: list[str] = ["Value"]
                  ) -> float | str | list[Any]:
        """Index a value of the DataFrame and return it."""
        # Convert indices to slices for None values
        solver = slice(solver) if solver is None else solver
        configuration = slice(configuration) if configuration is None else configuration
        instance = slice(instance) if instance is None else instance
        objective = slice(objective) if objective is None else objective
        solver_fields = slice(solver_fields) if solver_fields is None else solver_fields
        run = slice(run) if run is None else run
        target = self.loc[(objective, instance, run),
                          (solver, configuration, solver_fields)].values
        # Reduce dimensions when relevant
        if len(target) > 0 and isinstance(target[0], np.ndarray) and len(target[0]) == 1:
            target = target.flatten()
        target = target.tolist()
        if len(target) == 1:
            return target[0]
        return target
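
    # Example (illustrative; continues the set_value sketch above, returning the
    # stored value for a single fully-specified index):
    #
    #     pdf.get_value("SolverA", "instance_1.cnf",
    #                   configuration="Default", objective="PAR10", run=1)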

    def get_instance_num_runs(self: PerformanceDataFrame,
                              instance: str) -> int:
        """Return the number of runs for an instance."""
        # We assume each objective has the same index for Instance/Runs
        return len(self.loc[(self.objective_names[0], instance)].index)

    # Calculables

    def mean(self: PerformanceDataFrame,
             objective: str = None,
             solver: str = None,
             instance: str = None) -> float:
        """Return the mean value of a slice of the DataFrame."""
        objective = self.verify_objective(objective)
        subset = self.xs(objective, level=0)
        if solver is not None:
            subset = subset.xs(solver, axis=1, drop_level=False)
        if instance is not None:
            subset = subset.xs(instance, axis=0, drop_level=False)
        value = subset.astype(float).mean()
        if isinstance(value, pd.Series):
            return value.mean()
        return value

    def get_job_list(self: PerformanceDataFrame, rerun: bool = False) \
            -> list[tuple[str, str, str, int]]:
        """Return a list of performance computation jobs that are still to be done.

        Get a list of (solver, config, instance, run) tuples to run from the
        performance data. If rerun is False (default), get only the tuples that
        do not have a value yet; else (True) get all the tuples.

        Args:
            rerun: Boolean indicating if we want to rerun all jobs

        Returns:
            A list of (solver, config, instance, run) combinations
        """
        # Drop the seed as we are looking for nan values, not seeds
        df = self.drop(PerformanceDataFrame.column_seed, axis=1,
                       level=PerformanceDataFrame.column_meta)
        df = df.droplevel(PerformanceDataFrame.column_meta, axis=1)
        if rerun:  # Return all combinations
            # Drop objective, not needed
            df = df.droplevel(PerformanceDataFrame.index_objective, axis=0)
            result = [tuple(column) + tuple(index)
                      for column, index in itertools.product(df.columns, df.index)]
        else:
            result = []
            for (solver, config), (objective, instance, run) in itertools.product(
                    df.columns, df.index):
                value = df.loc[(objective, instance, run), (solver, config)]
                if value is None or (
                        isinstance(value, (int, float)) and math.isnan(value)):
                    result.append((solver, config, instance, run))
        # Filter duplicates
        result = list(set(result))
        return result
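
    # Example (illustrative): dispatch all jobs that have no value yet:
    #
    #     for solver, config, instance, run in pdf.get_job_list():
    #         ...  # run the solver configuration on the instance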

    def configuration_performance(
            self: PerformanceDataFrame,
            solver: str,
            configuration: str | list[str] = None,
            objective: str | SparkleObjective = None,
            instances: list[str] = None,
            per_instance: bool = False) -> tuple[str, float | list[float]]:
        """Return the (best) configuration performance for objective over the instances.

        Args:
            solver: The solver for which we evaluate the configuration(s)
            configuration: The configuration (id) to evaluate
            objective: The objective for which we find the best value
            instances: The instances which should be selected for the evaluation
            per_instance: Whether to return the performance per instance,
                or aggregated.

        Returns:
            The (best) configuration id and its aggregated performance.
        """
        objective = self.verify_objective(objective)
        if isinstance(objective, str):
            objective = resolve_objective(objective)
        # Filter objective
        subdf = self.xs(objective.name, level=0, drop_level=True)
        # Filter solver
        subdf = subdf.xs(solver, axis=1, drop_level=True)
        # Drop the seed, then drop the meta level as it is no longer needed
        subdf = subdf.drop(PerformanceDataFrame.column_seed, axis=1,
                           level=PerformanceDataFrame.column_meta)
        subdf = subdf.droplevel(PerformanceDataFrame.column_meta, axis=1)
        # Ensure the objective is numeric
        subdf = subdf.astype(float)

        if instances:  # Filter instances
            subdf = subdf.loc[instances, :]
        if configuration:  # Filter configuration
            if not isinstance(configuration, list):
                configuration = [configuration]
            subdf = subdf.filter(configuration, axis=1)
        # Aggregate the runs
        subdf = subdf.groupby(PerformanceDataFrame.index_instance).agg(
            func=objective.run_aggregator.__name__)
        # Aggregate the instances
        sub_series = subdf.agg(func=objective.instance_aggregator.__name__)
        # Select the best configuration
        best_conf = sub_series.idxmin() if objective.minimise else sub_series.idxmax()
        if per_instance:  # Return a list of instance results
            return best_conf, subdf[best_conf].to_list()
        return best_conf, sub_series[best_conf]
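
    # Example (illustrative; assumes "PAR10" resolves to a known SparkleObjective):
    #
    #     best_id, performance = pdf.configuration_performance(
    #         "SolverA", objective="PAR10")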

    def best_configuration(self: PerformanceDataFrame,
                           solver: str,
                           objective: SparkleObjective = None,
                           instances: list[str] = None) -> tuple[str, float]:
        """Return the best configuration for the given objective over the instances.

        Args:
            solver: The solver for which we determine the best configuration
            objective: The objective for which we calculate the best configuration
            instances: The instances which should be selected for the evaluation

        Returns:
            The best configuration id and its aggregated performance.
        """
        return self.configuration_performance(solver, None, objective, instances)

    def best_instance_performance(
            self: PerformanceDataFrame,
            objective: str | SparkleObjective = None,
            instances: list[str] = None,
            run_id: int = None,
            exclude_solvers: list[tuple[str, str]] = None) -> pd.Series:
        """Return the best performance for each instance in the portfolio.

        Args:
            objective: The objective for which we calculate the best performance
            instances: The instances which should be selected for the evaluation
            run_id: The run for which we calculate the best performance. If None,
                we consider all runs.
            exclude_solvers: List of (solver, config_id) to exclude in the calculation.

        Returns:
            The best performance for each instance in the portfolio.
        """
        objective = self.verify_objective(objective)
        if isinstance(objective, str):
            objective = resolve_objective(objective)
        subdf = self.drop(  # Drop Seed, not needed
            [PerformanceDataFrame.column_seed],
            axis=1, level=PerformanceDataFrame.column_meta)
        subdf = subdf.xs(objective.name, level=0)  # Drop objective
        if exclude_solvers is not None:
            subdf = subdf.drop(exclude_solvers, axis=1)
        if instances is not None:
            subdf = subdf.loc[instances, :]
        if run_id is not None:
            run_id = self.verify_run_id(run_id)
            subdf = subdf.xs(run_id, level=1)
        else:
            # Drop the run level
            subdf = subdf.droplevel(level=1)
        # Ensure the objective is numeric
        subdf = subdf.astype(float)
        series = subdf.min(axis=1) if objective.minimise else subdf.max(axis=1)
        # Ensure we always return the best for each run
        series = series.sort_values(ascending=objective.minimise)
        return series.groupby(series.index).first().astype(float)

    def best_performance(
            self: PerformanceDataFrame,
            exclude_solvers: list[tuple[str, str]] = [],
            instances: list[str] = None,
            objective: str | SparkleObjective = None) -> float:
        """Return the overall best performance of the portfolio.

        Args:
            exclude_solvers: List of (solver, config_id) to exclude in the calculation.
                Defaults to an empty list.
            instances: The instances which should be selected for the evaluation.
                If None, use all instances.
            objective: The objective for which we calculate the best performance

        Returns:
            The aggregated best performance of the portfolio over all instances.
        """
        objective = self.verify_objective(objective)
        if isinstance(objective, str):
            objective = resolve_objective(objective)
        instance_best = self.best_instance_performance(
            objective, instances=instances,
            exclude_solvers=exclude_solvers).to_numpy(dtype=float)
        return objective.instance_aggregator(instance_best)
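
    # Example (illustrative): the portfolio's best achievable performance,
    # excluding one hypothetical solver configuration:
    #
    #     vbs = pdf.best_performance(exclude_solvers=[("SolverB", "Default")],
    #                                objective="PAR10")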

    def schedule_performance(
            self: PerformanceDataFrame,
            schedule: dict[str, list[tuple[str, str, int]]],
            target_solver: str | tuple[str, str] = None,
            objective: str | SparkleObjective = None) -> list[float]:
        """Return the performance of a selection schedule on the portfolio.

        Args:
            schedule: Compute the best performance according to a selection schedule.
                A schedule is a dictionary of instances, with a schedule per instance,
                consisting of triples of solver, config_id and maximum runtime.
            target_solver: If not None, store the found values in this solver of the
                DataFrame.
            objective: The objective for which we calculate the best performance

        Returns:
            The performance of the schedule over the instances in the dictionary.
        """
        objective = self.verify_objective(objective)
        if isinstance(objective, str):
            objective = resolve_objective(objective)
        select = min if objective.minimise else max
        performances = [0.0] * len(schedule.keys())
        if not isinstance(target_solver, tuple):
            target_conf = PerformanceDataFrame.default_configuration
        else:
            target_solver, target_conf = target_solver
        if target_solver and target_solver not in self.solvers:
            self.add_solver(target_solver)
        for ix, instance in enumerate(schedule.keys()):
            for iy, (solver, config, max_runtime) in enumerate(schedule[instance]):
                performance = float(self.get_value(
                    solver, instance, config, objective.name))
                if max_runtime is not None:  # We are dealing with runtime
                    performances[ix] += performance
                    if performance < max_runtime:
                        break  # Solver finished in time
                else:  # Quality, we take the best found performance
                    if iy == 0:  # First solver, set initial value
                        performances[ix] = performance
                        continue
                    performances[ix] = select(performances[ix], performance)
            if target_solver is not None:
                self.set_value(performances[ix], target_solver,
                               instance, target_conf, objective.name)
        return performances
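
    # Example (illustrative): a runtime-based selection schedule with
    # hypothetical names, giving each solver a maximum runtime per instance:
    #
    #     schedule = {"instance_1.cnf": [("SolverA", "Default", 60),
    #                                    ("SolverB", "Default", 240)]}
    #     performances = pdf.schedule_performance(schedule, objective="PAR10")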

    def marginal_contribution(
            self: PerformanceDataFrame,
            objective: str | SparkleObjective = None,
            instances: list[str] = None,
            sort: bool = False) -> list[tuple[str, str, float, float]]:
        """Return the marginal contribution of the solver configurations on the instances.

        Args:
            objective: The objective for which we calculate the marginal contribution.
            instances: The instances which should be selected for the evaluation
            sort: Whether to sort the results afterwards

        Returns:
            The marginal contribution of each solver configuration, as tuples of
            (solver, config_id, contribution, best performance without the solver).
        """
        output = []
        objective = self.verify_objective(objective)
        if isinstance(objective, str):
            objective = resolve_objective(objective)
        best_performance = self.best_performance(objective=objective,
                                                 instances=instances)
        for solver in self.solvers:
            for config_id in self.get_configurations(solver):
                # By calculating the best performance excluding this solver,
                # we can determine its relative impact on the portfolio.
                missing_solver_config_best = self.best_performance(
                    exclude_solvers=[(solver, config_id)],
                    instances=instances,
                    objective=objective)
                # Now we need to see how much the portfolio's best performance
                # degrades without this solver.
                marginal_contribution = missing_solver_config_best / best_performance
                if missing_solver_config_best == best_performance:
                    # No change, no contribution
                    marginal_contribution = 0.0
                output.append((solver, config_id,
                               marginal_contribution, missing_solver_config_best))
        if sort:
            output.sort(key=lambda x: x[2], reverse=objective.minimise)
        return output
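
    # Example (illustrative): rank configurations by marginal contribution:
    #
    #     for solver, config_id, contribution, _ in pdf.marginal_contribution(
    #             objective="PAR10", sort=True):
    #         print(solver, config_id, contribution)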

    def get_solver_ranking(self: PerformanceDataFrame,
                           objective: str | SparkleObjective = None,
                           instances: list[str] = None,
                           ) -> list[tuple[str, str, float]]:
        """Return a list with solver configurations ranked by average performance."""
        objective = self.verify_objective(objective)
        if isinstance(objective, str):
            objective = resolve_objective(objective)
        # Drop Seed
        sub_df = self.drop(
            [PerformanceDataFrame.column_seed],
            axis=1, level=PerformanceDataFrame.column_meta)
        # Reduce objective
        sub_df: pd.DataFrame = sub_df.loc(axis=0)[objective.name, :, :]
        # Drop the Objective and Meta multi index levels
        sub_df = sub_df.droplevel(PerformanceDataFrame.index_objective).droplevel(
            PerformanceDataFrame.column_meta, axis=1)
        if instances is not None:  # Select instances
            sub_df = sub_df.loc(axis=0)[instances, ]
        # Ensure data is numeric
        sub_df = sub_df.astype(float)
        # Aggregate runs
        sub_df = sub_df.groupby(PerformanceDataFrame.index_instance).agg(
            func=objective.run_aggregator.__name__)
        # Aggregate instances
        sub_series = sub_df.aggregate(func=objective.instance_aggregator.__name__)
        # Sort by objective
        sub_series.sort_values(ascending=objective.minimise, inplace=True)
        return [(index[0], index[1], sub_series[index]) for index in sub_series.index]

    def save_csv(self: PerformanceDataFrame, csv_filepath: Path = None) -> None:
        """Write a CSV to the given path.

        Args:
            csv_filepath: Path to the CSV file. Defaults to self.csv_filepath.
        """
        csv_filepath = self.csv_filepath if csv_filepath is None else csv_filepath
        self.to_csv(csv_filepath)
        # Append the configurations
        with csv_filepath.open("a") as fout:
            fout.write("\n$Solver,configuration_id,Configuration\n")
            for solver in self.solvers:
                for config_id in self.attrs[solver]:
                    configuration = self.attrs[solver][config_id]
                    fout.write(f"${solver},{config_id},{str(configuration)}\n")

    def clone(self: PerformanceDataFrame,
              csv_filepath: Path = None) -> PerformanceDataFrame:
        """Create a copy of this object.

        Args:
            csv_filepath: The new filepath to use for saving the object to.
                If None, the clone will not be saved.
                Warning: If the original path is used, it could lead to data loss!
        """
        pd_copy = PerformanceDataFrame(
            csv_filepath=csv_filepath,
            solvers=self.solvers,
            configurations=self.configurations,
            objectives=self.objectives,
            instances=self.instances,
            n_runs=self.num_runs)
        # Copy values
        for column_index in self.columns:
            for index in self.index:
                pd_copy.at[index, column_index] = self.loc[index, column_index]
        # Index order is already handled by the constructor's sort_index calls
        return pd_copy

    def clean_csv(self: PerformanceDataFrame) -> None:
        """Reset all values in the Performance Data to missing and save the CSV."""
        self[:] = PerformanceDataFrame.missing_value
        self.save_csv()