1"""Module to manage performance data files and common operations on them."""
3from __future__ import annotations
4import ast
5import copy
6from typing import Any
7import itertools
8from pathlib import Path
9import math
10import numpy as np
11import pandas as pd
13from sparkle.types import SparkleObjective, resolve_objective


class PerformanceDataFrame(pd.DataFrame):
    """Class to manage performance data and common operations on them."""

    missing_value = math.nan

    missing_objective = "UNKNOWN"
    default_configuration = "Default"

    index_objective = "Objective"
    index_instance = "Instance"
    index_run = "Run"
    multi_index_names = [index_objective, index_instance, index_run]

    column_solver = "Solver"
    column_configuration = "Configuration"
    column_meta = "Meta"
    column_value = "Value"
    column_seed = "Seed"
    multi_column_names = [column_solver, column_configuration, column_meta]
    multi_column_value = [column_value, column_seed]
    multi_column_dtypes = [str, int]

    def __init__(
        self: PerformanceDataFrame,
        csv_filepath: Path,
        solvers: list[str] = None,
        configurations: dict[str, dict[str, dict]] = None,
        objectives: list[str | SparkleObjective] = None,
        instances: list[str] = None,
        n_runs: int = 1,
    ) -> None:
47 """Initialise a PerformanceDataFrame.
49 Consists of:
50 - Columns representing the Solvers
51 - Rows representing the result by multi-index in order of:
52 * Objective (Static, given in constructor or read from file)
53 * Instance
54 * Runs (Static, given in constructor or read from file)
56 Args:
57 csv_filepath: If path exists, load from Path.
58 Otherwise create new and save to this path.
59 solvers: List of solver names to be added into the Dataframe
60 configurations: The configuration keys per solver to add, structured as
61 configurations[solver][config_key] = {"parameter": "value", ..}
62 objectives: List of SparkleObjectives or objective names. By default None,
63 then the objectives will be derived from Sparkle Settings if possible.
64 instances: List of instance names to be added into the Dataframe
65 n_runs: The number of runs to consider per Solver/Objective/Instance comb.
66 """
        if csv_filepath and csv_filepath.exists():  # Read from file
            df = pd.read_csv(
                csv_filepath,
                header=[0, 1, 2],
                index_col=[0, 1, 2],
                on_bad_lines="skip",
                dtype={"Value": str, "Seed": int},
                comment="$",
            )  # Lines starting with $ carry extra (configuration) data
            super().__init__(df)
            self.csv_filepath = csv_filepath
            # Load the configuration mapping
            with self.csv_filepath.open() as f:
                configuration_lines = [
                    line.strip().strip("$").split(",", maxsplit=2)
                    for line in f.readlines()
                    if line.startswith("$")
                ]
            configurations = {s: {} for s in self.solvers}
            for solver, config_key, config in configuration_lines[1:]:  # Skip header
                configurations[solver][config_key] = ast.literal_eval(config.strip('"'))
        else:  # New PerformanceDataFrame
            # Initialize empty DataFrame
            run_ids = list(range(1, n_runs + 1))  # We count runs from 1
            # We always need objectives to maintain the dimensions
            if objectives is None:
                objectives = [PerformanceDataFrame.missing_objective]
            else:
                objectives = [str(o) for o in objectives]
            # We always need an instance to maintain the dimensions
            if instances is None:
                instances = [PerformanceDataFrame.missing_value]
            # We always need a solver to maintain the dimensions
            if solvers is None:
                solvers = [PerformanceDataFrame.missing_value]
            midx = pd.MultiIndex.from_product(
                [objectives, instances, run_ids],
                names=PerformanceDataFrame.multi_index_names,
            )
            # Create the multi-index column tuples
            if configurations is None:
                configurations = {
                    solver: {PerformanceDataFrame.default_configuration: {}}
                    for solver in solvers
                }
            column_tuples = []
            # We cannot use .from_product here, as config ids are per solver
            for solver in configurations.keys():
                for config_id in configurations[solver].keys():
                    column_tuples.extend(
                        [
                            (solver, config_id, PerformanceDataFrame.column_seed),
                            (solver, config_id, PerformanceDataFrame.column_value),
                        ]
                    )
            mcolumns = pd.MultiIndex.from_tuples(
                column_tuples,
                names=[
                    PerformanceDataFrame.column_solver,
                    PerformanceDataFrame.column_configuration,
                    PerformanceDataFrame.column_meta,
                ],
            )
            # Set dtype object to avoid inferring float for categorical objectives
            super().__init__(
                PerformanceDataFrame.missing_value,
                index=midx,
                columns=mcolumns,
                dtype="object",
            )
            self.csv_filepath = csv_filepath

        # Store configurations in the global attributes dictionary, see Pandas docs
        self.attrs = configurations

        if self.index.duplicated().any():  # Drop all duplicates except the last
            # NOTE: This is rather convoluted (but fast!) because we need to work
            # in place to maintain our type (PerformanceDataFrame)
            # Make the index levels into columns (in place)
            self.reset_index(inplace=True)
            # The first nlevels columns are the index columns created by
            # reset_index; drop duplicates in those columns
            idx_cols = self.columns[
                : len(PerformanceDataFrame.multi_index_names)
            ].tolist()
            self.drop_duplicates(subset=idx_cols, keep="last", inplace=True)
            self.set_index(idx_cols, inplace=True)  # Restore the MultiIndex (in place)
            self.index.rename(
                self.multi_index_names, inplace=True
            )  # Restore the level names

        # Sort the index to optimize lookup speed
        self.sort_index(axis=0, inplace=True)
        self.sort_index(axis=1, inplace=True)

        if csv_filepath and not self.csv_filepath.exists():  # New PerformanceDataFrame
            self.save_csv()
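
    # Usage sketch (illustrative, not executed): creating a fresh DataFrame.
    # The solver, instance and objective names and the path are hypothetical.
    #
    #   pdf = PerformanceDataFrame(
    #       Path("Output/performance_data.csv"),  # Created if it does not exist yet
    #       solvers=["SolverA", "SolverB"],
    #       objectives=["PAR10"],
    #       instances=["train_1.cnf", "train_2.cnf"],
    #       n_runs=3,
    #   )
    #   # Rows: (PAR10, instance, run 1..3); columns: (solver, "Default", Seed/Value)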

    # Properties

    @property
    def num_objectives(self: PerformanceDataFrame) -> int:
        """Retrieve the number of objectives in the DataFrame."""
        return self.index.get_level_values(0).unique().size

    @property
    def num_instances(self: PerformanceDataFrame) -> int:
        """Return the number of instances."""
        return self.index.get_level_values(1).unique().size

    @property
    def num_runs(self: PerformanceDataFrame) -> int:
        """Return the maximum number of runs of each instance."""
        return self.index.get_level_values(2).unique().size

    @property
    def num_solvers(self: PerformanceDataFrame) -> int:
        """Return the number of solvers."""
        return self.columns.get_level_values(0).unique().size

    @property
    def num_solver_configurations(self: PerformanceDataFrame) -> int:
        """Return the number of solver configurations."""
        return int(
            self.columns.get_level_values(  # Each config has a seed & value
                PerformanceDataFrame.column_configuration
            ).size
            / 2
        )

    @property
    def multi_objective(self: PerformanceDataFrame) -> bool:
        """Return whether the DataFrame is multi-objective."""
        return self.num_objectives > 1

    @property
    def solvers(self: PerformanceDataFrame) -> list[str]:
        """Return the solvers present as a list of strings."""
        # Do not return the nan solver, as it is not an actual solver
        return (
            self.columns.get_level_values(PerformanceDataFrame.column_solver)
            .dropna()
            .unique()
            .to_list()
        )

    @property
    def configuration_ids(self: PerformanceDataFrame) -> list[str]:
        """Return the list of configuration keys."""
        return (
            self.columns.get_level_values(PerformanceDataFrame.column_configuration)
            .unique()
            .to_list()
        )

    @property
    def configurations(self: PerformanceDataFrame) -> dict[str, dict[str, dict]]:
        """Return a dictionary (copy) containing the configurations for each solver."""
        return copy.deepcopy(self.attrs)  # Deepcopy to avoid mutation of the attribute

    @property
    def objective_names(self: PerformanceDataFrame) -> list[str]:
        """Return the objective names as a list of strings."""
        return self.index.get_level_values(0).unique().to_list()

    @property
    def objectives(self: PerformanceDataFrame) -> list[SparkleObjective]:
        """Return the objectives as a list of SparkleObjectives."""
        return [resolve_objective(o) for o in self.objective_names]

    @property
    def instances(self: PerformanceDataFrame) -> list[str]:
        """Return the instances as a list of strings."""
        return self.index.get_level_values(1).unique().to_list()

    @property
    def run_ids(self: PerformanceDataFrame) -> list[int]:
        """Return the run ids as a list of integers."""
        return self.index.get_level_values(2).unique().to_list()

    @property
    def has_missing_values(self: PerformanceDataFrame) -> bool:
        """Returns True if there are any missing values in the dataframe."""
        return (
            self.drop(
                PerformanceDataFrame.column_seed,
                level=PerformanceDataFrame.column_meta,
                axis=1,
            )
            .isnull()
            .any()
            .any()
        )

    def is_missing(
        self: PerformanceDataFrame,
        solver: str,
        instance: str,
    ) -> bool:
        """Check whether a solver/instance combination has missing values."""
        return (
            self.xs(solver, axis=1)
            .xs(instance, axis=0, level=PerformanceDataFrame.index_instance)
            .drop(
                PerformanceDataFrame.column_seed,
                level=PerformanceDataFrame.column_meta,
                axis=1,
            )
            .isnull()
            .any()
            .any()
        )

    def verify_objective(self: PerformanceDataFrame, objective: str) -> str:
        """Method to check whether the specified objective is valid.

        Users are allowed to index the dataframe without specifying all dimensions.
        However, when dealing with multiple objectives this is not allowed, and this
        is verified here. If we have only one objective, it is returned. Otherwise,
        the objective specified by the user is returned.

        Args:
            objective: The objective given by the user
        """
        if objective is None:
            if self.multi_objective:
                raise ValueError("Error: MO Data, but objective not specified.")
            elif self.num_objectives == 1:
                return self.objective_names[0]
            else:
                return PerformanceDataFrame.missing_objective
        return objective

    def verify_run_id(self: PerformanceDataFrame, run_id: int) -> int:
        """Method to check whether a run id is valid.

        Similar to verify_objective, but here we check the dimensionality of runs.

        Args:
            run_id: The run as specified by the user.
        """
        if run_id is None:
            if self.num_runs > 1:
                raise ValueError(
                    "Error: Multiple run performance data, but run not specified"
                )
            else:
                run_id = self.run_ids[0]
        return run_id

    def verify_indexing(
        self: PerformanceDataFrame, objective: str, run_id: int
    ) -> tuple[str, int]:
        """Method to check whether data indexing is correct.

        Users are allowed to use the PerformanceDataFrame without the second and
        fourth dimension (Objective and Run respectively) in case they only
        have one objective or only do one run. This method adjusts the indexing for
        those cases accordingly.

        Args:
            objective: The given objective name
            run_id: The given run index

        Returns:
            A tuple representing the (possibly adjusted) Objective and Run index.
        """
        objective = self.verify_objective(objective)
        run_id = self.verify_run_id(run_id)
        return objective, run_id
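
    # Behaviour sketch (illustrative): with single-objective, single-run data,
    # verify_indexing fills in both omitted levels; with multiple objectives
    # (or runs) and no explicit value, the verify_* methods raise a ValueError.
    #
    #   objective, run = pdf.verify_indexing(None, None)  # e.g. ("PAR10", 1)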

    # Getters and Setters

    def add_solver(
        self: PerformanceDataFrame,
        solver_name: str,
        configurations: list[tuple[str, dict]] = None,
        initial_value: float | list[str | float] = None,
    ) -> None:
        """Add a new solver to the dataframe. Initializes value to None by default.

        Args:
            solver_name: The name of the solver to be added.
            configurations: A list of (configuration key, configuration) pairs
                for the solver.
            initial_value: The value assigned for each index of the new solver.
                If not None, must match the index dimension (n_obj * n_inst * n_runs).
        """
        if solver_name in self.solvers:
            print(
                f"WARNING: Tried adding already existing solver {solver_name} to "
                f"Performance DataFrame: {self.csv_filepath}"
            )
            return
        if not isinstance(initial_value, list):  # Single value
            initial_value = [[initial_value, initial_value]]
        if configurations is None:
            configurations = [(PerformanceDataFrame.default_configuration, {})]
        self.attrs[solver_name] = {}
        for (config_key, config), (value, seed) in itertools.product(
            configurations, initial_value
        ):
            self[(solver_name, config_key, PerformanceDataFrame.column_seed)] = seed
            self[(solver_name, config_key, PerformanceDataFrame.column_value)] = value
            self.attrs[solver_name][config_key] = config
        if self.num_solvers == 2:  # Remove the nan solver
            for solver in self.solvers:
                if str(solver) == str(PerformanceDataFrame.missing_value):
                    self.remove_solver(solver)
                    break
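
    # Usage sketch (illustrative; ids and parameters are hypothetical):
    #
    #   pdf.add_solver(
    #       "SolverC",
    #       configurations=[("config_1", {"alpha": 0.5}), ("config_2", {"alpha": 0.9})],
    #   )
    #   # Adds (SolverC, config_*, Seed/Value) columns, initialised to None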

    def add_configuration(
        self: PerformanceDataFrame,
        solver: str,
        configuration_id: str | list[str],
        configuration: dict[str, Any] | list[dict[str, Any]] = None,
    ) -> None:
        """Add new configurations for a solver to the dataframe.

        If the key already exists, update its value.

        Args:
            solver: The solver for which the configuration is added.
            configuration_id: The name of the configuration to be added.
            configuration: The configuration to be added.
        """
        if not isinstance(configuration_id, list):
            configuration_id = [configuration_id]
        if not isinstance(configuration, list):
            configuration = [configuration]
        for config_id, config in zip(configuration_id, configuration):
            if config_id not in self.get_configurations(solver):
                self[(solver, config_id, PerformanceDataFrame.column_value)] = None
                self[(solver, config_id, PerformanceDataFrame.column_seed)] = None
            self.attrs[solver][config_id] = config
        # Sort the index to optimize lookup speed
        self.sort_index(axis=1, inplace=True)
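
    # Usage sketch (illustrative): registering a configuration found by a tuner.
    #
    #   pdf.add_configuration("SolverC", "config_3", {"alpha": 0.7, "beta": 2})
    #   # New ids get fresh (Seed/Value) columns; existing ids only update attrs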

    def add_objective(
        self: PerformanceDataFrame, objective_name: str, initial_value: float = None
    ) -> None:
        """Add an objective to the DataFrame."""
        # Avoid treating legitimate falsy values (e.g. 0.0) as missing
        initial_value = self.missing_value if initial_value is None else initial_value
        if objective_name in self.objective_names:
            print(
                f"WARNING: Tried adding already existing objective {objective_name} "
                f"to Performance DataFrame: {self.csv_filepath}"
            )
            return
        for instance, run in itertools.product(self.instances, self.run_ids):
            self.loc[(objective_name, instance, run)] = initial_value
        self.sort_index(axis=0, inplace=True)

    def add_instance(
        self: PerformanceDataFrame,
        instance_name: str,
        initial_values: Any | list[Any] = None,
    ) -> None:
        """Add an instance to the DataFrame.

        Args:
            instance_name: The name of the instance to be added.
            initial_values: The values assigned for each index of the new instance.
                If a list, must match the column dimension (Value, Seed, Configuration).
        """
        # Avoid treating legitimate falsy values (e.g. 0.0) as missing
        initial_values = (
            self.missing_value if initial_values is None else initial_values
        )
        if not isinstance(initial_values, list):
            initial_values = (
                [initial_values]
                * 2  # Value and Seed per target column
                * self.num_solver_configurations
            )
        elif len(initial_values) == len(PerformanceDataFrame.multi_column_names):
            initial_values = initial_values * self.num_solvers

        if instance_name in self.instances:
            print(
                f"WARNING: Tried adding already existing instance {instance_name} "
                f"to Performance DataFrame: {self.csv_filepath}"
            )
            return
        # Add rows for all combinations
        for objective, run in itertools.product(self.objective_names, self.run_ids):
            self.loc[(objective, instance_name, run)] = initial_values
        if self.num_instances == 2:  # Remove the nan instance
            for instance in self.instances:
                if not isinstance(instance, str) and math.isnan(instance):
                    self.remove_instances(instance)
                    break
        # Sort the index to optimize lookup speed
        self.sort_index(axis=0, inplace=True)

    def add_runs(
        self: PerformanceDataFrame,
        num_extra_runs: int,
        instance_names: list[str] = None,
        initial_values: Any | list[Any] = None,
    ) -> None:
        """Add runs to the DataFrame.

        Args:
            num_extra_runs: The number of runs to be added.
            instance_names: The instances for which runs are to be added.
                By default None, which means runs are added to all instances.
            initial_values: The initial value for each objective of each new run.
                If a list, needs to have a value for Value, Seed and Configuration.
        """
        # Avoid treating legitimate falsy values (e.g. 0.0) as missing
        initial_values = (
            self.missing_value if initial_values is None else initial_values
        )
        if not isinstance(initial_values, list):
            initial_values = [initial_values] * self.num_solvers * 2  # Value and Seed
        elif len(initial_values) == 2:  # Value and seed provided
            initial_values = initial_values * self.num_solvers
        instance_names = self.instances if instance_names is None else instance_names
        for objective, instance in itertools.product(
            self.objective_names, instance_names
        ):
            index_runs_start = len(self.loc[(objective, instance)]) + 1
            for run in range(index_runs_start, index_runs_start + num_extra_runs):
                self.loc[(objective, instance, run)] = initial_values
            # Sort the index to optimize lookup speed
            # NOTE: It would be better to do this at the end, but that results in
            # PerformanceWarning: indexing past lexsort depth may impact performance.
            self.sort_index(axis=0, inplace=True)

    def get_configurations(self: PerformanceDataFrame, solver_name: str) -> list[str]:
        """Return the list of configuration keys for a solver."""
        return list(
            self[solver_name]
            .columns.get_level_values(PerformanceDataFrame.column_configuration)
            .unique()
        )

    def get_full_configuration(
        self: PerformanceDataFrame, solver: str, configuration_id: str | list[str]
    ) -> dict | list[dict]:
        """Return the actual configuration associated with the configuration key."""
        if isinstance(configuration_id, str):
            return self.attrs[solver][configuration_id]
        return [self.attrs[solver][cid] for cid in configuration_id]

    def remove_solver(self: PerformanceDataFrame, solvers: str | list[str]) -> None:
        """Drop one or more solvers from the DataFrame."""
        if not solvers:  # Guard against an empty list, which would add a nan solver
            return
        solvers = [solvers] if isinstance(solvers, str) else solvers
        # To make sure objectives / runs are saved when no solvers are present
        if self.num_solvers == 1:  # This would preferably be done after removing
            for field in PerformanceDataFrame.multi_column_value:
                self[
                    PerformanceDataFrame.missing_value,
                    PerformanceDataFrame.missing_value,
                    field,
                ] = PerformanceDataFrame.missing_value
        self.drop(columns=solvers, level=0, axis=1, inplace=True)
        for solver in solvers:
            del self.attrs[solver]

    def remove_configuration(
        self: PerformanceDataFrame, solver: str, configuration: str | list[str]
    ) -> None:
        """Drop one or more configurations from the DataFrame."""
        if isinstance(configuration, str):
            configuration = [configuration]
        for config in configuration:
            self.drop((solver, config), axis=1, inplace=True)
            del self.attrs[solver][config]
        # Sort the index to optimize lookup speed
        self.sort_index(axis=1, inplace=True)

    def remove_objective(
        self: PerformanceDataFrame, objectives: str | list[str]
    ) -> None:
        """Remove objectives from the DataFrame."""
        if len(self.objectives) < 2:
            raise Exception("Cannot remove last objective from PerformanceDataFrame")
        self.drop(
            objectives,
            axis=0,
            level=PerformanceDataFrame.index_objective,
            inplace=True,
        )

    def remove_instances(self: PerformanceDataFrame, instances: str | list[str]) -> None:
        """Drop instances from the DataFrame."""
        # To make sure objectives / runs are saved when no instances are present
        num_instances = len(instances) if isinstance(instances, list) else 1
        if self.num_instances - num_instances == 0:
            for objective, run in itertools.product(self.objective_names, self.run_ids):
                self.loc[(objective, PerformanceDataFrame.missing_value, run)] = (
                    PerformanceDataFrame.missing_value
                )
        self.drop(
            instances, axis=0, level=PerformanceDataFrame.index_instance, inplace=True
        )
        # Sort the index to optimize lookup speed
        self.sort_index(axis=0, inplace=True)

    def remove_runs(
        self: PerformanceDataFrame,
        runs: int | list[int],
        instance_names: list[str] = None,
    ) -> None:
        """Drop one or more runs from the DataFrame.

        Args:
            runs: The run indices to be removed. If it is an int,
                the last n runs are removed. NOTE: If each instance has a different
                number of runs, the number of removed runs is not uniform.
            instance_names: The instances for which runs are to be removed.
                By default None, which means runs are removed from all instances.
        """
        instance_names = self.instances if instance_names is None else instance_names
        runs = (
            list(range((self.num_runs + 1) - runs, (self.num_runs + 1)))
            if isinstance(runs, int)
            else runs
        )
        self.drop(runs, axis=0, level=PerformanceDataFrame.index_run, inplace=True)
        # Sort the index to optimize lookup speed
        self.sort_index(axis=0, inplace=True)

    def remove_empty_runs(self: PerformanceDataFrame) -> None:
        """Remove runs that contain no data, except for the first."""
        for row_index in self.index:
            if row_index[2] == 1:  # First run, never delete
                continue
            if self.loc[row_index].isna().all():
                self.drop(row_index, inplace=True)

    def filter_objective(self: PerformanceDataFrame, objective: str | list[str]) -> None:
        """Filter the DataFrame to a subset of objectives."""
        if isinstance(objective, str):
            objective = [objective]
        self.drop(
            list(set(self.objective_names) - set(objective)),
            axis=0,
            level=PerformanceDataFrame.index_objective,
            inplace=True,
        )

    def reset_value(
        self: PerformanceDataFrame,
        solver: str,
        instance: str,
        objective: str = None,
        run: int = None,
    ) -> None:
        """Reset a value in the dataframe."""
        # NOTE: objective and run are passed as keywords, since set_value takes
        # configuration as its fourth positional argument
        self.set_value(
            PerformanceDataFrame.missing_value,
            solver,
            instance,
            objective=objective,
            run=run,
        )

    def set_value(
        self: PerformanceDataFrame,
        value: float | str | list[float | str] | list[list[float | str]],
        solver: str | list[str],
        instance: str | list[str],
        configuration: str = None,
        objective: str | list[str] = None,
        run: int | list[int] = None,
        solver_fields: list[str] = ["Value"],
        append_write_csv: bool = False,
    ) -> None:
        """Setter method to assign a value to the DataFrame.

        Allows for setting the same value to multiple indices.

        Args:
            value: Value(s) to be assigned. If value is a list, the first dimension
                is the solver field, the second dimension is for assigning multiple
                different values. Must be the same shape as the target.
            solver: The solver(s) for which the value should be set.
                If solver is a list, multiple solvers are set. If None, all
                solvers are set.
            instance: The instance(s) for which the value should be set.
                If instance is a list, multiple instances are set. If None, all
                instances are set.
            configuration: The configuration(s) for which the value should be set.
                When left None, set for all configurations.
            objective: The objectives for which the value should be set.
                When left None, set for all objectives.
            run: The run index for which the value should be set.
                If left None, set for all runs.
            solver_fields: The level to which each value should be assigned.
                Defaults to ["Value"].
            append_write_csv: For concurrent writing to the PerformanceDataFrame.
                If True, the value is directly appended to the CSV file.
                This will create duplicate entries in the file, but these are combined
                when loading the file.
        """
        # Convert indices to slices for None values
        solver = slice(solver) if solver is None else solver
        configuration = slice(configuration) if configuration is None else configuration
        instance = slice(instance) if instance is None else instance
        objective = slice(objective) if objective is None else objective
        run = slice(run) if run is None else run
        # Convert column indices to slices for setting multiple columns
        value = [value] if not isinstance(value, list) else value
        # NOTE: We currently loop over the levels here, as it allows us to set the
        # same sequence of values to the indices
        for item, level in zip(value, solver_fields):
            self.loc[(objective, instance, run), (solver, configuration, level)] = item

        if append_write_csv:
            writeable = self.loc[(objective, instance, run), :]
            if isinstance(writeable, pd.Series):  # Single row, convert to pd.DataFrame
                writeable = self.loc[[(objective, instance, run)], :]
            # Append the new rows to the DataFrame CSV file
            writeable.to_csv(self.csv_filepath, mode="a", header=False)
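
    # Usage sketch (illustrative names): writing one result cell, including its seed.
    #
    #   pdf.set_value(
    #       [42.1, 1234],  # One entry per solver field, zipped with solver_fields
    #       solver="SolverC", instance="train_1.cnf",
    #       configuration="config_1", objective="PAR10", run=1,
    #       solver_fields=[PerformanceDataFrame.column_value,
    #                      PerformanceDataFrame.column_seed],
    #       append_write_csv=True,  # Also append the row to the CSV for concurrency
    #   )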

    def get_value(
        self: PerformanceDataFrame,
        solver: str | list[str] = None,
        instance: str | list[str] = None,
        configuration: str = None,
        objective: str = None,
        run: int = None,
        solver_fields: list[str] = ["Value"],
    ) -> float | str | list[Any]:
        """Index a value of the DataFrame and return it."""
        # Convert indices to slices for None values
        solver = slice(solver) if solver is None else solver
        configuration = slice(configuration) if configuration is None else configuration
        instance = slice(instance) if instance is None else instance
        objective = slice(objective) if objective is None else objective
        solver_fields = slice(solver_fields) if solver_fields is None else solver_fields
        run = slice(run) if run is None else run
        target = self.loc[
            (objective, instance, run), (solver, configuration, solver_fields)
        ].values
        # Reduce dimensions when relevant
        if len(target) > 0 and isinstance(target[0], np.ndarray) and len(target[0]) == 1:
            target = target.flatten()
        target = target.tolist()
        if len(target) == 1:
            return target[0]
        return target
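
    # Usage sketch (illustrative): single cells are returned unwrapped as scalars;
    # leaving configuration/run as None returns a list over all matching cells.
    #
    #   v = pdf.get_value("SolverC", "train_1.cnf", "config_1", "PAR10", run=1)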

    def get_instance_num_runs(self: PerformanceDataFrame, instance: str) -> int:
        """Return the number of runs for an instance."""
        # We assume each objective has the same index for Instance/Runs
        return len(self.loc[(self.objective_names[0], instance)].index)

    # Calculables

    def mean(
        self: PerformanceDataFrame,
        objective: str = None,
        solver: str = None,
        instance: str = None,
    ) -> float:
        """Return the mean value of a slice of the DataFrame."""
        objective = self.verify_objective(objective)
        subset = self.xs(objective, level=0)
        if solver is not None:
            subset = subset.xs(solver, axis=1, drop_level=False)
        if instance is not None:
            subset = subset.xs(instance, axis=0, drop_level=False)
        value = subset.astype(float).mean()
        if isinstance(value, pd.Series):
            return value.mean()
        return value

    def get_job_list(
        self: PerformanceDataFrame, rerun: bool = False
    ) -> list[tuple[str, str, str, int]]:
        """Return a list of performance computation jobs that are still to be done.

        Get a list of (solver, config, instance, run) tuples to run from the
        performance data. If rerun is False (default), get only the tuples that
        do not have a value; else (True) get all the tuples.

        Args:
            rerun: Boolean indicating if we want to rerun all jobs

        Returns:
            A list of (solver, config, instance, run) combinations
        """
        # Drop the seed, as we are looking for nan values, not seeds
        df = self.drop(
            PerformanceDataFrame.column_seed,
            axis=1,
            level=PerformanceDataFrame.column_meta,
        )
        df = df.droplevel(PerformanceDataFrame.column_meta, axis=1)
        if rerun:  # Return all combinations
            # Drop objective, not needed
            df = df.droplevel(PerformanceDataFrame.index_objective, axis=0)
            result = [
                tuple(column) + tuple(index)
                for column, index in itertools.product(df.columns, df.index)
            ]
        else:
            result = []
            for (solver, config), (objective, instance, run) in itertools.product(
                df.columns, df.index
            ):
                value = df.loc[(objective, instance, run), (solver, config)]
                if value is None or (
                    isinstance(value, (int, float)) and math.isnan(value)
                ):
                    result.append(tuple([solver, config, instance, run]))
        # Filter duplicates while keeping the order consistent
        result = list(dict.fromkeys(result))
        return result
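
    # Usage sketch (illustrative): dispatching the remaining (empty) jobs.
    #
    #   for solver, config_id, instance, run in pdf.get_job_list():
    #       ...  # Run the solver and store the result with set_value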

    def configuration_performance(
        self: PerformanceDataFrame,
        solver: str,
        configuration: str | list[str] = None,
        objective: str | SparkleObjective = None,
        instances: list[str] = None,
        per_instance: bool = False,
    ) -> tuple[str, float]:
        """Return the (best) configuration performance for the objective over the instances.

        Args:
            solver: The solver for which we evaluate the configuration
            configuration: The configuration (id) to evaluate
            objective: The objective for which we find the best value
            instances: The instances which should be selected for the evaluation
            per_instance: Whether to return the performance per instance,
                or aggregated.

        Returns:
            The (best) configuration id and its aggregated performance.
        """
        objective = self.verify_objective(objective)
        if isinstance(objective, str):
            objective = resolve_objective(objective)
        # Filter objective
        subdf = self.xs(objective.name, level=0, drop_level=True)
        # Filter solver
        subdf = subdf.xs(solver, axis=1, drop_level=True)
        # Drop the seed, then drop the meta level as it is no longer needed
        subdf = subdf.drop(
            PerformanceDataFrame.column_seed,
            axis=1,
            level=PerformanceDataFrame.column_meta,
        )
        subdf = subdf.droplevel(PerformanceDataFrame.column_meta, axis=1)
        # Ensure the objective is numeric
        subdf = subdf.astype(float)

        if instances:  # Filter instances
            subdf = subdf.loc[instances, :]
        if configuration:  # Filter configuration
            if not isinstance(configuration, list):
                configuration = [configuration]
            subdf = subdf.filter(configuration, axis=1)
        # Aggregate the runs
        subdf = subdf.groupby(PerformanceDataFrame.index_instance).agg(
            func=objective.run_aggregator.__name__
        )
        # Aggregate the instances
        sub_series = subdf.agg(func=objective.instance_aggregator.__name__)
        # Select the best configuration
        best_conf = sub_series.idxmin() if objective.minimise else sub_series.idxmax()
        if per_instance:  # Return a list of instance results
            return best_conf, subdf[best_conf].to_list()
        return best_conf, sub_series[best_conf]
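
    # Usage sketch (illustrative): best configuration of one solver on a subset.
    #
    #   config_id, performance = pdf.configuration_performance(
    #       "SolverC", objective="PAR10", instances=["train_1.cnf", "train_2.cnf"]
    #   )
    #   # With per_instance=True, the second element is a list of per-instance values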

    def best_configuration(
        self: PerformanceDataFrame,
        solver: str,
        objective: SparkleObjective = None,
        instances: list[str] = None,
    ) -> tuple[str, float]:
        """Return the best configuration for the given objective over the instances.

        Args:
            solver: The solver for which we determine the best configuration
            objective: The objective for which we calculate the best configuration
            instances: The instances which should be selected for the evaluation

        Returns:
            The best configuration id and its aggregated performance.
        """
        return self.configuration_performance(solver, None, objective, instances)

    def best_instance_performance(
        self: PerformanceDataFrame,
        objective: str | SparkleObjective = None,
        instances: list[str] = None,
        run_id: int = None,
        exclude_solvers: list[tuple[str, str]] = None,
    ) -> pd.Series:
        """Return the best performance for each instance in the portfolio.

        Args:
            objective: The objective for which we calculate the best performance
            instances: The instances which should be selected for the evaluation
            run_id: The run for which we calculate the best performance. If None,
                we consider all runs.
            exclude_solvers: List of (solver, config_id) to exclude in the calculation.

        Returns:
            The best performance for each instance in the portfolio.
        """
        objective = self.verify_objective(objective)
        if isinstance(objective, str):
            objective = resolve_objective(objective)
        subdf = self.drop(  # Drop Seed, not needed
            [PerformanceDataFrame.column_seed],
            axis=1,
            level=PerformanceDataFrame.column_meta,
        )
        subdf = subdf.xs(objective.name, level=0)  # Drop objective
        if exclude_solvers is not None:
            subdf = subdf.drop(exclude_solvers, axis=1)
        if instances is not None:
            subdf = subdf.loc[instances, :]
        if run_id is not None:
            run_id = self.verify_run_id(run_id)
            subdf = subdf.xs(run_id, level=1)
        else:
            # Drop the run level
            subdf = subdf.droplevel(level=1)
        # Ensure the objective is numeric
        subdf = subdf.astype(float)
        series = subdf.min(axis=1) if objective.minimise else subdf.max(axis=1)
        # Ensure we always return the best for each run
        series = series.sort_values(ascending=objective.minimise)
        return series.groupby(series.index).first().astype(float)

    def best_performance(
        self: PerformanceDataFrame,
        exclude_solvers: list[tuple[str, str]] = [],
        instances: list[str] = None,
        objective: str | SparkleObjective = None,
    ) -> float:
        """Return the overall best performance of the portfolio.

        Args:
            exclude_solvers: List of (solver, config_id) to exclude in the calculation.
                Defaults to an empty list.
            instances: The instances which should be selected for the evaluation.
                If None, use all instances.
            objective: The objective for which we calculate the best performance

        Returns:
            The aggregated best performance of the portfolio over all instances.
        """
        objective = self.verify_objective(objective)
        if isinstance(objective, str):
            objective = resolve_objective(objective)
        instance_best = self.best_instance_performance(
            objective, instances=instances, exclude_solvers=exclude_solvers
        ).to_numpy(dtype=float)
        return objective.instance_aggregator(instance_best)

    def schedule_performance(
        self: PerformanceDataFrame,
        schedule: dict[str, list[tuple[str, str, int]]],
        target_solver: str | tuple[str, str] = None,
        objective: str | SparkleObjective = None,
    ) -> list[float]:
        """Return the performance of a selection schedule on the portfolio.

        Args:
            schedule: Compute the best performance according to a selection schedule.
                A schedule is a dictionary of instances, with a schedule per instance,
                consisting of triples of solver, config_id and maximum runtime.
            target_solver: If not None, store the found values in this solver of the
                DataFrame.
            objective: The objective for which we calculate the best performance

        Returns:
            The performance of the schedule over the instances in the dictionary.
        """
        objective = self.verify_objective(objective)
        if isinstance(objective, str):
            objective = resolve_objective(objective)
        select = min if objective.minimise else max
        performances = [0.0] * len(schedule.keys())
        if not isinstance(target_solver, tuple):
            target_conf = PerformanceDataFrame.default_configuration
        else:
            target_solver, target_conf = target_solver
        if target_solver and target_solver not in self.solvers:
            self.add_solver(target_solver)
        for ix, instance in enumerate(schedule.keys()):
            for iy, (solver, config, max_runtime) in enumerate(schedule[instance]):
                performance = float(
                    self.get_value(solver, instance, config, objective.name)
                )
                if max_runtime is not None:  # We are dealing with runtime
                    performances[ix] += performance
                    if performance < max_runtime:
                        break  # Solver finished in time
                else:  # Quality, we take the best found performance
                    if iy == 0:  # First solver, set initial value
                        performances[ix] = performance
                        continue
                    performances[ix] = select(performances[ix], performance)
            if target_solver is not None:
                self.set_value(
                    performances[ix],
                    target_solver,
                    instance,
                    target_conf,
                    objective.name,
                )
        return performances
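
    # Illustrative schedule layout (hypothetical names): per instance an ordered
    # list of (solver, config_id, cutoff) triples; a None cutoff means quality data.
    #
    #   schedule = {
    #       "train_1.cnf": [("SolverA", "Default", 30), ("SolverB", "Default", 30)],
    #       "train_2.cnf": [("SolverA", "Default", 60)],
    #   }
    #   runtimes = pdf.schedule_performance(schedule, objective="PAR10")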

    def marginal_contribution(
        self: PerformanceDataFrame,
        objective: str | SparkleObjective = None,
        instances: list[str] = None,
        sort: bool = False,
    ) -> list[tuple[str, str, float, float]]:
        """Return the marginal contribution of each solver configuration on the instances.

        Args:
            objective: The objective for which we calculate the marginal contribution.
            instances: The instances which should be selected for the evaluation
            sort: Whether to sort the results afterwards

        Returns:
            The marginal contribution of each solver (configuration) as:
            [(solver, config_id, marginal_contribution,
              portfolio_best_performance_without_solver)]
        """
        output = []
        objective = self.verify_objective(objective)
        if isinstance(objective, str):
            objective = resolve_objective(objective)
        best_performance = self.best_performance(
            objective=objective, instances=instances
        )
        for solver in self.solvers:
            for config_id in self.get_configurations(solver):
                # By calculating the best performance excluding this Solver,
                # we can determine its relative impact on the portfolio.
                missing_solver_config_best = self.best_performance(
                    exclude_solvers=[(solver, config_id)],
                    instances=instances,
                    objective=objective,
                )
                # Now we need to see how much the portfolio's best performance
                # decreases without this solver.
                marginal_contribution = missing_solver_config_best / best_performance
                if missing_solver_config_best == best_performance:
                    # No change, no contribution
                    marginal_contribution = 0.0
                output.append(
                    (
                        solver,
                        config_id,
                        marginal_contribution,
                        missing_solver_config_best,
                    )
                )
        if sort:
            output.sort(key=lambda x: x[2], reverse=objective.minimise)
        return output
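
    # Worked sketch of the ratio: if the portfolio's best aggregated PAR10 is 50.0
    # and excluding (SolverA, "Default") degrades it to 75.0, the reported marginal
    # contribution is 75.0 / 50.0 = 1.5; if exclusion changes nothing, it is 0.0.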

    def get_solver_ranking(
        self: PerformanceDataFrame,
        objective: str | SparkleObjective = None,
        instances: list[str] = None,
    ) -> list[tuple[str, str, float]]:
        """Return a list with solvers ranked by average performance."""
        objective = self.verify_objective(objective)
        if isinstance(objective, str):
            objective = resolve_objective(objective)
        # Drop Seed
        sub_df = self.drop(
            [PerformanceDataFrame.column_seed],
            axis=1,
            level=PerformanceDataFrame.column_meta,
        )
        # Reduce objective
        sub_df: pd.DataFrame = sub_df.loc(axis=0)[objective.name, :, :]
        # Drop the Objective index level and the Meta column level
        sub_df = sub_df.droplevel(PerformanceDataFrame.index_objective).droplevel(
            PerformanceDataFrame.column_meta, axis=1
        )
        if instances is not None:  # Select instances
            sub_df = sub_df.loc(axis=0)[instances,]
        # Ensure data is numeric
        sub_df = sub_df.astype(float)
        # Aggregate runs
        sub_df = sub_df.groupby(PerformanceDataFrame.index_instance).agg(
            func=objective.run_aggregator.__name__
        )
        # Aggregate instances
        sub_series = sub_df.aggregate(func=objective.instance_aggregator.__name__)
        # Sort by objective
        sub_series.sort_values(ascending=objective.minimise, inplace=True)
        return [(index[0], index[1], sub_series[index]) for index in sub_series.index]

    def save_csv(self: PerformanceDataFrame, csv_filepath: Path = None) -> None:
        """Write a CSV to the given path.

        Args:
            csv_filepath: Path to the CSV file. Defaults to self.csv_filepath.
        """
        csv_filepath = self.csv_filepath if csv_filepath is None else csv_filepath
        self.to_csv(csv_filepath)
        # Append the configurations
        with csv_filepath.open("a") as fout:
            fout.write("\n$Solver,configuration_id,Configuration\n")
            for solver in self.solvers:
                for config_id in self.attrs[solver]:
                    configuration = self.attrs[solver][config_id]
                    fout.write(f"${solver},{config_id},{str(configuration)}\n")

    def clone(
        self: PerformanceDataFrame, csv_filepath: Path = None
    ) -> PerformanceDataFrame:
        """Create a copy of this object.

        Args:
            csv_filepath: The new filepath to use for saving the object to.
                If None, the clone will not be saved.
                Warning: If the original path is used, it could lead to data loss!
        """
        pd_copy = PerformanceDataFrame(
            csv_filepath=csv_filepath,
            solvers=self.solvers,
            configurations=self.configurations,
            objectives=self.objectives,
            instances=self.instances,
            n_runs=self.num_runs,
        )
        # Copy values cell by cell; the constructor already sorts both axes
        for column_index in self.columns:
            for index in self.index:
                pd_copy.at[index, column_index] = self.loc[index, column_index]
        return pd_copy

    def clean_csv(self: PerformanceDataFrame) -> None:
        """Set all values in the Performance Data to None."""
        self[:] = PerformanceDataFrame.missing_value
        self.save_csv()