Source code for pgsi_analyzer.models.combination

"""
Combine energy and time results from multiple execution methods.

This module provides functions to merge aggregated results from different
execution methods (e.g., CPython, PyPy, Cython) into comparison tables.
"""

import pandas as pd
from pathlib import Path
from typing import Union, List
from collections import defaultdict


def extract_algorithm_name(full_name: str, method_name: str = "") -> str:
    """
    Extract algorithm name from full filename.

    When method_name is provided (e.g. from the parent directory) and
    full_name ends with '_' + method_name, that suffix is removed, so that
    e.g. 'nbody_py_compile' with method 'py_compile' yields 'nbody' instead
    of 'nbody_py'. Otherwise everything after the last underscore is dropped.
    A name that contains no underscore has nothing to strip and is returned
    unchanged.

    Args:
        full_name: Full filename (e.g., 'nbody_py_compile', 'nbody_cpython')
        method_name: Method name (e.g., 'py_compile', 'cpython'). If provided
            and full_name ends with '_' + method_name, that suffix is removed.

    Returns:
        Algorithm name (e.g., 'nbody')

    Examples:
        >>> extract_algorithm_name('nbody_py_compile', 'py_compile')
        'nbody'
        >>> extract_algorithm_name('nbody_cpython', 'cpython')
        'nbody'
        >>> extract_algorithm_name('bubble_sort_cpython')
        'bubble_sort'
        >>> extract_algorithm_name('nbody')
        'nbody'
    """
    suffix = "_" + method_name
    if method_name and full_name.endswith(suffix):
        # rstrip also removes extra separators, e.g. 'nbody__cpython' -> 'nbody'
        return full_name[: -len(suffix)].rstrip("_")

    head, separator, _ = full_name.rpartition("_")
    # Without any underscore there is no method suffix to drop; returning
    # `head` here would turn the whole name into an empty string.
    return head if separator else full_name
def combine_energy_results(
    file_paths: List[Union[str, Path]],
    output_path: Union[str, Path]
) -> pd.DataFrame:
    """
    Merge energy results from multiple execution methods.

    Combines aggregated energy results from different methods (e.g., CPython,
    PyPy) into a single comparison table with algorithms as rows and methods
    as columns. The table is written to output_path as CSV (parent
    directories are created if needed) and also returned.

    Rows whose 'filename' cell is not a string (e.g. a blank cell) or whose
    energy value cannot be converted to float are skipped.

    Args:
        file_paths: List of paths to aggregated energy CSV files. Each file
            must contain 'filename' and 'average_package (uJ)' columns.
            Method name is extracted from the parent directory name.
        output_path: Path to save the combined results CSV.

    Returns:
        DataFrame with:
            - 'algorithm' column
            - One column per method with average energy values; an empty
              string marks an algorithm that has no value for that method

    Raises:
        FileNotFoundError: If one of the input paths does not exist.
        ValueError: If a CSV lacks a required column, or if no usable row
            was found in any input file.

    Examples:
        >>> paths = [
        ...     'cpython/energy_avg.csv',
        ...     'pypy/energy_avg.csv'
        ... ]
        >>> df = combine_energy_results(paths, 'energy_com.csv')  # doctest: +SKIP
    """
    value_column = 'average_package (uJ)'
    energy_data = defaultdict(dict)  # {algorithm: {method: value}}

    for file_path in file_paths:
        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"File not found: {path}")

        # The directory that holds the CSV names the execution method
        method_name = path.parent.name

        df = pd.read_csv(path)
        if 'filename' not in df.columns or value_column not in df.columns:
            raise ValueError(
                f"CSV must contain 'filename' and 'average_package (uJ)' columns: {path}"
            )

        # Walk the two needed columns directly instead of building a Series
        # per row with iterrows()
        for full_filename, raw_value in zip(df['filename'], df[value_column]):
            # A blank filename cell is read as NaN (a float); it cannot be
            # mapped to an algorithm, so skip the row instead of crashing
            if not isinstance(full_filename, str):
                continue
            try:
                avg_energy = float(raw_value)
            except (TypeError, ValueError):
                continue
            algorithm = extract_algorithm_name(full_filename, method_name)
            energy_data[algorithm][method_name] = avg_energy

    if not energy_data:
        raise ValueError("No valid energy data found in input files")

    # Every method seen in any file becomes a column, in sorted order
    all_methods = sorted(
        {method for algo_data in energy_data.values() for method in algo_data}
    )

    rows = []
    for algorithm in sorted(energy_data):
        row = {'algorithm': algorithm}
        for method in all_methods:
            row[method] = energy_data[algorithm].get(method, '')
        rows.append(row)

    result_df = pd.DataFrame(rows)

    output = Path(output_path)
    output.parent.mkdir(parents=True, exist_ok=True)
    result_df.to_csv(output, index=False)

    return result_df
def combine_time_results(
    file_paths: List[Union[str, Path]],
    output_path: Union[str, Path]
) -> pd.DataFrame:
    """
    Merge execution time results from multiple execution methods.

    Combines aggregated time results from different methods (e.g., CPython,
    PyPy) into a single comparison table with algorithms as rows and methods
    as columns. The table is written to output_path as CSV (parent
    directories are created if needed) and also returned.

    Rows whose 'filename' cell is not a string (e.g. a blank cell) or whose
    time value cannot be converted to float are skipped.

    Args:
        file_paths: List of paths to aggregated time CSV files. Each file
            must contain 'filename' and 'execution_time (s)' columns.
            Method name is extracted from the parent directory name.
        output_path: Path to save the combined results CSV.

    Returns:
        DataFrame with:
            - 'algorithm' column
            - One column per method with average execution time values; an
              empty string marks an algorithm that has no value for that
              method

    Raises:
        FileNotFoundError: If one of the input paths does not exist.
        ValueError: If a CSV lacks a required column, or if no usable row
            was found in any input file.

    Examples:
        >>> paths = [
        ...     'cpython/time_avg.csv',
        ...     'pypy/time_avg.csv'
        ... ]
        >>> df = combine_time_results(paths, 'time_com.csv')  # doctest: +SKIP
    """
    value_column = 'execution_time (s)'
    time_data = defaultdict(dict)  # {algorithm: {method: time}}

    for file_path in file_paths:
        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"File not found: {path}")

        # The directory that holds the CSV names the execution method
        method_name = path.parent.name

        df = pd.read_csv(path)
        if 'filename' not in df.columns or value_column not in df.columns:
            raise ValueError(
                f"CSV must contain 'filename' and 'execution_time (s)' columns: {path}"
            )

        # Walk the two needed columns directly instead of building a Series
        # per row with iterrows()
        for full_filename, raw_value in zip(df['filename'], df[value_column]):
            # A blank filename cell is read as NaN (a float); it cannot be
            # mapped to an algorithm, so skip the row instead of crashing
            if not isinstance(full_filename, str):
                continue
            try:
                exec_time = float(raw_value)
            except (TypeError, ValueError):
                continue
            algorithm = extract_algorithm_name(full_filename, method_name)
            time_data[algorithm][method_name] = exec_time

    if not time_data:
        raise ValueError("No valid time data found in input files")

    # Every method seen in any file becomes a column, in sorted order
    all_methods = sorted(
        {method for algo_data in time_data.values() for method in algo_data}
    )

    rows = []
    for algorithm in sorted(time_data):
        row = {'algorithm': algorithm}
        for method in all_methods:
            row[method] = time_data[algorithm].get(method, '')
        rows.append(row)

    result_df = pd.DataFrame(rows)

    output = Path(output_path)
    output.parent.mkdir(parents=True, exist_ok=True)
    result_df.to_csv(output, index=False)

    return result_df