# Source code for pgsi_analyzer.models.greenscore

"""
GreenScore calculation and metric normalization.

This module provides functions to calculate GreenScore, a composite metric
combining energy consumption, execution time, and carbon footprint with
configurable weights.
"""

import pandas as pd
from pathlib import Path
from typing import Union, Optional, Dict

# Value of the "methodology" column that marks a data point as measured
# (hardware); rows carrying any other value are counted as estimated.
METHODOLOGY_MEASURED = "hardware_rapl_linux"



# [docs]  (documentation-page link marker left over from extraction; not code)
def normalize_metrics(df: pd.DataFrame, output_path: Optional[Union[str, Path]] = None) -> pd.DataFrame:
    """
    Min-max normalize the method columns of every row (one row per algorithm).

    Each row is rescaled independently so that its smallest value becomes 0 and
    its largest becomes 1, which makes algorithms with very different absolute
    scales comparable. A row whose values are all equal is mapped to 0.
    Missing values (NaN) are ignored when locating the row minimum/maximum and
    remain NaN in the result.

    Args:
        df: DataFrame with one row per algorithm, an 'algorithm' column, and
            one numeric column per method. The input is not modified.
        output_path: Optional path to save the normalized DataFrame as CSV.
            Missing parent directories are created.

    Returns:
        New DataFrame with 'algorithm' as the first column followed by the
        normalized method columns (always floating point).

    Raises:
        KeyError: If ``df`` has no 'algorithm' column.

    Examples:
        >>> df = pd.DataFrame({
        ...     'algorithm': ['algo1', 'algo2'],
        ...     'method1': [100, 200],
        ...     'method2': [50, 150]
        ... })
        >>> normalized = normalize_metrics(df)
        >>> normalized['method1'].tolist()
        [1.0, 1.0]
    """
    # Looked up first so a missing column fails with KeyError: 'algorithm'
    algorithm_names = df['algorithm']

    # Everything except the label column is treated as a method metric
    metric_values = df.drop(columns='algorithm')

    # Row-wise extrema, computed once for the whole frame (NaN is skipped)
    row_min = metric_values.min(axis=1)
    row_span = metric_values.max(axis=1) - row_min

    # A zero span marks a constant row. Dividing by 1 instead keeps the
    # (value - min) zeros of that row and avoids a 0/0 -> NaN result.
    safe_span = row_span.mask(row_span == 0, 1)

    # True division always yields floats, so the output dtype does not depend
    # on whether the input happened to be integer-valued.
    normalized_df = metric_values.sub(row_min, axis=0).div(safe_span, axis=0)

    # Add back the algorithm column as the first column
    normalized_df.insert(0, 'algorithm', algorithm_names)

    # Save to file if output path provided
    if output_path is not None:
        # Path() accepts str and any os.PathLike, and is a no-op for Path
        output = Path(output_path)
        output.parent.mkdir(parents=True, exist_ok=True)
        normalized_df.to_csv(output, index=False)

    return normalized_df



def _methodology_counts_from_aggregated(aggregated_energy_paths: Dict[str, Path]) -> Dict[str, Dict[str, int]]:
    """
    Count measured vs. estimated data points per method.

    For every ``method -> csv path`` entry the CSV is read and its rows are
    split into those whose 'methodology' value equals METHODOLOGY_MEASURED
    (measured) and all remaining rows (estimated). A path that does not exist
    yields 0/0; a CSV without a 'methodology' column counts every row as
    estimated.

    Returns:
        Dict mapping each method to
        ``{"points_measured": int, "points_estimated": int}``.
    """

    def _tally(csv_path):
        # Returns (measured, estimated) for a single aggregated CSV
        csv_file = Path(csv_path)
        if not csv_file.exists():
            return 0, 0

        frame = pd.read_csv(csv_file)
        total_rows = len(frame)
        if "methodology" not in frame.columns:
            return 0, total_rows

        is_measured = frame["methodology"] == METHODOLOGY_MEASURED
        measured_rows = int(is_measured.sum())
        return measured_rows, total_rows - measured_rows

    summary = {}
    for method_name, csv_path in aggregated_energy_paths.items():
        measured_rows, estimated_rows = _tally(csv_path)
        summary[method_name] = {
            "points_measured": measured_rows,
            "points_estimated": estimated_rows,
        }
    return summary



# [docs]  (documentation-page link marker left over from extraction; not code)
def calculate_greenscore(
    energy_df: pd.DataFrame,
    time_df: pd.DataFrame,
    carbon_df: pd.DataFrame,
    alpha: float = 0.4,
    beta: float = 0.4,
    gamma: float = 0.2,
    output_path: Optional[Union[str, Path]] = None,
    aggregated_energy_paths: Optional[Dict[str, Union[str, Path]]] = None,
) -> pd.DataFrame:
    """
    Compute the GreenScore for each method by combining normalized
    energy, time, and carbon scores with weighted averaging.

    GreenScore = α·energy + β·carbon + γ·time

    Each input frame is normalized per algorithm with ``normalize_metrics``
    and then averaged per method column. Lower scores indicate better
    sustainability (lower energy, time, and carbon).

    Method names are taken from the energy columns. Time and carbon columns
    are matched to them by name when the names agree (carbon columns may carry
    a '_CO2e_g' suffix); otherwise they are matched by column position.

    Args:
        energy_df: Raw energy DataFrame (with 'algorithm' column).
        time_df: Raw time DataFrame (with 'algorithm' column).
        carbon_df: Raw carbon DataFrame (with 'algorithm' column).
        alpha: Weight for energy component (default: 0.4).
        beta: Weight for carbon component (default: 0.4).
        gamma: Weight for time component (default: 0.2).
        output_path: Optional path to save the final ranking CSV.
        aggregated_energy_paths: Optional dict method -> path to method's energy_aggregated.csv
                                used to fill points_measured / points_estimated in the output.

    Returns:
        DataFrame sorted by green score (ascending, lower is better):
        - 'method': Method name
        - 'energy_mean': Mean normalized energy
        - 'time_mean': Mean normalized time
        - 'carbon_mean': Mean normalized carbon
        - 'points_measured': Count of hardware-measured points (0 when aggregated_energy_paths is not given)
        - 'points_estimated': Count of estimated points (0 when aggregated_energy_paths is not given)
        - 'data_source_consistency': "Consistent" or "Inconsistent Data Source" (when method has both hardware and estimation)
        - 'green_score': Composite GreenScore

    Raises:
        ValueError: If the weights do not sum to 1.0 (NaN weights included), or
            if the time/carbon data has a different number of method columns
            than the energy data.

    Examples:
        >>> energy_df = pd.read_csv('energy.csv')
        >>> time_df = pd.read_csv('time.csv')
        >>> carbon_df = pd.read_csv('carbon.csv')
        >>> ranking = calculate_greenscore(energy_df, time_df, carbon_df)
    """
    # Validate weights sum to 1.0. Written as `not (... <= tol)` so that NaN
    # weights, for which every comparison is False, are rejected as well.
    if not abs(alpha + beta + gamma - 1.0) <= 1e-6:
        raise ValueError(f"Weights must sum to 1.0, got: alpha={alpha}, beta={beta}, gamma={gamma}")

    # Step 1: Normalize each DataFrame
    energy_norm = normalize_metrics(energy_df)
    time_norm = normalize_metrics(time_df)
    carbon_norm = normalize_metrics(carbon_df)

    # Step 2: Drop algorithm column and compute column-wise means
    energy_mean = energy_norm.drop(columns=['algorithm']).mean()
    time_mean = time_norm.drop(columns=['algorithm']).mean()
    carbon_mean = carbon_norm.drop(columns=['algorithm']).mean()

    # Step 3: Method names from energy columns (already plain: cpython, py_compile, etc.)
    method_names = energy_mean.index

    # Step 3b: Put time/carbon means in the same order as the energy methods so
    # a different column order cannot attach a value to the wrong method.
    time_mean = _align_means_to_methods(time_mean, method_names, "time")
    carbon_mean = _align_means_to_methods(carbon_mean, method_names, "carbon", suffix="_CO2e_g")

    # Step 4: Combine into a single DataFrame
    mean_df = pd.DataFrame({
        'method': method_names,
        'energy_mean': energy_mean.values,
        'time_mean': time_mean.values,
        'carbon_mean': carbon_mean.values
    })

    # Step 4b: Add methodology summary (points_measured vs points_estimated)
    if aggregated_energy_paths:
        paths_as_path = {k: Path(v) for k, v in aggregated_energy_paths.items()}
        counts = _methodology_counts_from_aggregated(paths_as_path)
        mean_df["points_measured"] = mean_df["method"].map(lambda m: counts.get(m, {}).get("points_measured", 0))
        mean_df["points_estimated"] = mean_df["method"].map(lambda m: counts.get(m, {}).get("points_estimated", 0))
    else:
        mean_df["points_measured"] = 0
        mean_df["points_estimated"] = 0

    # Step 4c: Methodology consistency — flag "Inconsistent Data Source" when a
    # method has both hardware and estimation. A boolean mask (rather than a
    # row-wise apply) also works when there are no methods at all.
    has_both_sources = (mean_df["points_measured"] > 0) & (mean_df["points_estimated"] > 0)
    mean_df["data_source_consistency"] = has_both_sources.map(
        {True: "Inconsistent Data Source", False: "Consistent"}
    )

    # Step 5: Compute GreenScore = α·energy + β·carbon + γ·time
    mean_df['green_score'] = (
        alpha * mean_df['energy_mean'] +
        beta * mean_df['carbon_mean'] +
        gamma * mean_df['time_mean']
    )

    # Step 6: Sort methods by green score (lower is better). A stable sort
    # keeps tied methods in their input order, so the ranking is deterministic.
    green_score_df = mean_df.sort_values(by='green_score', kind='mergesort').reset_index(drop=True)

    # Step 7: Save to file if output path provided
    if output_path is not None:
        # Path() accepts str and any os.PathLike, and is a no-op for Path
        output = Path(output_path)
        output.parent.mkdir(parents=True, exist_ok=True)
        green_score_df.to_csv(output, index=False)

    return green_score_df


def _align_means_to_methods(
    metric_mean: pd.Series,
    method_names: pd.Index,
    label: str,
    suffix: str = "",
) -> pd.Series:
    """Order per-method means like ``method_names``, matching by name when possible.

    Args:
        metric_mean: Per-method means indexed by the metric's column names.
        method_names: Reference method names (the energy columns).
        label: Metric name used in the error message.
        suffix: Optional column-name suffix to strip before comparing names.

    Returns:
        ``metric_mean`` reordered to follow ``method_names`` when both carry
        the same unique names; otherwise ``metric_mean`` unchanged, so the
        caller falls back to matching by position.

    Raises:
        ValueError: If the number of methods differs from ``method_names``.
    """
    if len(metric_mean) != len(method_names):
        raise ValueError(
            f"{label} data has {len(metric_mean)} method columns, "
            f"but energy data has {len(method_names)}"
        )

    def _base_name(column):
        # Strip the metric-specific suffix so names are comparable to energy's
        if suffix and isinstance(column, str) and column.endswith(suffix):
            return column[:-len(suffix)]
        return column

    renamed = metric_mean.rename(index=_base_name)
    names_match = (
        method_names.is_unique
        and renamed.index.is_unique
        and set(renamed.index) == set(method_names)
    )
    if names_match:
        return renamed.reindex(method_names)

    # Names cannot be matched one-to-one: keep the positional order
    return metric_mean