diff --git a/.ci/scripts/analyze_benchmark_stability.py b/.ci/scripts/analyze_benchmark_stability.py new file mode 100644 index 00000000000..47f984b7ce3 --- /dev/null +++ b/.ci/scripts/analyze_benchmark_stability.py @@ -0,0 +1,1523 @@ +import argparse +import os +import re + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from tabulate import tabulate + + +def print_section_header(title): + """Print a clearly visible section header to stdout""" + print("\n\n" + "=" * 100) + print(f"===== {title} ".ljust(99, "=")) + print("=" * 100 + "\n") + + +def normalize_tab_name(name): + """Normalize tab name for better matching""" + # Convert to lowercase and remove spaces + return name.lower().replace(" ", "") + + +def parse_model_device(sheet_name): + """Extract model and device from sheet name using the 'model+device' pattern""" + parts = sheet_name.split("+", 1) + if len(parts) < 2: + return sheet_name, "Unknown" + return parts[0], parts[1] + + +def extract_model_device_os(sheet_name): + """ + Extract model, device, and OS from sheet name + Format expected: model+device_osname + Returns: (model, device_base, os_version) + """ + model, device_full = parse_model_device(sheet_name) + + # Use regex to separate device base name from OS version + # Pattern looks for device name followed by underscore or android/ios + match = re.match(r"(.*?)(android|ios|_)(.*)", device_full, re.IGNORECASE) + + if match: + device_base = match.group(1).rstrip("_") + os_name = match.group(2) + os_version = match.group(3) + return model, device_base, f"{os_name}{os_version}" + else: + # If no OS version found, return the device as is with empty OS + return model, device_full, "" + + +def is_matching_dataset(primary_sheet, reference_sheet): + """ + Check if two datasets match for comparison based on model and device + Allows different OS versions for the same device + """ + primary_model, primary_device, primary_os = extract_model_device_os(primary_sheet) + reference_model, reference_device, reference_os = extract_model_device_os( + reference_sheet + ) + + # Model must match exactly + if primary_model != reference_model: + return False + + # Device base name must match exactly + if primary_device != reference_device: + return False + + # If we get here, model and device base match, so it's a valid comparison + # even if OS versions differ + return True + + +def analyze_latency_stability( # noqa: C901 + primary_file, reference_file=None, output_dir="stability_analysis_results" +): + """ + Analyze latency stability metrics from benchmark data in Excel files. 
+ + Parameters: + ----------- + primary_file : str + Path to the Excel file containing primary (private) benchmark data + reference_file : str, optional + Path to the Excel file containing reference (public) benchmark data + output_dir : str + Directory to save output files + """ + print(f"Analyzing latency stability from primary file: {primary_file}") + if reference_file: + print(f"Using reference file for comparison: {reference_file}") + + # Create output directory if it doesn't exist + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + # Load primary datasets + print_section_header("LOADING PRIMARY DATASETS (Private)") + primary_datasets = {} + primary_xls = pd.ExcelFile(primary_file) + + for sheet in primary_xls.sheet_names: + print(f"Loading dataset: {sheet}") + df = pd.read_excel(primary_xls, sheet_name=sheet) + model, device = parse_model_device(sheet) + + # Check if required columns exist + required_cols = ["InferenceTime", "Date"] + if "trimmean_inference_latency(ms)" in df.columns: + trimmed_col = "trimmean_inference_latency(ms)" + required_cols.append(trimmed_col) + else: + trimmed_col = None + + if "TPS" in df.columns: + tps_col = "TPS" + required_cols.append(tps_col) + else: + tps_col = None + + # Skip sheets without required columns + if not all(col in df.columns for col in required_cols): + print(f" Skipping {sheet}: Missing required columns") + continue + + # Convert Date to datetime + df["Date"] = pd.to_datetime(df["Date"]) + + # Calculate stability metrics + metrics = calculate_stability_metrics(df, "InferenceTime", trimmed_col, tps_col) + + primary_datasets[sheet] = { + "df": df, + "metrics": metrics, + "model": model, + "device": device, + "sheet_name": sheet, + } + + # Load reference datasets if provided + reference_datasets = {} + if reference_file: + print_section_header("LOADING REFERENCE DATASETS (Public)") + reference_xls = pd.ExcelFile(reference_file) + + for sheet in reference_xls.sheet_names: + print(f"Loading reference dataset: {sheet}") + df = pd.read_excel(reference_xls, sheet_name=sheet) + model, device = parse_model_device(sheet) + + # Check if required columns exist + required_cols = ["InferenceTime", "Date"] + if "trimmean_inference_latency(ms)" in df.columns: + trimmed_col = "trimmean_inference_latency(ms)" + required_cols.append(trimmed_col) + else: + trimmed_col = None + + if "TPS" in df.columns: + tps_col = "TPS" + required_cols.append(tps_col) + else: + tps_col = None + + # Skip sheets without required columns + if not all(col in df.columns for col in required_cols): + print(f" Skipping reference {sheet}: Missing required columns") + continue + + # Convert Date to datetime + df["Date"] = pd.to_datetime(df["Date"]) + + # Calculate stability metrics + metrics = calculate_stability_metrics( + df, "InferenceTime", trimmed_col, tps_col + ) + + reference_datasets[sheet] = { + "df": df, + "metrics": metrics, + "model": model, + "device": device, + "sheet_name": sheet, + } + + # Process primary datasets + print_section_header("ANALYZING PRIMARY DATASETS") + for sheet, info in primary_datasets.items(): + # Generate dataset report + generate_dataset_report( + sheet, + info["model"], + info["device"], + "Primary", + info["df"], + info["metrics"], + output_dir, + ) + + # Generate time series plot + if len(info["df"]) > 5: # Only create plot if enough data points + generate_time_series_plot(sheet, info["df"], output_dir, "Primary") + + # Process reference datasets if provided + if reference_file: + print_section_header("ANALYZING REFERENCE 
DATASETS") + for sheet, info in reference_datasets.items(): + # Generate dataset report + generate_dataset_report( + sheet, + info["model"], + info["device"], + "Reference", + info["df"], + info["metrics"], + output_dir, + ) + + # Generate time series plot + if len(info["df"]) > 5: # Only create plot if enough data points + generate_time_series_plot(sheet, info["df"], output_dir, "Reference") + + # Generate comparison reports for matching datasets + if reference_file: + print_section_header("PRIVATE VS PUBLIC STABILITY COMPARISON") + matches_found = False + + for primary_sheet, primary_info in primary_datasets.items(): + found_match = False + + for ref_sheet, ref_info in reference_datasets.items(): + if is_matching_dataset(primary_sheet, ref_sheet): + # Found a match + print( + f"Matched: {primary_sheet} (Private) with {ref_sheet} (Public)" + ) + generate_comparison_report( + primary_sheet, + ref_sheet, + primary_info["model"], + primary_info["device"], + ref_info["device"], + primary_info["metrics"], + ref_info["metrics"], + output_dir, + ) + found_match = True + matches_found = True + break + + if not found_match: + print(f"Warning: No matching reference dataset for {primary_sheet}") + + if not matches_found: + print("No matching datasets found between primary and reference files.") + + # Generate intra-primary summary (comparing across different models/devices) + print_section_header("INTRA-PRIMARY STABILITY COMPARISON") + generate_intra_primary_summary(primary_datasets, output_dir) + + # Generate summary report for all datasets + print_section_header("COMPREHENSIVE STABILITY SUMMARY") + generate_summary_report( + primary_datasets, reference_datasets if reference_file else None, output_dir + ) + + print(f"\nAnalysis complete. Results saved to {output_dir}/") + return primary_datasets, reference_datasets if reference_file else None + + +def calculate_stability_metrics( # noqa: C901 + df, raw_col, trimmed_col=None, tps_col=None +): + """Calculate stability metrics for the given dataset""" + metrics = {} + + # Extract data + raw_latency = df[raw_col].values + if trimmed_col and trimmed_col in df.columns: + trimmed_latency = df[trimmed_col].values + else: + trimmed_latency = None + if tps_col and tps_col in df.columns: + tps = df[tps_col].values + else: + tps = None + + # Central tendency metrics + metrics["mean_raw_latency"] = np.mean(raw_latency) + metrics["median_raw_latency"] = np.median(raw_latency) + if trimmed_latency is not None: + metrics["mean_trimmed_latency"] = np.mean(trimmed_latency) + metrics["median_trimmed_latency"] = np.median(trimmed_latency) + + # Dispersion metrics + metrics["std_raw_latency"] = np.std(raw_latency, ddof=1) + metrics["cv_raw_latency"] = ( + metrics["std_raw_latency"] / metrics["mean_raw_latency"] + ) * 100 + metrics["iqr_raw_latency"] = np.percentile(raw_latency, 75) - np.percentile( + raw_latency, 25 + ) + if trimmed_latency is not None: + metrics["std_trimmed_latency"] = np.std(trimmed_latency, ddof=1) + metrics["cv_trimmed_latency"] = ( + metrics["std_trimmed_latency"] / metrics["mean_trimmed_latency"] + ) * 100 + metrics["iqr_trimmed_latency"] = np.percentile( + trimmed_latency, 75 + ) - np.percentile(trimmed_latency, 25) + + # Percentile metrics + for p in [50, 90, 95, 99]: + metrics[f"p{p}_raw_latency"] = np.percentile(raw_latency, p) + if trimmed_latency is not None: + metrics[f"p{p}_trimmed_latency"] = np.percentile(trimmed_latency, p) + + # Inter-jitter metrics (variability between runs) + if np.min(raw_latency) > 0: + 
metrics["max_min_range_ratio_raw"] = np.max(raw_latency) / np.min(raw_latency) + else: + metrics["max_min_range_ratio_raw"] = float("inf") + print("Warning: Minimum latency value is zero, max/min ratio set to infinity") + + metrics["p99_p50_ratio_raw"] = ( + metrics["p99_raw_latency"] / metrics["p50_raw_latency"] + ) + + if trimmed_latency is not None: + if np.min(trimmed_latency) > 0: + metrics["max_min_range_ratio_trimmed"] = np.max(trimmed_latency) / np.min( + trimmed_latency + ) + else: + metrics["max_min_range_ratio_trimmed"] = float("inf") + print( + "Warning: Minimum trimmed latency value is zero, max/min ratio set to infinity" + ) + + metrics["p99_p50_ratio_trimmed"] = ( + metrics["p99_trimmed_latency"] / metrics["p50_trimmed_latency"] + ) + + # Intra-jitter proxy (if both raw and trimmed are available) + if trimmed_latency is not None: + trimming_effect = (raw_latency - trimmed_latency) / raw_latency + metrics["mean_trimming_effect_ratio"] = np.mean(trimming_effect) + metrics["max_trimming_effect_ratio"] = np.max(trimming_effect) + + # TPS metrics + if tps is not None: + metrics["mean_tps"] = np.mean(tps) + metrics["std_tps"] = np.std(tps, ddof=1) + metrics["cv_tps"] = (metrics["std_tps"] / metrics["mean_tps"]) * 100 + + # Time-based stability (rolling window of 5 samples) + if len(df) >= 5: + df_sorted = df.sort_values("Date") + rolling_std = df_sorted[raw_col].rolling(window=5).std() + metrics["mean_rolling_std"] = rolling_std.mean() + metrics["max_rolling_std"] = rolling_std.max() + + # Stability score calculation (0-100 scale) + # Weights for different components + cv_weight = 0.5 + max_min_weight = 0.25 + p99_p50_weight = 0.25 + + # Convert metrics to scores (lower is better for all these metrics) + cv_score = max( + 0, 100 - (metrics["cv_raw_latency"] * 10) + ) # CV of 10% or more gets 0 + + if metrics["max_min_range_ratio_raw"] == float("inf"): + max_min_score = 0 + else: + max_min_score = max( + 0, 100 - ((metrics["max_min_range_ratio_raw"] - 1) * 50) + ) # Ratio of 3.0 or more gets 0 + + p99_p50_score = max( + 0, 100 - ((metrics["p99_p50_ratio_raw"] - 1) * 100) + ) # Ratio of 2.0 or more gets 0 + + # Weighted average + metrics["stability_score"] = ( + cv_weight * cv_score + + max_min_weight * max_min_score + + p99_p50_weight * p99_p50_score + ) + + # Stability rating based on score + if metrics["stability_score"] >= 90: + metrics["stability_rating"] = "Excellent" + elif metrics["stability_score"] >= 80: + metrics["stability_rating"] = "Good" + elif metrics["stability_score"] >= 60: + metrics["stability_rating"] = "Moderate" + else: + metrics["stability_rating"] = "Poor" + + return metrics + + +def generate_dataset_report( # noqa: C901 + sheet_name, model, device, dataset_type, df, metrics, output_dir +): + """Generate a detailed report for a single dataset""" + report_file = f"{output_dir}/{sheet_name}_{dataset_type.lower()}_report.txt" + + # Create a string buffer to hold the report content + report_content = [] + + # Header + report_content.append(f"Latency Stability Analysis: {sheet_name} ({dataset_type})") + report_content.append("=" * 80) + report_content.append(f"Model: {model}") + report_content.append(f"Device: {device}") + report_content.append("") + + # Dataset overview + report_content.append("Dataset Overview:") + report_content.append(f" - Number of samples: {len(df)}") + report_content.append(f" - Date range: {df['Date'].min()} to {df['Date'].max()}") + report_content.append("") + + # Central tendency metrics + report_content.append("Central Tendency 
Metrics:") + report_content.append(f" - Mean latency: {metrics['mean_raw_latency']:.2f} ms") + report_content.append( + f" - Median latency (P50): {metrics['median_raw_latency']:.2f} ms" + ) + if ( + "mean_trimmed_latency" in metrics + and metrics["mean_trimmed_latency"] is not None + ): + report_content.append( + f" - Mean trimmed latency: {metrics['mean_trimmed_latency']:.2f} ms" + ) + report_content.append( + f" - Median trimmed latency: {metrics['median_trimmed_latency']:.2f} ms" + ) + report_content.append("") + + # Dispersion metrics + report_content.append("Dispersion Metrics:") + report_content.append( + f" - Standard deviation: {metrics['std_raw_latency']:.2f} ms" + ) + report_content.append( + f" - Coefficient of variation (CV): {metrics['cv_raw_latency']:.2f}%" + ) + report_content.append( + f" - Interquartile range (IQR): {metrics['iqr_raw_latency']:.2f} ms" + ) + if "std_trimmed_latency" in metrics and metrics["std_trimmed_latency"] is not None: + report_content.append( + f" - Trimmed standard deviation: {metrics['std_trimmed_latency']:.2f} ms" + ) + report_content.append( + f" - Trimmed coefficient of variation: {metrics['cv_trimmed_latency']:.2f}%" + ) + report_content.append("") + + # Percentile metrics + report_content.append("Percentile Metrics:") + report_content.append(f" - P50 (median): {metrics['p50_raw_latency']:.2f} ms") + report_content.append(f" - P90: {metrics['p90_raw_latency']:.2f} ms") + report_content.append(f" - P95: {metrics['p95_raw_latency']:.2f} ms") + report_content.append(f" - P99: {metrics['p99_raw_latency']:.2f} ms") + report_content.append("") + + # Jitter metrics + report_content.append("Inter-Jitter Metrics (variability between runs):") + if metrics["max_min_range_ratio_raw"] == float("inf"): + report_content.append(" - Max/Min ratio: Infinity (minimum value is zero)") + else: + report_content.append( + f" - Max/Min ratio: {metrics['max_min_range_ratio_raw']:.4f}" + ) + report_content.append(f" - P99/P50 ratio: {metrics['p99_p50_ratio_raw']:.4f}") + if "mean_rolling_std" in metrics: + report_content.append( + f" - Mean rolling std (window=5): {metrics['mean_rolling_std']:.2f} ms" + ) + report_content.append("") + + if ( + "mean_trimming_effect_ratio" in metrics + and metrics["mean_trimming_effect_ratio"] is not None + ): + report_content.append("Intra-Jitter Metrics (variability within runs):") + report_content.append( + f" - Mean trimming effect ratio: {metrics['mean_trimming_effect_ratio']*100:.2f}%" + ) + report_content.append( + f" - Max trimming effect ratio: {metrics['max_trimming_effect_ratio']*100:.2f}%" + ) + report_content.append("") + + # TPS metrics + if "mean_tps" in metrics and metrics["mean_tps"] is not None: + report_content.append("Throughput Metrics:") + report_content.append(f" - Mean TPS: {metrics['mean_tps']:.2f}") + report_content.append( + f" - TPS coefficient of variation: {metrics['cv_tps']:.2f}%" + ) + report_content.append("") + + # Stability assessment + report_content.append("Stability Assessment:") + report_content.append( + f" - Overall stability score: {metrics['stability_score']:.1f}/100" + ) + report_content.append( + f" - Overall stability rating: {metrics['stability_rating']}" + ) + report_content.append("") + + # Interpretation + report_content.append("Interpretation:") + + # Stability rating explanation + if metrics["stability_rating"] == "Excellent": + report_content.append( + f" The benchmark shows excellent stability (score: {metrics['stability_score']:.1f}/100) with very low" + ) + report_content.append( + 
f" variation between runs (CV: {metrics['cv_raw_latency']:.2f}%)." + ) + report_content.append( + " This indicates highly consistent performance suitable for latency-sensitive applications." + ) + elif metrics["stability_rating"] == "Good": + report_content.append( + f" The benchmark shows good stability (score: {metrics['stability_score']:.1f}/100) with low" + ) + report_content.append( + f" variation between runs (CV: {metrics['cv_raw_latency']:.2f}%)." + ) + report_content.append( + " Performance is consistent and predictable for most use cases." + ) + elif metrics["stability_rating"] == "Moderate": + report_content.append( + f" The benchmark shows moderate stability (score: {metrics['stability_score']:.1f}/100) with noticeable" + ) + report_content.append( + f" variation between runs (CV: {metrics['cv_raw_latency']:.2f}%)." + ) + report_content.append( + " While average performance is acceptable, occasional latency spikes may occur." + ) + else: + report_content.append( + f" The benchmark shows poor stability (score: {metrics['stability_score']:.1f}/100) with significant" + ) + report_content.append( + f" variation between runs (CV: {metrics['cv_raw_latency']:.2f}%)." + ) + report_content.append( + " Performance is unpredictable and may lead to inconsistent user experience." + ) + + # Additional insights + if ( + "mean_trimming_effect_ratio" in metrics + and metrics["mean_trimming_effect_ratio"] is not None + and metrics["mean_trimming_effect_ratio"] > 0.05 + ): + report_content.append("") + report_content.append( + " The significant difference between raw and trimmed means suggests" + ) + report_content.append( + f" considerable intra-run jitter ({metrics['mean_trimming_effect_ratio']*100:.1f}%) with occasional outliers within benchmark runs." + ) + + if ( + metrics["max_min_range_ratio_raw"] != float("inf") + and metrics["max_min_range_ratio_raw"] > 1.2 + ): + report_content.append("") + report_content.append( + f" The max/min ratio of {metrics['max_min_range_ratio_raw']:.2f} indicates" + ) + report_content.append( + " substantial performance differences between the best and worst runs." + ) + + if metrics["p99_p50_ratio_raw"] > 1.1: + report_content.append("") + report_content.append( + f" The P99/P50 ratio of {metrics['p99_p50_ratio_raw']:.2f} suggests" + ) + report_content.append( + " occasional latency spikes that could affect tail latency sensitive applications." 
+ ) + + # Join all content with newlines to create the full report + full_report = "\n".join(report_content) + + # Write to file + with open(report_file, "w") as f: + f.write(full_report) + + # Also print to stdout + print("\n" + full_report + "\n") + print("=" * 80) + + +def generate_time_series_plot(dataset_name, df, output_dir, dataset_type): + """Generate time series plot of latency values""" + plt.figure(figsize=(12, 6)) + + # Sort by date + df_sorted = df.sort_values("Date") + + # Plot raw latency + plt.plot(df_sorted["Date"], df_sorted["InferenceTime"], "b-", label="Raw Latency") + + # Plot trimmed latency if available + if "trimmean_inference_latency(ms)" in df_sorted.columns: + plt.plot( + df_sorted["Date"], + df_sorted["trimmean_inference_latency(ms)"], + "g-", + label="Trimmed Latency", + ) + + # Add rolling mean + window = min(5, len(df_sorted)) + if window > 1: + rolling_mean = df_sorted["InferenceTime"].rolling(window=window).mean() + plt.plot( + df_sorted["Date"], rolling_mean, "r--", label=f"{window}-point Rolling Mean" + ) + + plt.title(f"Latency Over Time: {dataset_name} ({dataset_type})") + plt.xlabel("Date") + plt.ylabel("Latency (ms)") + plt.grid(True, linestyle="--", alpha=0.7) + plt.legend() + plt.xticks(rotation=45) + plt.tight_layout() + + # Save the plot + plt.savefig(f"{output_dir}/{dataset_name}_{dataset_type.lower()}_time_series.png") + print( + f"Generated time series plot: {output_dir}/{dataset_name}_{dataset_type.lower()}_time_series.png" + ) + plt.close() + + +def generate_comparison_report( # noqa: C901 + primary_sheet, + reference_sheet, + model, + primary_device, + reference_device, + primary_metrics, + reference_metrics, + output_dir, +): + """Generate a comparison report between primary and reference datasets""" + report_file = f"{output_dir}/{primary_sheet}_vs_{reference_sheet}_comparison.txt" + + # Create a string buffer to hold the report content + report_content = [] + + # Header + report_content.append("Private vs Public Stability Comparison") + report_content.append("=" * 80) + report_content.append(f"Private Dataset: {primary_sheet}") + report_content.append(f"Public Dataset: {reference_sheet}") + report_content.append(f"Model: {model}") + report_content.append(f"Private Device: {primary_device}") + report_content.append(f"Public Device: {reference_device}") + report_content.append("") + + # Create comparison table + report_content.append("Metric Comparison:") + + # Format the metrics table + headers = [ + "Metric", + "Private (Primary)", + "Public (Reference)", + "Difference", + "% Change", + ] + rows = [] + + # Add key metrics to the table + metrics_to_compare = [ + ("Mean Latency (ms)", "mean_raw_latency", "ms"), + ("Median Latency (ms)", "median_raw_latency", "ms"), + ("Standard Deviation (ms)", "std_raw_latency", "ms"), + ("CV (%)", "cv_raw_latency", "%"), + ("IQR (ms)", "iqr_raw_latency", "ms"), + ("P99 (ms)", "p99_raw_latency", "ms"), + ("Max/Min Ratio", "max_min_range_ratio_raw", ""), + ("P99/P50 Ratio", "p99_p50_ratio_raw", ""), + ("Stability Score", "stability_score", ""), + ] + + for label, key, unit in metrics_to_compare: + if key in primary_metrics and key in reference_metrics: + primary_val = primary_metrics[key] + reference_val = reference_metrics[key] + + # Handle infinity values + if primary_val == float("inf") or reference_val == float("inf"): + if primary_val == float("inf") and reference_val == float("inf"): + diff = 0 + pct_change = 0 + elif primary_val == float("inf"): + diff = float("inf") + pct_change = float("inf") + 
else: + diff = float("-inf") + pct_change = -100 + else: + diff = primary_val - reference_val + # Calculate percent change, avoiding division by zero + if reference_val != 0: + pct_change = (diff / reference_val) * 100 + else: + pct_change = float("inf") + + # Format values based on the metric + if key == "stability_score": + if primary_val == float("inf"): + primary_str = "Infinity" + else: + primary_str = f"{primary_val:.1f}/100" + + if reference_val == float("inf"): + reference_str = "Infinity" + else: + reference_str = f"{reference_val:.1f}/100" + + if diff == float("inf"): + diff_str = "Infinity" + elif diff == float("-inf"): + diff_str = "-Infinity" + else: + diff_str = f"{diff:.1f}" + + if pct_change == float("inf"): + pct_str = "Infinity" + elif pct_change == float("-inf"): + pct_str = "-Infinity" + else: + pct_str = f"{pct_change:.1f}%" + + row = [label, primary_str, reference_str, diff_str, pct_str] + elif unit == "%": + if primary_val == float("inf"): + primary_str = "Infinity%" + else: + primary_str = f"{primary_val:.2f}%" + + if reference_val == float("inf"): + reference_str = "Infinity%" + else: + reference_str = f"{reference_val:.2f}%" + + if diff == float("inf"): + diff_str = "Infinity%" + elif diff == float("-inf"): + diff_str = "-Infinity%" + else: + diff_str = f"{diff:.2f}%" + + if pct_change == float("inf"): + pct_str = "Infinity%" + elif pct_change == float("-inf"): + pct_str = "-Infinity%" + else: + pct_str = f"{pct_change:.1f}%" + + row = [label, primary_str, reference_str, diff_str, pct_str] + elif unit == "ms": + if primary_val == float("inf"): + primary_str = "Infinity ms" + else: + primary_str = f"{primary_val:.2f} ms" + + if reference_val == float("inf"): + reference_str = "Infinity ms" + else: + reference_str = f"{reference_val:.2f} ms" + + if diff == float("inf"): + diff_str = "Infinity ms" + elif diff == float("-inf"): + diff_str = "-Infinity ms" + else: + diff_str = f"{diff:.2f} ms" + + if pct_change == float("inf"): + pct_str = "Infinity%" + elif pct_change == float("-inf"): + pct_str = "-Infinity%" + else: + pct_str = f"{pct_change:.1f}%" + + row = [label, primary_str, reference_str, diff_str, pct_str] + else: + if primary_val == float("inf"): + primary_str = "Infinity" + else: + primary_str = f"{primary_val:.4f}" + + if reference_val == float("inf"): + reference_str = "Infinity" + else: + reference_str = f"{reference_val:.4f}" + + if diff == float("inf"): + diff_str = "Infinity" + elif diff == float("-inf"): + diff_str = "-Infinity" + else: + diff_str = f"{diff:.4f}" + + if pct_change == float("inf"): + pct_str = "Infinity%" + elif pct_change == float("-inf"): + pct_str = "-Infinity%" + else: + pct_str = f"{pct_change:.1f}%" + + row = [label, primary_str, reference_str, diff_str, pct_str] + + rows.append(row) + + # Add stability ratings + rows.append( + [ + "Stability Rating", + primary_metrics["stability_rating"], + reference_metrics["stability_rating"], + "N/A", + "N/A", + ] + ) + + # Format the table + table = tabulate(rows, headers=headers, tablefmt="grid") + report_content.append(table) + report_content.append("") + + # Add interpretation + report_content.append("Interpretation:") + + # Compare stability scores + if primary_metrics["stability_score"] > reference_metrics["stability_score"]: + if reference_metrics["stability_score"] != 0: + diff_pct = ( + ( + primary_metrics["stability_score"] + - reference_metrics["stability_score"] + ) + / reference_metrics["stability_score"] + * 100 + ) + report_content.append( + f" Private environment shows better 
stability with a {diff_pct:.1f}% higher stability score." + ) + else: + report_content.append(" Private environment shows better stability.") + report_content.append( + f" (Private: {primary_metrics['stability_score']:.1f}/100 vs Public: {reference_metrics['stability_score']:.1f}/100)" + ) + elif primary_metrics["stability_score"] < reference_metrics["stability_score"]: + if primary_metrics["stability_score"] != 0: + diff_pct = ( + ( + reference_metrics["stability_score"] + - primary_metrics["stability_score"] + ) + / reference_metrics["stability_score"] + * 100 + ) + report_content.append( + f" Public environment shows better stability with a {diff_pct:.1f}% higher stability score." + ) + else: + report_content.append(" Public environment shows better stability.") + report_content.append( + f" (Private: {primary_metrics['stability_score']:.1f}/100 vs Public: {reference_metrics['stability_score']:.1f}/100)" + ) + else: + report_content.append(" Both environments show identical stability scores.") + + # Compare CV values + if primary_metrics["cv_raw_latency"] < reference_metrics["cv_raw_latency"]: + if reference_metrics["cv_raw_latency"] != 0: + diff_pct = ( + ( + reference_metrics["cv_raw_latency"] + - primary_metrics["cv_raw_latency"] + ) + / reference_metrics["cv_raw_latency"] + * 100 + ) + report_content.append( + f" Private environment has {diff_pct:.1f}% lower coefficient of variation, indicating more consistent performance." + ) + else: + report_content.append( + " Private environment has lower coefficient of variation, indicating more consistent performance." + ) + elif primary_metrics["cv_raw_latency"] > reference_metrics["cv_raw_latency"]: + if reference_metrics["cv_raw_latency"] != 0: + diff_pct = ( + ( + primary_metrics["cv_raw_latency"] + - reference_metrics["cv_raw_latency"] + ) + / reference_metrics["cv_raw_latency"] + * 100 + ) + report_content.append( + f" Public environment has {diff_pct:.1f}% lower coefficient of variation, indicating more consistent performance." + ) + else: + report_content.append( + " Public environment has lower coefficient of variation, indicating more consistent performance." + ) + + # Compare latency + if primary_metrics["mean_raw_latency"] < reference_metrics["mean_raw_latency"]: + if reference_metrics["mean_raw_latency"] != 0: + diff_pct = ( + ( + reference_metrics["mean_raw_latency"] + - primary_metrics["mean_raw_latency"] + ) + / reference_metrics["mean_raw_latency"] + * 100 + ) + report_content.append( + f" Private environment has {diff_pct:.1f}% lower mean latency, indicating better performance." + ) + else: + report_content.append( + " Private environment has lower mean latency, indicating better performance." + ) + elif primary_metrics["mean_raw_latency"] > reference_metrics["mean_raw_latency"]: + if primary_metrics["mean_raw_latency"] != 0: + diff_pct = ( + ( + primary_metrics["mean_raw_latency"] + - reference_metrics["mean_raw_latency"] + ) + / reference_metrics["mean_raw_latency"] + * 100 + ) + report_content.append( + f" Public environment has {diff_pct:.1f}% lower mean latency, indicating better performance." + ) + else: + report_content.append( + " Public environment has lower mean latency, indicating better performance." 
+ ) + + # Note about OS version difference if applicable + _, primary_device_base, primary_os = extract_model_device_os(primary_sheet) + _, reference_device_base, reference_os = extract_model_device_os(reference_sheet) + + if primary_os != reference_os and primary_os and reference_os: + report_content.append("") + report_content.append( + f" Note: This comparison is between {primary_device_base} with {primary_os} (Private) and" + ) + report_content.append( + f" {reference_device_base} with {reference_os} (Public). OS version differences may" + ) + report_content.append(" contribute to observed stability variations.") + + # Recommendation + report_content.append("") + report_content.append("Recommendation:") + if primary_metrics["stability_score"] > reference_metrics["stability_score"]: + report_content.append( + " The private environment provides better stability for this model+device combination." + ) + report_content.append( + " It is recommended for applications where consistent performance is critical." + ) + elif primary_metrics["stability_score"] < reference_metrics["stability_score"]: + report_content.append( + " The public environment provides better stability for this model+device combination." + ) + report_content.append( + " Consider investigating factors affecting stability in the private environment." + ) + else: + report_content.append( + " Both environments provide similar stability. Other factors like cost or availability" + ) + report_content.append(" may be considered for choosing between them.") + + # Join all content with newlines to create the full report + full_report = "\n".join(report_content) + + # Write to file + with open(report_file, "w") as f: + f.write(full_report) + + # Also print to stdout + print("\n" + full_report + "\n") + print("=" * 80) + + +def generate_intra_primary_summary(primary_datasets, output_dir): # noqa: C901 + """Generate a summary comparing different models and devices within the primary dataset""" + report_file = f"{output_dir}/intra_primary_stability_summary.txt" + + # Extract relevant data for comparison + data = [] + for sheet_name, info in primary_datasets.items(): + data.append( + { + "Sheet": sheet_name, + "Model": info["model"], + "Device": info["device"], + "Mean Latency (ms)": info["metrics"]["mean_raw_latency"], + "CV (%)": info["metrics"]["cv_raw_latency"], + "Stability Score": info["metrics"]["stability_score"], + "Stability Rating": info["metrics"]["stability_rating"], + "Max/Min Ratio": info["metrics"]["max_min_range_ratio_raw"], + "P99/P50 Ratio": info["metrics"]["p99_p50_ratio_raw"], + } + ) + + # Convert to DataFrame for easier analysis + df = pd.DataFrame(data) + + # Sort by stability score (descending) + df = df.sort_values("Stability Score", ascending=False) + + # Create a string buffer to hold the report content + report_content = [] + + # Header + report_content.append("Intra-Primary Stability Comparison") + report_content.append("=" * 80) + report_content.append("") + + # Overall summary table + report_content.append("Overall Summary:") + report_content.append( + tabulate(df, headers="keys", tablefmt="grid", floatfmt=".2f", showindex=False) + ) + report_content.append("") + + # Best and worst performers + best_dataset = df.loc[df["Stability Score"].idxmax()] + worst_dataset = df.loc[df["Stability Score"].idxmin()] + + report_content.append("Best and Worst Performers:") + report_content.append( + f" Best stability: {best_dataset['Sheet']} (Score: {best_dataset['Stability Score']:.1f}/100)" + ) + report_content.append( 
+ f" Worst stability: {worst_dataset['Sheet']} (Score: {worst_dataset['Stability Score']:.1f}/100)" + ) + report_content.append("") + + # Model-based comparison if multiple models exist + models = df["Model"].unique() + if len(models) > 1: + report_content.append("Model-based Comparison:") + model_stats = df.groupby("Model").agg( + { + "Stability Score": ["mean", "min", "max"], + "CV (%)": ["mean", "min", "max"], + } + ) + + # Sort by mean stability score (descending) + model_stats = model_stats.sort_values( + ("Stability Score", "mean"), ascending=False + ) + + report_content.append( + tabulate(model_stats, headers="keys", tablefmt="grid", floatfmt=".2f") + ) + + best_model = model_stats["Stability Score"]["mean"].idxmax() + report_content.append( + f" Most stable model: {best_model} (Avg. Score: {model_stats.loc[best_model, ('Stability Score', 'mean')]:.1f}/100)" + ) + report_content.append("") + + # Device-based comparison + # First, extract base device names for grouping + device_base_map = {} + for sheet_name in primary_datasets: + _, device_base, _ = extract_model_device_os(sheet_name) + device_base_map[sheet_name] = device_base + + # Add base device to DataFrame + df["Device Base"] = df["Sheet"].map(device_base_map) + + # Group by base device + device_bases = df["Device Base"].unique() + if len(device_bases) > 1: + report_content.append("Device-based Comparison (Grouped by Base Device):") + device_stats = df.groupby("Device Base").agg( + { + "Stability Score": ["mean", "min", "max"], + "CV (%)": ["mean", "min", "max"], + } + ) + + # Sort by mean stability score (descending) + device_stats = device_stats.sort_values( + ("Stability Score", "mean"), ascending=False + ) + + report_content.append( + tabulate(device_stats, headers="keys", tablefmt="grid", floatfmt=".2f") + ) + + best_device = device_stats["Stability Score"]["mean"].idxmax() + report_content.append( + f" Most stable device: {best_device} (Avg. Score: {device_stats.loc[best_device, ('Stability Score', 'mean')]:.1f}/100)" + ) + report_content.append("") + + # OS version comparison if multiple OS versions exist + os_versions = {} + for sheet_name in primary_datasets: + _, _, os_version = extract_model_device_os(sheet_name) + if os_version: # Only include if OS version was extracted + os_versions[sheet_name] = os_version + + if os_versions and len(set(os_versions.values())) > 1: + # Add OS version to DataFrame + df["OS Version"] = df["Sheet"].map(os_versions) + + # Remove rows with no OS version + df_os = df[df["OS Version"].notna()] + + if len(df_os) > 0: + report_content.append("OS Version Comparison:") + os_stats = df_os.groupby("OS Version").agg( + { + "Stability Score": ["mean", "min", "max"], + "CV (%)": ["mean", "min", "max"], + } + ) + + # Sort by mean stability score (descending) + os_stats = os_stats.sort_values( + ("Stability Score", "mean"), ascending=False + ) + + report_content.append( + tabulate(os_stats, headers="keys", tablefmt="grid", floatfmt=".2f") + ) + + best_os = os_stats["Stability Score"]["mean"].idxmax() + report_content.append( + f" Most stable OS version: {best_os} (Avg. 
Score: {os_stats.loc[best_os, ('Stability Score', 'mean')]:.1f}/100)" + ) + report_content.append("") + + # Insights and recommendations + report_content.append("Insights and Recommendations:") + + # Check for patterns in stability + if len(models) > 1: + model_cv = df.groupby("Model")["CV (%)"].mean() + most_stable_model = model_cv.idxmin() + least_stable_model = model_cv.idxmax() + report_content.append( + f" - {most_stable_model} shows the most consistent performance across devices." + ) + report_content.append( + f" - {least_stable_model} shows more variability and may need further optimization." + ) + + if len(device_bases) > 1: + device_cv = df.groupby("Device Base")["CV (%)"].mean() + most_stable_device = device_cv.idxmin() + least_stable_device = device_cv.idxmax() + report_content.append( + f" - {most_stable_device} provides the most stable environment for model execution." + ) + report_content.append( + f" - {least_stable_device} shows higher variability and may not be ideal for latency-sensitive applications." + ) + + if os_versions and len(set(os_versions.values())) > 1 and len(df_os) > 0: + os_cv = df_os.groupby("OS Version")["CV (%)"].mean() + most_stable_os = os_cv.idxmin() + least_stable_os = os_cv.idxmax() + report_content.append( + f" - {most_stable_os} provides better stability than {least_stable_os} across tested devices." + ) + + # General recommendations + report_content.append( + " - For critical applications requiring consistent performance, prefer:" + ) + if len(models) > 1: + report_content.append(f" * Model: {best_model}") + else: + report_content.append(f" * Model: {df['Model'].iloc[0]}") + + if len(device_bases) > 1: + report_content.append(f" * Device: {best_device}") + else: + report_content.append(f" * Device: {df['Device Base'].iloc[0]}") + + if os_versions and len(set(os_versions.values())) > 1 and len(df_os) > 0: + report_content.append(f" * OS Version: {best_os}") + + # Join all content with newlines to create the full report + full_report = "\n".join(report_content) + + # Write to file + with open(report_file, "w") as f: + f.write(full_report) + + # Also print to stdout + print("\n" + full_report + "\n") + print("=" * 80) + + +def generate_summary_report( # noqa: C901 + primary_datasets, reference_datasets, output_dir +): + """Generate a comprehensive summary report""" + report_file = f"{output_dir}/comprehensive_stability_summary.txt" + + # Create a string buffer to hold the report content + report_content = [] + + # Header + report_content.append("Comprehensive Latency Stability Analysis Summary") + report_content.append("=" * 80) + report_content.append("") + + # Primary datasets summary + primary_data = [] + for sheet_name, info in primary_datasets.items(): + model, device_base, os_version = extract_model_device_os(sheet_name) + device_display = ( + f"{device_base} ({os_version})" if os_version else info["device"] + ) + + primary_data.append( + { + "Dataset": sheet_name, + "Model": model, + "Device": device_display, + "Mean Latency (ms)": info["metrics"]["mean_raw_latency"], + "CV (%)": info["metrics"]["cv_raw_latency"], + "Stability Score": info["metrics"]["stability_score"], + "Stability Rating": info["metrics"]["stability_rating"], + } + ) + + primary_df = pd.DataFrame(primary_data).sort_values( + "Stability Score", ascending=False + ) + + report_content.append("Primary (Private) Datasets Summary:") + report_content.append( + tabulate( + primary_df, headers="keys", tablefmt="grid", floatfmt=".2f", showindex=False + ) + ) + 
report_content.append("") + + # Reference datasets summary if available + if reference_datasets: + reference_data = [] + for sheet_name, info in reference_datasets.items(): + model, device_base, os_version = extract_model_device_os(sheet_name) + device_display = ( + f"{device_base} ({os_version})" if os_version else info["device"] + ) + + reference_data.append( + { + "Dataset": sheet_name, + "Model": model, + "Device": device_display, + "Mean Latency (ms)": info["metrics"]["mean_raw_latency"], + "CV (%)": info["metrics"]["cv_raw_latency"], + "Stability Score": info["metrics"]["stability_score"], + "Stability Rating": info["metrics"]["stability_rating"], + } + ) + + reference_df = pd.DataFrame(reference_data).sort_values( + "Stability Score", ascending=False + ) + + report_content.append("Reference (Public) Datasets Summary:") + report_content.append( + tabulate( + reference_df, + headers="keys", + tablefmt="grid", + floatfmt=".2f", + showindex=False, + ) + ) + report_content.append("") + + # Comparison summary for matching datasets + comparison_data = [] + for primary_sheet, primary_info in primary_datasets.items(): + for ref_sheet, ref_info in reference_datasets.items(): + if is_matching_dataset(primary_sheet, ref_sheet): + primary_metrics = primary_info["metrics"] + reference_metrics = ref_info["metrics"] + + # Extract model and device info for display + model, primary_device_base, primary_os = extract_model_device_os( + primary_sheet + ) + _, reference_device_base, reference_os = extract_model_device_os( + ref_sheet + ) + + primary_device_display = ( + f"{primary_device_base} ({primary_os})" + if primary_os + else primary_info["device"] + ) + reference_device_display = ( + f"{reference_device_base} ({reference_os})" + if reference_os + else ref_info["device"] + ) + + comparison_data.append( + { + "Dataset": f"{model} on {primary_device_base}", + "Private Device": primary_device_display, + "Public Device": reference_device_display, + "Private Score": primary_metrics["stability_score"], + "Public Score": reference_metrics["stability_score"], + "Score Diff": primary_metrics["stability_score"] + - reference_metrics["stability_score"], + "Private CV (%)": primary_metrics["cv_raw_latency"], + "Public CV (%)": reference_metrics["cv_raw_latency"], + "CV Diff (%)": primary_metrics["cv_raw_latency"] + - reference_metrics["cv_raw_latency"], + } + ) + break # Only use the first matching reference dataset + + if comparison_data: + comparison_df = pd.DataFrame(comparison_data).sort_values( + "Score Diff", ascending=False + ) + + report_content.append("Private vs Public Comparison:") + report_content.append( + tabulate( + comparison_df, + headers="keys", + tablefmt="grid", + floatfmt=".2f", + showindex=False, + ) + ) + report_content.append("") + + # Count datasets where private is better + private_better_count = sum( + 1 for row in comparison_data if row["Score Diff"] > 0 + ) + public_better_count = sum( + 1 for row in comparison_data if row["Score Diff"] < 0 + ) + equal_count = sum(1 for row in comparison_data if row["Score Diff"] == 0) + + report_content.append( + f"Private environment is more stable in {private_better_count} of {len(comparison_data)} cases." + ) + report_content.append( + f"Public environment is more stable in {public_better_count} of {len(comparison_data)} cases." + ) + if equal_count > 0: + report_content.append( + f"Both environments show equal stability in {equal_count} of {len(comparison_data)} cases." 
+ ) + report_content.append("") + + # Overall insights and recommendations + report_content.append("Overall Insights and Recommendations:") + + # Stability distribution in primary datasets + stability_counts = primary_df["Stability Rating"].value_counts() + report_content.append("Stability Distribution in Private Datasets:") + for rating, count in stability_counts.items(): + report_content.append(f" - {rating}: {count} dataset(s)") + report_content.append("") + + # Best configurations + best_primary = primary_df.iloc[0] + report_content.append("Best Configurations:") + report_content.append( + f" - Most stable configuration: {best_primary['Dataset']} (Score: {best_primary['Stability Score']:.1f}/100)" + ) + report_content.append( + f" Model: {best_primary['Model']}, Device: {best_primary['Device']}" + ) + + # OS version insights if available + os_versions = {} + for sheet_name in primary_datasets: + _, _, os_version = extract_model_device_os(sheet_name) + if os_version: + os_versions[sheet_name] = os_version + + if os_versions and len(set(os_versions.values())) > 1: + # Add OS version to primary DataFrame + primary_df["OS Version"] = primary_df["Dataset"].map( + lambda x: extract_model_device_os(x)[2] + ) + + # Remove rows with no OS version + df_os = primary_df[primary_df["OS Version"].notna()] + + if len(df_os) > 0: + os_stats = ( + df_os.groupby("OS Version")["Stability Score"] + .mean() + .sort_values(ascending=False) + ) + best_os = os_stats.index[0] + report_content.append( + f" - Most stable OS version: {best_os} (Avg. Score: {os_stats.iloc[0]:.1f}/100)" + ) + + # General recommendations + report_content.append("") + report_content.append("General Recommendations:") + report_content.append( + " 1. For datasets with 'Poor' or 'Moderate' stability, investigate potential causes" + ) + report_content.append( + " such as thermal throttling, background processes, or power management settings." + ) + report_content.append( + " 2. Consider increasing warm-up iterations for datasets with high CV values." + ) + report_content.append( + " 3. For critical applications, prefer models and devices with 'Good' or 'Excellent' stability." + ) + if reference_datasets and comparison_data: + if private_better_count > public_better_count: + report_content.append( + " 4. Private environments generally provide better stability and should be preferred" + ) + report_content.append( + " for production deployments where consistent performance is critical." + ) + elif public_better_count > private_better_count: + report_content.append( + " 4. Public environments show better stability in most cases. Consider investigating" + ) + report_content.append( + " factors affecting stability in the private environment." + ) + + # Join all content with newlines to create the full report + full_report = "\n".join(report_content) + + # Write to file + with open(report_file, "w") as f: + f.write(full_report) + + # Also print to stdout + print("\n" + full_report + "\n") + print("=" * 80) + + +def main(): + # Set up command line argument parsing + parser = argparse.ArgumentParser( + description="Analyze ML model latency stability from benchmark data." 
+ ) + parser.add_argument( + "primary_file", + help="Path to Excel file containing primary (private) benchmark data", + ) + parser.add_argument( + "--reference_file", + help="Path to Excel file containing reference (public) benchmark data for comparison", + default=None, + ) + parser.add_argument( + "--output-dir", + default="stability_analysis_results", + help="Directory to save analysis results (default: stability_analysis_results)", + ) + + # Parse arguments + args = parser.parse_args() + + # Run analysis + analyze_latency_stability(args.primary_file, args.reference_file, args.output_dir) + + +if __name__ == "__main__": + main()
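Usage sketch (illustrative; the spreadsheet names below are placeholders and the import assumes the script is reachable on the Python path): the script exposes a CLI via argparse with a positional primary_file plus optional --reference_file and --output-dir flags, and the same analysis can be driven programmatically through analyze_latency_stability, which returns the primary and reference dataset dictionaries.

    # CLI invocation (hypothetical file names):
    #   python .ci/scripts/analyze_benchmark_stability.py private_benchmarks.xlsx \
    #       --reference_file public_benchmarks.xlsx --output-dir stability_analysis_results

    # Equivalent programmatic call:
    from analyze_benchmark_stability import analyze_latency_stability

    primary_datasets, reference_datasets = analyze_latency_stability(
        "private_benchmarks.xlsx",                # primary (private) benchmark Excel file
        reference_file="public_benchmarks.xlsx",  # optional public data for comparison
        output_dir="stability_analysis_results",  # per-dataset reports, plots, and summaries are written here
    )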