
Commit f28afb4: Merge branch 'main' into awarno/reasoning-tokens

2 parents bcb8ce0 + e256abd

File tree

4 files changed: +201 -17 lines

packages/nemo-evaluator/src/nemo_evaluator/adapters/interceptors/response_stats_interceptor.py

Lines changed: 9 additions & 2 deletions
@@ -243,10 +243,17 @@ def _update_basic_stats(self, resp: AdapterResponse, current_time: float) -> None:
         # Update inference_run_times for current run
         run_id = self._stats["run_id"]
         if run_id not in self._stats["inference_run_times"]:
-            # First request in this run - set first_request_time
+            # First request in this run - estimate when inference actually started using latency
+            estimated_first_request_start = current_time
+            if hasattr(resp, "latency_ms") and resp.latency_ms is not None:
+                # Estimate when this request was sent (current_time - latency)
+                estimated_first_request_start = current_time - (
+                    resp.latency_ms / 1000.0
+                )
+
             self._stats["inference_run_times"][run_id] = {
                 "run_start": self._adapter_start_time,
-                "first_request_time": current_time,
+                "first_request_time": estimated_first_request_start,
                 "last_request_time": current_time,
                 "inference_time": 0.0,
             }
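The hunk above only back-dates first_request_time by the observed latency of the first response. A minimal standalone sketch of that arithmetic, with hypothetical values that are not taken from the diff:

# Minimal sketch of the latency-based back-dating above; numbers are illustrative.
import time

current_time = time.time()  # moment the first response is observed
latency_ms = 250.0          # hypothetical end-to-end latency of that request

# Estimate when the request was actually sent.
estimated_first_request_start = current_time - (latency_ms / 1000.0)
print(f"observed at {current_time:.3f}, estimated send time {estimated_first_request_start:.3f}")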

packages/nemo-evaluator/src/nemo_evaluator/core/evaluate.py

Lines changed: 31 additions & 6 deletions
@@ -30,7 +30,10 @@
     EvaluationTarget,
 )
 from nemo_evaluator.core.input import prepare_output_directory, validate_configuration
-from nemo_evaluator.core.resources import monitor_memory_usage
+from nemo_evaluator.core.resources import (
+    aggregate_runtime_metrics,
+    monitor_memory_usage,
+)
 from nemo_evaluator.core.utils import run_command
 from nemo_evaluator.logging import get_logger

@@ -109,7 +112,10 @@ def run_evaluation_core():
         logger.info("No cache directory configured, token usage will not be collected")
 
     evaluation_result, metrics = monitor_memory_usage(
-        run_evaluation_core, interval_ms=100, cache_dir=cache_dir
+        run_evaluation_core,
+        interval_ms=100,
+        cache_dir=cache_dir,
+        output_dir=evaluation.config.output_dir,
     )
 
     metrics_path = os.path.join(
@@ -125,15 +131,34 @@ def run_evaluation_core():
     except (json.JSONDecodeError, IOError):
         pass  # Start fresh if file is corrupted
 
+    # Aggregate all run data from run_times directory
+    aggregated_metrics = aggregate_runtime_metrics(evaluation.config.output_dir)
+
+    if aggregated_metrics:
+        runtime = aggregated_metrics.get("runtime_seconds", 0)
+        inference_time = aggregated_metrics.get("inference_time_seconds", 0)
+        scoring_time = aggregated_metrics.get("scoring_time_seconds", 0)
+        logger.info(
+            "Aggregated metrics",
+            runtime_seconds=runtime,
+            inference_time_seconds=inference_time,
+            scoring_time_seconds=scoring_time,
+            peak_memory_bytes=aggregated_metrics.get("peak_memory_bytes", 0),
+            total_runs=aggregated_metrics.get("total_runs", 0),
+        )
+
+    # Use aggregated metrics if available, otherwise use current metrics
+    final_metrics = aggregated_metrics if aggregated_metrics else metrics
+
     # Merge with existing metrics, using "evaluation" as the key
     # If evaluation key already exists, merge the metrics instead of overwriting
     if "evaluation" in existing_metrics:
         # Aggregate existing evaluation metrics with new ones
         existing_eval = existing_metrics["evaluation"]
-        if isinstance(existing_eval, dict) and isinstance(metrics, dict):
+        if isinstance(existing_eval, dict) and isinstance(final_metrics, dict):
             # Merge dictionaries with appropriate aggregation strategy
             merged_eval = existing_eval.copy()
-            for key, value in metrics.items():
+            for key, value in final_metrics.items():
                 if (
                     key in merged_eval
                     and isinstance(merged_eval[key], (int, float))
@@ -153,9 +178,9 @@ def run_evaluation_core():
                     merged_eval[key] = value
             merged_metrics = {**existing_metrics, "evaluation": merged_eval}
         else:
-            merged_metrics = {**existing_metrics, "evaluation": metrics}
+            merged_metrics = {**existing_metrics, "evaluation": final_metrics}
     else:
-        merged_metrics = {**existing_metrics, "evaluation": metrics}
+        merged_metrics = {**existing_metrics, "evaluation": final_metrics}
 
     # Write merged metrics to file
     with open(metrics_path, "w") as f:
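For context, a simplified sketch of the merge step that now consumes final_metrics. The inputs are hypothetical, and the numeric-aggregation branch (elided between the two hunks above) is represented here by a plain sum, which is an assumption rather than the commit's actual strategy:

# Hypothetical inputs; summation below is a stand-in for the aggregation
# logic that this diff does not show.
existing_metrics = {"evaluation": {"runtime_seconds": 10.0, "total_runs": 1}}
final_metrics = {"runtime_seconds": 4.0, "total_runs": 1, "peak_memory_bytes": 123}

existing_eval = existing_metrics["evaluation"]
merged_eval = existing_eval.copy()
for key, value in final_metrics.items():
    if (
        key in merged_eval
        and isinstance(merged_eval[key], (int, float))
        and isinstance(value, (int, float))
    ):
        merged_eval[key] = merged_eval[key] + value  # assumed aggregation
    else:
        merged_eval[key] = value

merged_metrics = {**existing_metrics, "evaluation": merged_eval}
print(merged_metrics)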

packages/nemo-evaluator/src/nemo_evaluator/core/resources.py

Lines changed: 149 additions & 5 deletions
@@ -14,6 +14,7 @@
 # limitations under the License.
 
 
+import json
 import os
 import sqlite3
 import threading
@@ -67,7 +68,7 @@ def get_token_usage_from_cache_db(cache_db_path: str | Path) -> dict:
                 "total_cached_requests": row[3],
             }
     except Exception as e:
-        logger.warning(f"Failed to read token usage from cache: {e}")
+        logger.warning("Failed to read token usage from cache", error=str(e))
 
     return {}

@@ -81,8 +82,126 @@ def get_token_usage_from_cache(cache_dir: str) -> dict:
     return get_token_usage_from_cache_db(cache_db_path)
 
 
+def aggregate_runtime_metrics(output_dir: str) -> dict[str, Any]:
+    """Aggregate all run data from run_times directory."""
+    run_times_dir = Path(output_dir) / "run_times"
+    aggregated_metrics = {}
+
+    if not run_times_dir.exists():
+        return aggregated_metrics
+
+    total_runtime = 0
+    earliest_start = None
+    latest_end = None
+    max_peak_memory = 0
+    max_peak_tree_memory = 0
+    run_count = 0
+
+    for run_file in run_times_dir.glob("runtime_*.json"):
+        try:
+            with open(run_file, "r") as f:
+                run_data = json.load(f)
+            total_runtime += run_data.get("runtime_seconds", 0)
+            run_count += 1
+
+            # Track earliest start and latest end
+            run_start = run_data.get("start_time", "")
+            run_end = run_data.get("end_time", "")
+            if earliest_start is None or run_start < earliest_start:
+                earliest_start = run_start
+            if latest_end is None or run_end > latest_end:
+                latest_end = run_end
+
+            # Track peak memory across all runs
+            max_peak_memory = max(
+                max_peak_memory, run_data.get("peak_memory_bytes", 0)
+            )
+            max_peak_tree_memory = max(
+                max_peak_tree_memory, run_data.get("peak_tree_memory_bytes", 0)
+            )
+        except Exception:
+            pass
+
+    if run_count > 0:
+        aggregated_metrics = {
+            "runtime_seconds": total_runtime,
+            "start_time": earliest_start,
+            "end_time": latest_end,
+            "peak_memory_bytes": max_peak_memory,
+            "peak_tree_memory_bytes": max_peak_tree_memory,
+            "total_runs": run_count,
+        }
+
+        # Try to get inference time from response stats and calculate scoring time
+        try:
+            metrics_file = Path(output_dir) / "eval_factory_metrics.json"
+            if metrics_file.exists():
+                with open(metrics_file, "r") as f:
+                    metrics_data = json.load(f)
+                response_stats = metrics_data.get("response_stats", {})
+                inference_time = response_stats.get("inference_time", 0.0)
+
+                # Calculate scoring time as runtime - inference time
+                scoring_time = max(0.0, total_runtime - inference_time)
+                aggregated_metrics["inference_time_seconds"] = inference_time
+                aggregated_metrics["scoring_time_seconds"] = scoring_time
+        except Exception as e:
+            # If we can't read response stats, just continue without scoring time
+            logger.warning(
+                "Could not extract inference time from response stats", error=str(e)
+            )
+
+    return aggregated_metrics
+
+
+def _update_persistent_metrics(
+    output_dir: str,
+    start_time: float,
+    peak_memory: int,
+    peak_tree_memory: int,
+    run_id: str,
+) -> None:
+    """Save individual run data and update peak memory only."""
+    try:
+        # Create run_times directory
+        run_times_dir = Path(output_dir) / "run_times"
+        run_times_dir.mkdir(exist_ok=True)
+
+        # Save individual run runtime
+        current_time = time.time()
+        current_runtime = current_time - start_time
+        run_file = run_times_dir / f"runtime_{run_id}.json"
+
+        with open(run_file, "w") as f:
+            json.dump(
+                {
+                    "run_id": run_id,
+                    "start_time": time.strftime(
+                        "%Y-%m-%dT%H:%M:%S.%fZ", time.gmtime(start_time)
+                    ),
+                    "end_time": time.strftime(
+                        "%Y-%m-%dT%H:%M:%S.%fZ", time.gmtime(current_time)
+                    ),
+                    "runtime_seconds": current_runtime,
+                    "peak_memory_bytes": peak_memory,
+                    "peak_tree_memory_bytes": peak_tree_memory,
+                },
+                f,
+            )
+
+    except Exception as e:
+        logger.warning(
+            "Failed to update persistent metrics", error=str(e), run_id=run_id
+        )
+
+
 def monitor_memory_usage(
-    func, *args, interval_ms, cache_dir: str | None = None, **kwargs
+    func,
+    *args,
+    interval_ms,
+    cache_dir: str | None = None,
+    output_dir: str | None = None,
+    **kwargs,
 ) -> tuple[EvaluationResult, dict[str, Any]]:
     """
     Run func(*args, **kwargs) while polling RSS via psutil.
@@ -91,8 +210,21 @@ def monitor_memory_usage(
     - peak_tree_rss_bytes: peak memory usage of the entire process tree (main + children)
     """
     proc = psutil.Process(os.getpid())
+
+    # Generate meaningful run ID (counter or date)
+    if output_dir:
+        run_times_dir = Path(output_dir) / "run_times"
+        run_times_dir.mkdir(exist_ok=True)
+        # Count existing runs to get next ID
+        existing_runs = list(run_times_dir.glob("runtime_*.json"))
+        run_id = str(len(existing_runs))
+    else:
+        run_id = "0"
+
+    # Initialize values
     peak = 0
     peak_tree = 0
+
     stop = False
     ret = None

@@ -111,6 +243,9 @@ def get_tree_memory(process):
 
     def sampler():
         nonlocal peak, peak_tree
+        last_save_time = 0
+        save_interval = 5.0  # Save every 5 seconds
+
         while not stop:
             # Get memory for current process
             rss = proc.memory_info().rss
@@ -120,6 +255,15 @@ def sampler():
             tree_rss = get_tree_memory(proc)
             peak_tree = max(peak_tree, tree_rss)
 
+            # Update persistent metrics file if output_dir is provided and enough time has passed
+            if output_dir:
+                current_time = time.time()
+                if current_time - last_save_time >= save_interval:
+                    _update_persistent_metrics(
+                        output_dir, start_time, peak, peak_tree, run_id
+                    )
+                    last_save_time = current_time
+
             time.sleep(interval_ms / 1000.0)
 
     th = threading.Thread(target=sampler, daemon=True)
@@ -144,15 +288,15 @@ def sampler():
         try:
             token_usage = get_token_usage_from_cache(cache_dir)
         except Exception as e:
-            logger.warning(f"Failed to get token usage from cache: {e}")
+            logger.warning("Failed to get token usage from cache", error=str(e))
 
     metrics = {
         "runtime_seconds": runtime_seconds,
         "start_time": time.strftime("%Y-%m-%dT%H:%M:%S.%fZ", time.gmtime(start_time)),
         "end_time": time.strftime("%Y-%m-%dT%H:%M:%S.%fZ", time.gmtime(end_time)),
         "token_usage": token_usage,
-        "peak_memory_bytes": peak,  # Memory of main process
-        "peak_tree_memory_bytes": peak_tree,  # Memory of entire process tree
+        "peak_memory_bytes": peak,
+        "peak_tree_memory_bytes": peak_tree,
     }
 
     return ret, metrics
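Taken together, the new code forms a small persistence loop: monitor_memory_usage periodically writes runtime_<run_id>.json snapshots into <output_dir>/run_times, and aggregate_runtime_metrics later folds those snapshots into one summary. A usage sketch under stated assumptions: the workload function and the /tmp/eval_out path are illustrative only, and snapshots appear only for runs that outlive the roughly 5-second save interval shown in the sampler above:

# Illustrative usage; my_eval and /tmp/eval_out are hypothetical stand-ins.
import os

from nemo_evaluator.core.resources import (
    aggregate_runtime_metrics,
    monitor_memory_usage,
)


def my_eval():
    # stand-in for the real evaluation workload
    return sum(i * i for i in range(10_000_000))


os.makedirs("/tmp/eval_out", exist_ok=True)  # run_times/ is created inside this dir

result, metrics = monitor_memory_usage(
    my_eval,
    interval_ms=100,
    cache_dir=None,              # no request cache, so token_usage stays empty
    output_dir="/tmp/eval_out",  # enables periodic runtime_<run_id>.json snapshots
)

# Fold all snapshots from /tmp/eval_out/run_times into one summary
# (empty dict if the run was too short to produce any snapshot).
summary = aggregate_runtime_metrics("/tmp/eval_out")
print(metrics["peak_memory_bytes"], summary.get("total_runs", 0))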

packages/nemo-evaluator/tests/unit_tests/adapters/interceptors/test_response_stats_interceptor.py

Lines changed: 12 additions & 4 deletions
@@ -591,7 +591,9 @@ def test_caching_and_aggregation_with_multiple_runs(self, tmp_path):
         assert interceptor1._stats["avg_latency_ms"] == 150.0  # (100 + 200) / 2
         assert interceptor1._stats["max_latency_ms"] == 200.0
         run1_inference_time = interceptor1._stats["inference_time"]
-        assert 0.05 <= run1_inference_time <= 0.15, (
+        # With latency-based estimation: sleep_time + latency_adjustment
+        # Expected: ~0.1s (sleep) + ~0.1s (first request latency) = ~0.2s
+        assert 0.15 <= run1_inference_time <= 0.25, (
             f"Run 1 inference time {run1_inference_time} not in expected range"
         )

@@ -819,7 +821,9 @@ def test_comprehensive_cache_scenarios(
 
         # Verify Run 1 inference time
         run1_time = interceptor1._stats["inference_time"]
-        assert 0.05 <= run1_time <= 0.15, (
+        # With latency-based estimation: sleep_time + latency_adjustment
+        # Expected: ~0.1s (sleep) + ~0.1s (first request latency) = ~0.2s
+        assert 0.15 <= run1_time <= 0.25, (
             f"Run 1 time {run1_time} not in expected range"
         )
         assert interceptor1._stats["run_id"] == 0
@@ -847,7 +851,9 @@ def test_comprehensive_cache_scenarios(
         # Verify Run 2 inference time
         # All run_ids should be integers after cache loading fix
         run2_time = interceptor2._stats["inference_run_times"][1]["inference_time"]
-        assert 0.05 <= run2_time <= 0.15, (
+        # With latency-based estimation: sleep_time + latency_adjustment
+        # Expected: ~0.08s (sleep) + ~0.3s (first request latency) = ~0.38s
+        assert 0.35 <= run2_time <= 0.45, (
             f"Run 2 time {run2_time} not in expected range"
         )
         assert interceptor2._stats["run_id"] == 1
@@ -874,7 +880,9 @@ def test_comprehensive_cache_scenarios(
         # Verify Run 3 inference time
         # All run_ids should be integers after cache loading fix
         run3_time = interceptor3._stats["inference_run_times"][2]["inference_time"]
-        assert 0.05 <= run3_time <= 0.15, (
+        # With latency-based estimation: sleep_time + latency_adjustment
+        # Expected: ~0.06s (sleep) + ~0.5s (first request latency) = ~0.56s
+        assert 0.50 <= run3_time <= 0.65, (
             f"Run 3 time {run3_time} not in expected range"
         )
         assert interceptor3._stats["run_id"] == 2
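The widened ranges follow from the back-dated first_request_time: each expected inference window is roughly the sleep time plus the first request's latency, exactly as the inline comments state. A small arithmetic sketch of that expectation, using only the values quoted in those comments:

# Rough expectation behind the updated assertion ranges above.
def expected_inference_time(sleep_s: float, first_request_latency_ms: float) -> float:
    # first_request_time is back-dated by the first request's latency,
    # so the measured window grows by roughly that amount
    return sleep_s + first_request_latency_ms / 1000.0


print(expected_inference_time(0.10, 100.0))  # ~0.20 -> asserted 0.15..0.25
print(expected_inference_time(0.08, 300.0))  # ~0.38 -> asserted 0.35..0.45
print(expected_inference_time(0.06, 500.0))  # ~0.56 -> asserted 0.50..0.65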
