Commit affd576

example fixups
Signed-off-by: Anuradha Karuppiah <[email protected]>
Parent: 4e0879b

6 files changed: +69 −29 lines changed

docs/source/guides/evaluate-api.md

Whitespace-only changes.

docs/source/reference/evaluate.md

Lines changed: 36 additions & 0 deletions

````diff
@@ -138,6 +138,42 @@ eval:
   - sympy__sympy-21055
 ```
 
+### Custom Dataset Format
+You can use a dataset in a custom format by providing a custom dataset parser function.
+
+**Example:**
+`examples/evaluation_and_profiling/simple_calculator_eval/configs/config-custom-dataset-format.yml`:
+```yaml
+eval:
+  general:
+    dataset:
+      _type: custom
+      file_path: examples/evaluation_and_profiling/simple_calculator_eval/data/simple_calculator_nested.json
+      function: aiq_simple_calculator_eval.scripts.custom_dataset_parser.extract_nested_questions
+      kwargs:
+        difficulty: "medium"
+        max_rows: 5
+```
+This example uses a custom dataset parser function to extract the nested questions from the dataset, filter them by difficulty, and return at most the first 5 matching questions.
+
+The custom dataset parser function is a Python function that takes a dataset file path and returns an `EvalInput` object.
+
+{py:class}`~aiq.eval.evaluator.evaluator_model.EvalInput` is a Pydantic model that contains the list of `EvalInputItem` objects.
+{py:class}`~aiq.eval.evaluator.evaluator_model.EvalInputItem` is a Pydantic model that contains the fields for a single item in the dataset.
+The custom dataset parser function should fill the following fields in each `EvalInputItem` object:
+- `id`: The id of the item. Every item in the dataset must have a unique id of type `str` or `int`.
+- `input_obj`: The question.
+- `expected_output_obj`: The ground-truth answer.
+- `output_obj`: The generated answer. This can be an empty string if it is to be filled by running the workflow.
+- `expected_trajectory`: The expected trajectory. This can be left as an empty list if no ground-truth trajectory is available.
+- `trajectory`: The observed trajectory. This can be an empty list if it is to be filled by running the workflow.
+- `full_dataset_entry`: The entire dataset entry. This is passed to the evaluator.
+
+To run the evaluation, run the following command:
+```bash
+aiq eval --config_file=examples/evaluation_and_profiling/simple_calculator_eval/configs/config-custom-dataset-format.yml
+```
+
 ## NeMo Agent Toolkit Built-in Evaluators
 NeMo Agent toolkit provides the following built-in evaluator:
 - `ragas` - An evaluator to run and evaluate RAG-like workflows using the public RAGAS API.
````
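The snippet below sketches a minimal parser that satisfies this contract. It is an illustrative sketch, not part of the commit: the flat JSON layout (a top-level list of records with `id`, `question`, and `answer` keys) and the name `parse_flat_dataset` are hypothetical, while the model classes and fields come from the documentation above.

```python
# Minimal sketch of a custom dataset parser (hypothetical flat-list format).
import json
from pathlib import Path

from aiq.eval.evaluator.evaluator_model import EvalInput
from aiq.eval.evaluator.evaluator_model import EvalInputItem


def parse_flat_dataset(input_path: Path, max_rows: int = None) -> EvalInput:
    """Parse a JSON file containing a flat list of {id, question, answer} records."""
    with open(input_path, encoding="utf-8") as f:
        records = json.load(f)

    eval_items = [
        EvalInputItem(
            id=record["id"],
            input_obj=record["question"],
            expected_output_obj=record["answer"],
            output_obj="",  # filled in by running the workflow
            expected_trajectory=[],
            trajectory=[],  # filled in by running the workflow
            full_dataset_entry=record)  # passed as-is to the evaluator
        for record in records[:max_rows]
    ]
    return EvalInput(eval_input_items=eval_items)
```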

examples/evaluation_and_profiling/simple_calculator_eval/pyproject.toml

Lines changed: 0 additions & 3 deletions

```diff
@@ -2,9 +2,6 @@
 build-backend = "setuptools.build_meta"
 requires = ["setuptools >= 64", "setuptools-scm>=8"]
 
-[tool.setuptools]
-packages = []
-
 [tool.setuptools_scm]
 root = "../../.."
 
```

examples/evaluation_and_profiling/simple_calculator_eval/src/aiq_simple_calculator_eval/configs/config-custom-dataset-format.yml

Lines changed: 7 additions & 2 deletions

```diff
@@ -49,6 +49,11 @@ llms:
     model_name: meta/llama-3.3-70b-instruct
     temperature: 0.2
     max_tokens: 2048
+  eval_llm:
+    _type: nim
+    model_name: mistralai/mixtral-8x22b-instruct-v0.1
+    temperature: 0.0
+    max_tokens: 1024
 
 workflow:
   _type: react_agent
@@ -70,9 +75,9 @@ eval:
     dataset:
       _type: custom
       file_path: examples/evaluation_and_profiling/simple_calculator_eval/data/simple_calculator_nested.json
-      function: aiq_simple_calculator_eval.custom_dataset_parser.extract_nested_questions
+      function: aiq_simple_calculator_eval.scripts.custom_dataset_parser.extract_nested_questions
      kwargs:
-        filter_by_tag: "important"
+        difficulty: "medium"
         max_rows: 5
 
   evaluators:
```
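The `function` value is a dotted Python path; the eval harness presumably resolves it by importing the module and calling the attribute with the dataset path and the configured `kwargs`. A minimal sketch of that resolution step is shown below; `load_dataset_parser` is illustrative, not a toolkit API.

```python
# Hypothetical illustration of how a dotted `function:` path from the config
# could be resolved and invoked with the configured `kwargs`.
import importlib
from pathlib import Path


def load_dataset_parser(dotted_path: str):
    # Split "package.module.function" into module path and attribute name.
    module_name, func_name = dotted_path.rsplit(".", 1)
    module = importlib.import_module(module_name)
    return getattr(module, func_name)


parser = load_dataset_parser(
    "aiq_simple_calculator_eval.scripts.custom_dataset_parser.extract_nested_questions")
eval_input = parser(
    Path("examples/evaluation_and_profiling/simple_calculator_eval/data/simple_calculator_nested.json"),
    difficulty="medium",
    max_rows=5)
```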
Lines changed: 14 additions & 0 deletions

```diff
@@ -0,0 +1,14 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
```
Lines changed: 12 additions & 24 deletions

```diff
@@ -20,9 +20,14 @@
 from aiq.eval.evaluator.evaluator_model import EvalInputItem
 
 
-def extract_nested_questions(input_path: Path, filter_by_tag: str = None, max_rows: int = None, **kwargs) -> EvalInput:
+def extract_nested_questions(input_path: Path, difficulty: str = None, max_rows: int = None) -> EvalInput:
     """
-    Extract questions from a nested JSON structure with optional filtering.
+    This is a sample custom dataset parser that:
+    1. Loads a nested JSON file
+    2. Extracts the questions array from the nested structure
+    3. Applies optional filtering by difficulty (hard, medium, easy)
+    4. Applies an optional maximum number of questions to return
+    5. Creates an EvalInput object with the extracted questions and returns it
 
     Expects JSON format:
     {
@@ -36,9 +41,8 @@ def extract_nested_questions(input_path: Path, filter_by_tag: str = None, max_ro
 
     Args:
         input_path: Path to the nested JSON file
-        filter_by_tag: Optional tag to filter questions by (matches against category or difficulty)
+        difficulty: Optional difficulty to filter questions by
         max_rows: Optional maximum number of questions to return
-        **kwargs: Additional parameters (unused in this example)
 
     Returns:
         EvalInput object containing the extracted questions
@@ -50,17 +54,13 @@
 
     # Extract questions array from the nested structure
     questions = data.get('questions', [])
-    metadata = data.get('metadata', {})
-    configuration = data.get('configuration', {})
 
     # Apply filtering if specified
-    if filter_by_tag:
+    if difficulty:
         filtered_questions = []
         for question in questions:
-            # Check if filter_by_tag matches category, difficulty, or any other field
-            if (question.get('category', '').lower() == filter_by_tag.lower()
-                    or question.get('difficulty', '').lower() == filter_by_tag.lower()
-                    or filter_by_tag.lower() in str(question).lower()):
+            # Check if the question's difficulty matches (hard, medium, easy)
+            if question.get('difficulty', '').lower() == difficulty.lower():
                 filtered_questions.append(question)
         questions = filtered_questions
 
@@ -71,26 +71,14 @@
     eval_items = []
 
     for item in questions:
-        # Create EvalInputItem with additional metadata in full_dataset_entry
-        full_entry = {
-            **item,  # Include original question data
-            'dataset_metadata': metadata,
-            'dataset_configuration': configuration,
-            'processing_info': {
-                'filtered_by_tag': filter_by_tag,
-                'max_rows_applied': max_rows,
-                'total_questions_in_dataset': len(data.get('questions', []))
-            }
-        }
-
         eval_item = EvalInputItem(
             id=item['id'],
             input_obj=item['question'],
             expected_output_obj=item['answer'],
             output_obj="",  # Will be filled by workflow
             expected_trajectory=[],
             trajectory=[],  # Will be filled by workflow
-            full_dataset_entry=full_entry)
+            full_dataset_entry=item)
         eval_items.append(eval_item)
 
     return EvalInput(eval_input_items=eval_items)
```
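The parser can also be exercised directly, without `aiq eval`. The smoke test below is illustrative: the dataset content is invented to match the shape the parser reads (a `questions` array whose entries carry `id`, `question`, `answer`, and `difficulty` fields) and may differ from the real `simple_calculator_nested.json`.

```python
# Hypothetical smoke test for extract_nested_questions.
import json
from pathlib import Path
from tempfile import NamedTemporaryFile

from aiq_simple_calculator_eval.scripts.custom_dataset_parser import extract_nested_questions

sample = {
    "questions": [
        {"id": 1, "question": "What is 2 + 3?", "answer": "5", "difficulty": "easy"},
        {"id": 2, "question": "What is 12 * 12?", "answer": "144", "difficulty": "medium"},
    ]
}

# Write the sample dataset to a temporary JSON file.
with NamedTemporaryFile("w", suffix=".json", delete=False) as f:
    json.dump(sample, f)

# Only the "medium" question should survive the difficulty filter.
eval_input = extract_nested_questions(Path(f.name), difficulty="medium", max_rows=5)
assert [item.id for item in eval_input.eval_input_items] == [2]
```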
