Merged
4 changes: 4 additions & 0 deletions src/lighteval/main_vllm.py
@@ -52,6 +52,9 @@ def vllm(
system_prompt: Annotated[
Optional[str], Option(help="Use system prompt for evaluation.", rich_help_panel=HELP_PANEL_NAME_4)
] = None,
+ cot_prompt: Annotated[
+ Optional[str], Option(help="Use chain of thought prompt for evaluation.", rich_help_panel=HELP_PANEL_NAME_4)
+ ] = None,
dataset_loading_processes: Annotated[
int, Option(help="Number of processes to use for dataset loading.", rich_help_panel=HELP_PANEL_NAME_1)
] = 1,
@@ -128,6 +131,7 @@ def vllm(
max_samples=max_samples,
use_chat_template=use_chat_template,
system_prompt=system_prompt,
+ cot_prompt=cot_prompt,
load_responses_from_details_date_id=load_responses_from_details_date_id,
)

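Usage note (not part of the diff): with Typer's standard kebab-case conversion of parameter names, the new option should surface on the CLI as --cot-prompt; the model and task arguments below are placeholders, not confirmed by this diff:

lighteval vllm "pretrained=<model>" "<suite|task|num_fewshot|truncate>" --cot-prompt "Let's think step by step."

The string is threaded from here through PipelineParameters (next file) down to prompt construction in prompt_manager.py.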
2 changes: 2 additions & 0 deletions src/lighteval/pipeline.py
@@ -107,6 +107,7 @@ class PipelineParameters:
max_samples: int | None = None
use_chat_template: bool = False
system_prompt: str | None = None
+ cot_prompt: str | None = None
load_responses_from_details_date_id: str | None = None

def __post_init__(self): # noqa C901
@@ -236,6 +237,7 @@ def _init_tasks_and_requests(self, tasks: str):
evaluation_tracker=self.evaluation_tracker,
use_chat_template=self.pipeline_parameters.use_chat_template,
system_prompt=self.pipeline_parameters.system_prompt,
+ cot_prompt=self.pipeline_parameters.cot_prompt,
)

self.task_names_list = task_names_list
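For orientation, a minimal sketch of setting the new field when driving the pipeline programmatically (hypothetical values; the ParallelismManager import and the launcher_type field are assumptions about lighteval's public API, not shown in this diff; all other fields keep their dataclass defaults):

from lighteval.pipeline import ParallelismManager, PipelineParameters

params = PipelineParameters(
    launcher_type=ParallelismManager.VLLM,  # assumption: not part of this diff
    use_chat_template=True,
    system_prompt="You are a helpful assistant.",
    cot_prompt="\nLet's think step by step.",  # appended verbatim to the final query (see prompt_manager.py below)
)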
4 changes: 4 additions & 0 deletions src/lighteval/tasks/lighteval_task.py
@@ -582,6 +582,7 @@ def create_requests_from_tasks( # noqa: C901
evaluation_tracker: "EvaluationTracker",
use_chat_template: bool,
system_prompt: str | None,
+ cot_prompt: str | None,
) -> Tuple[dict[RequestType, list[Request]], dict[SampleUid, Doc]]:
"""
Takes a task dict and a fewshot dict and returns a dict of requests, a dict
@@ -599,6 +600,8 @@ def create_requests_from_tasks( # noqa: C901
max_samples (int): maximum number of samples.
evaluation_tracker (EvaluationTracker): evaluation tracker.
use_chat_template (bool): Whether to use the chat template.
+ system_prompt (str): System prompt
+ cot_prompt (str): Chain of thought prompt

Raises:
NotImplementedError: If the request type is not implemented for the
@@ -646,6 +649,7 @@ def create_requests_from_tasks( # noqa: C901
truncate_few_shots=truncate_few_shots,
use_chat_template=use_chat_template,
system_prompt=system_prompt,
+ cot_prompt=cot_prompt,
)

# Constructing the requests
12 changes: 10 additions & 2 deletions src/lighteval/tasks/prompt_manager.py
@@ -107,6 +107,7 @@ def add_context_to_doc(
truncate_few_shots: bool = False,
use_chat_template=False,
system_prompt: str = None,
+ cot_prompt: str = None,
) -> Doc:
is_multi_turn = doc.specific is not None and len(doc.specific.get("multi_turn_queries", [])) > 0
if is_multi_turn:
@@ -121,6 +122,7 @@
sampler=sampler,
use_chat_template=use_chat_template,
system_prompt=system_prompt,
+ cot_prompt=cot_prompt,
)
doc.num_effective_few_shots = num_effective_few_shots
doc.num_asked_few_shots = num_fewshot
@@ -175,6 +177,7 @@ def _single_turn_context(
truncate_few_shots: bool = False,
use_chat_template=False,
system_prompt: str = None,
+ cot_prompt: str = None,
):
"""Returns a fewshot context string that is made up of a prepended description
(if provided), the `num_fewshot` number of examples, and an appended prompt example.
@@ -206,6 +209,7 @@
fewshot_ex=fewshot_ex,
system_prompt=system_prompt,
use_chat_template=use_chat_template,
+ cot_prompt=cot_prompt,
)
if not use_chat_template:
toks = self.model.tok_encode(output)
@@ -228,6 +232,7 @@
fewshot_ex=fewshot_ex[:num_effective_fewshots],
system_prompt=system_prompt,
use_chat_template=use_chat_template,
+ cot_prompt=cot_prompt,
)
if not use_chat_template:
toks = self.model.tok_encode(output)
@@ -252,6 +257,7 @@ def get_examples(
fewshot_ex: list[str],
system_prompt: Union[str | None],
use_chat_template: bool,
+ cot_prompt: Union[str | None],
):
examples = []
# Few shot examples
@@ -263,10 +269,12 @@
examples.append(self.doc_to_text(ex, return_instructions=False) + self.doc_to_target(ex))

# Actual example
+ content = example + cot_prompt if cot_prompt is not None else example
+
if use_chat_template:
- examples.append({"role": "user", "content": example})
+ examples.append({"role": "user", "content": content})
else:
- examples.append(example)
+ examples.append(content)

# System prompt and instruction
if use_chat_template:
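Taken together, the last hunk appends cot_prompt verbatim to the final query only (few-shot examples are left untouched), on both the chat-template and plain-prompt paths. A standalone sketch of that behavior (illustrative strings, not lighteval code):

example = "Question: What is 2 + 2?\nAnswer:"
cot_prompt = " Let's think step by step."

content = example + cot_prompt if cot_prompt is not None else example
# content == "Question: What is 2 + 2?\nAnswer: Let's think step by step."

chat_message = {"role": "user", "content": content}  # use_chat_template=True path
plain_prompt = content  # plain-prompt path

Because the concatenation inserts no separator, callers should include any leading space or newline in the cot_prompt string itself.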