From 61e11ef4af9759d3a4af3c7a52f18294c495926a Mon Sep 17 00:00:00 2001
From: Sadra Barikbin
Date: Tue, 23 Jul 2024 16:20:42 +0330
Subject: [PATCH 01/14] Working on fewshot

---
 src/lighteval/tasks/default_prompts.py |  8 ++------
 src/lighteval/tasks/prompt_manager.py  |  8 +++-----
 src/lighteval/tasks/requests.py        | 15 ++++++---------
 3 files changed, 11 insertions(+), 20 deletions(-)

diff --git a/src/lighteval/tasks/default_prompts.py b/src/lighteval/tasks/default_prompts.py
index 3a9b97f07..136acb77b 100644
--- a/src/lighteval/tasks/default_prompts.py
+++ b/src/lighteval/tasks/default_prompts.py
@@ -176,7 +176,6 @@ def bbh_harness(line, task_name: str = None):
         query=query,
         choices=choices,
         gold_index=correct_index,
-        target_for_fewshot_sorting=choices,
         instruction=line.get("task_prefix", None),
     )
 
@@ -196,7 +195,6 @@ def bbh_lighteval(line, task_name: str = None):
         query=query,
         choices=LETTER_INDICES[: len(line["choices"])],
         gold_index=line["target_idx"],
-        target_for_fewshot_sorting=LETTER_INDICES[: len(line["choices"])],
         instruction=line.get("task_prefix", None),
     )
 
@@ -205,9 +203,8 @@ def bbh(line, instruction, choices, task_name: str = None):
     return Doc(
         task_name=task_name,
         query=f"{instruction}Q: {line['input']}\nA:",
-        choices=choices,
+        choices=[(' ' if line["__few_shots"] else '') + c for c in choices],
         gold_index=choices.index(line["target"]),
-        target_for_fewshot_sorting=[f" {c}" for c in choices],
         instruction=instruction,
     )
 
@@ -793,10 +790,9 @@ def hellaswag_helm(line, task_name: str = None):
     return Doc(
         task_name=task_name,
         query=query,
-        choices=[" " + i for i in LETTER_INDICES[: len(line["endings"])]],
+        choices=[" " + i for i in LETTER_INDICES[: len(line["endings"])]] + ([""] if line["__fewshot"] else []),
         gold_index=gold_ix,  # -1 for test,
         instruction="The following are multiple choice questions (with answers) about common sense.\n\n",
-        target_for_fewshot_sorting=line["endings"][gold_ix] if gold_ix > -1 else "",
         specific={
             "label_to_choices": {f" {key}": choice for key, choice in zip(LETTER_INDICES, line["endings"])},
         },
diff --git a/src/lighteval/tasks/prompt_manager.py b/src/lighteval/tasks/prompt_manager.py
index ad4c8fc87..ff38f9224 100644
--- a/src/lighteval/tasks/prompt_manager.py
+++ b/src/lighteval/tasks/prompt_manager.py
@@ -65,20 +65,18 @@ def doc_to_text(doc: Doc, return_instructions: bool = False) -> Union[str, Tuple
         )
 
     @staticmethod
-    def doc_to_target(formatted_doc: Doc, few_shot: bool = False) -> str:
+    def doc_to_target(formatted_doc: Doc) -> str:
         """
         Returns the target of the given document.
 
         Args:
             formatted_doc (Doc): Formatted document.
-            few_shot (bool, optional): Whether the document is used for few
-                shot examples. Defaults to False.
 
         Returns:
             str: Target of the document, which is the correct answer for a document.
""" # likely we mostly need one example not all - return as_list(formatted_doc.get_golds(few_shot=few_shot))[0] + return as_list(formatted_doc.get_golds())[0] def add_context_to_doc( self, @@ -363,7 +361,7 @@ def _init_fewshot_sampling_balanced( # Sort by counts of labels label_to_instances = defaultdict(list) for instance in fewshotpool: - target = PromptManager.doc_to_target(instance, few_shot=True) + target = instance.get_target_for_fewshot_sorting() label_to_instances[target].append(instance) counts_to_labels = defaultdict(list) diff --git a/src/lighteval/tasks/requests.py b/src/lighteval/tasks/requests.py index dd8b0d6d8..041b55067 100644 --- a/src/lighteval/tasks/requests.py +++ b/src/lighteval/tasks/requests.py @@ -177,7 +177,7 @@ class Doc: # For few-shot instruction: Optional[str] = "" - target_for_fewshot_sorting: Optional[str] = None # will probably have to be removed in the future + target_for_fewshot_sorting: Optional[str] = None # Filled when parsing and adding the few-shot context ctx: Optional[str] = "" @@ -193,19 +193,16 @@ def __post_init__(self): if self.instruction is None: self.instruction = "" - def get_golds(self, few_shot: bool = False): + def get_golds(self): """Return gold targets extracted from the target dict""" gold_indices = as_list(self.gold_index) - if few_shot and self.target_for_fewshot_sorting is not None: - choices = self.target_for_fewshot_sorting - if isinstance(choices, str): # correct choice is already selected - return choices - else: - choices = self.choices golds = [] for gold_ix in gold_indices: - golds.extend(as_list(choices[gold_ix])) + golds.extend(as_list(self.choices[gold_ix])) return golds + + def get_target_for_fewshot_sorting(self) -> str: + return self.target_for_fewshot_sorting or as_list(self.get_golds())[0] def __repr__(self): doc_dict = asdict(self) From f7ed5ffc9adeb461d7f6738f7aef2f9f0e395a37 Mon Sep 17 00:00:00 2001 From: Sadra Barikbin Date: Sat, 27 Jul 2024 08:54:08 +0330 Subject: [PATCH 02/14] Adapt prompts to removing target_for_fewshot_sorting --- src/lighteval/tasks/default_prompts.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/src/lighteval/tasks/default_prompts.py b/src/lighteval/tasks/default_prompts.py index 136acb77b..6db85f9e2 100644 --- a/src/lighteval/tasks/default_prompts.py +++ b/src/lighteval/tasks/default_prompts.py @@ -1348,7 +1348,6 @@ def mmlu(line, topic, task_name: str = None): choices=[" A", " B", " C", " D"] if is_few_shots else ["A", "B", "C", "D"], gold_index=gold_ix, instruction=f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n", - target_for_fewshot_sorting=[" A", " B", " C", " D"][gold_ix], ) @@ -1369,7 +1368,6 @@ def custom_mmlu_thom(line, task_name: str = None): choices=[" A", " B", " C", " D"] if is_few_shots else ["A", "B", "C", "D"], gold_index=gold_ix, instruction=f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n", - target_for_fewshot_sorting=[" A", " B", " C", " D"][gold_ix], ) @@ -1609,7 +1607,6 @@ def mmlu_harness(line, task_name: str = None): query += "Answer:" gold_ix = LETTER_INDICES.index(line["answer"]) if isinstance(line["answer"], str) else line["answer"] - "__few_shots" in line and line["__few_shots"] is True # We are adding few shots return Doc( task_name=task_name, @@ -1617,7 +1614,6 @@ def mmlu_harness(line, task_name: str = None): choices=[" A", " B", " C", " D"], gold_index=gold_ix, instruction=f"The following are multiple choice questions (with 
answers) about {topic.replace('_', ' ')}.\n\n", - target_for_fewshot_sorting=[" A", " B", " C", " D"][gold_ix], ) @@ -1628,14 +1624,14 @@ def mmlu_helm(line, task_name: str = None): query += "\nAnswer:" gold_ix = LETTER_INDICES.index(line["answer"]) if isinstance(line["answer"], str) else line["answer"] + is_few_shots = line.get("__few_shots", False) # We are adding few shots return Doc( task_name=task_name, query=query, - choices=[" A", " B", " C", " D"], + choices=[" A", " B", " C", " D"] if not is_few_shots else ["A", "B", "C", "D"], # specific to HELM evals gold_index=gold_ix, instruction=f"The following are multiple choice questions (with answers) about {subject.replace('_', ' ')}.\n\n", - target_for_fewshot_sorting=line["choices"][gold_ix], # specific to HELM evals ) @@ -1794,6 +1790,7 @@ def openbookqa_helm(line, task_name: str = None): query += "Answer: " gold_ix = ["A", "B", "C", "D", "E"].index(line["answerKey"].strip()) + # I don't get this. return Doc( task_name=task_name, query=query, @@ -1821,7 +1818,7 @@ def piqa_helm(line, task_name: str = None): query += "Answer: " gold_ix = int(line["label"]) - + # Also this. return Doc( task_name=task_name, query=query, @@ -1861,7 +1858,7 @@ def pubmed_qa_helm(line, task_name: str = None): ) query += f"\n\nQuestion: {line['question']}\nAnswer: " gold_ix = ["yes", "no", "maybe"].index(line["final_decision"]) - + # And this return Doc( task_name=task_name, query=query, @@ -2247,7 +2244,7 @@ def truthful_qa_helm(line, task_name: str = None): query = f"Question: {line['question']}\n" query += "".join([f"{key}. {choice}\n" for key, choice in zip(LETTER_INDICES, line["choices"])]) query += "Answer:" - + # And this. return Doc( task_name=task_name, query=query, From 6146844b7137e6d4c126acf739c75e0034a57d3f Mon Sep 17 00:00:00 2001 From: Sadra Barikbin Date: Sat, 27 Jul 2024 09:30:46 +0330 Subject: [PATCH 03/14] Fix a bug related to target_for_fewshot_sorting --- src/lighteval/tasks/default_prompts.py | 8 -------- src/lighteval/tasks/lighteval_task.py | 6 ++---- 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/src/lighteval/tasks/default_prompts.py b/src/lighteval/tasks/default_prompts.py index 6db85f9e2..902de3d14 100644 --- a/src/lighteval/tasks/default_prompts.py +++ b/src/lighteval/tasks/default_prompts.py @@ -1790,14 +1790,12 @@ def openbookqa_helm(line, task_name: str = None): query += "Answer: " gold_ix = ["A", "B", "C", "D", "E"].index(line["answerKey"].strip()) - # I don't get this. return Doc( task_name=task_name, query=query, choices=["A", "B", "C", "D", "E"], gold_index=gold_ix, instruction="The following are multiple choice questions (with answers) about common sense.\n", - target_for_fewshot_sorting=line["choices"]["text"][gold_ix], # specific to HELM evals ) @@ -1818,14 +1816,12 @@ def piqa_helm(line, task_name: str = None): query += "Answer: " gold_ix = int(line["label"]) - # Also this. 
     return Doc(
         task_name=task_name,
         query=query,
         choices=["A", "B"],
         gold_index=gold_ix,
         instruction="The following are multiple choice questions (with answers) about common sense.\n",
-        target_for_fewshot_sorting=[line["sol1"], line["sol2"]][gold_ix],
     )
 
 
@@ -1858,13 +1854,11 @@ def pubmed_qa_helm(line, task_name: str = None):
     )
     query += f"\n\nQuestion: {line['question']}\nAnswer: "
     gold_ix = ["yes", "no", "maybe"].index(line["final_decision"])
-    # And this
     return Doc(
         task_name=task_name,
         query=query,
         choices=["A", "B", "C"],
         gold_index=gold_ix,
-        target_for_fewshot_sorting=["yes", "no", "maybe"][gold_ix],
     )
 
 
@@ -2244,13 +2238,11 @@ def truthful_qa_helm(line, task_name: str = None):
     query = f"Question: {line['question']}\n"
     query += "".join([f"{key}. {choice}\n" for key, choice in zip(LETTER_INDICES, line["choices"])])
     query += "Answer:"
-    # And this.
     return Doc(
         task_name=task_name,
         query=query,
         choices=LETTER_INDICES[: len(line["choices"])],
         gold_index=line["gold_index"],
-        target_for_fewshot_sorting=line["choices"][line["gold_index"]],
     )
 
diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py
index 00b4763b8..e64116078 100644
--- a/src/lighteval/tasks/lighteval_task.py
+++ b/src/lighteval/tasks/lighteval_task.py
@@ -340,20 +340,18 @@ def eval_docs(self) -> list[Doc]:
         self._docs = self.remove_duplicate_docs(self._docs)
         return self._docs
 
-    def doc_to_target(self, formatted_doc: Doc, few_shot: bool = False) -> str:
+    def doc_to_target(self, formatted_doc: Doc) -> str:
         """
         Returns the target of the given document.
 
         Args:
             formatted_doc (Doc): Formatted document.
-            few_shot (bool, optional): Whether the document is used for few
-                shot examples. Defaults to False.
 
         Returns:
             str: Target of the document, which is the correct answer for a document.
""" # likely we mostly need one example not all - return as_list(formatted_doc.get_golds(few_shot=few_shot))[0] + return as_list(formatted_doc.get_golds())[0] def construct_requests( self, formatted_doc: Doc, context: str, document_id_seed: str, current_task_name: str From dc8134369ef45c0946e17da5d9b208ff1788c82f Mon Sep 17 00:00:00 2001 From: Sadra Barikbin Date: Sat, 17 Aug 2024 20:15:46 +0330 Subject: [PATCH 04/14] Fix a tiny bug and apply ruff --- src/lighteval/tasks/default_prompts.py | 6 +++--- src/lighteval/tasks/prompt_manager.py | 4 +--- src/lighteval/tasks/requests.py | 2 +- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/lighteval/tasks/default_prompts.py b/src/lighteval/tasks/default_prompts.py index 902de3d14..37ed922ea 100644 --- a/src/lighteval/tasks/default_prompts.py +++ b/src/lighteval/tasks/default_prompts.py @@ -203,7 +203,7 @@ def bbh(line, instruction, choices, task_name: str = None): return Doc( task_name=task_name, query=f"{instruction}Q: {line['input']}\nA:", - choices=[(' ' if line["__few_shots"] else '') + c for c in choices], + choices=[(" " if line["__few_shots"] else "") + c for c in choices], gold_index=choices.index(line["target"]), instruction=instruction, ) @@ -790,7 +790,7 @@ def hellaswag_helm(line, task_name: str = None): return Doc( task_name=task_name, query=query, - choices=[" " + i for i in LETTER_INDICES[: len(line["endings"])]] + ([""] if line["__fewshot"] else []), + choices=[" " + i for i in LETTER_INDICES[: len(line["endings"])]] + ([""] if line["__few_shot"] else []), gold_index=gold_ix, # -1 for test, instruction="The following are multiple choice questions (with answers) about common sense.\n\n", specific={ @@ -1629,7 +1629,7 @@ def mmlu_helm(line, task_name: str = None): return Doc( task_name=task_name, query=query, - choices=[" A", " B", " C", " D"] if not is_few_shots else ["A", "B", "C", "D"], # specific to HELM evals + choices=[" A", " B", " C", " D"] if not is_few_shots else ["A", "B", "C", "D"], # specific to HELM evals gold_index=gold_ix, instruction=f"The following are multiple choice questions (with answers) about {subject.replace('_', ' ')}.\n\n", ) diff --git a/src/lighteval/tasks/prompt_manager.py b/src/lighteval/tasks/prompt_manager.py index ff38f9224..d29f8360b 100644 --- a/src/lighteval/tasks/prompt_manager.py +++ b/src/lighteval/tasks/prompt_manager.py @@ -253,9 +253,7 @@ def get_examples( class FewShotSelectionMethod: sorting: str # sorting method for the overall few shot pool (balanced, random, sequential) with_sampling: bool # samples item randomly from the few shot pool - fewshotpool_unique: ( - bool - ) # set to true if you are CERTAIN there is no intersection between the few shot pool and your evaluation set + fewshotpool_unique: bool # set to true if you are CERTAIN there is no intersection between the few shot pool and your evaluation set class FewShotSelection(Enum): diff --git a/src/lighteval/tasks/requests.py b/src/lighteval/tasks/requests.py index 041b55067..a82521ed5 100644 --- a/src/lighteval/tasks/requests.py +++ b/src/lighteval/tasks/requests.py @@ -200,7 +200,7 @@ def get_golds(self): for gold_ix in gold_indices: golds.extend(as_list(self.choices[gold_ix])) return golds - + def get_target_for_fewshot_sorting(self) -> str: return self.target_for_fewshot_sorting or as_list(self.get_golds())[0] From aa36d2d1201c1a0b7bd50fd723ba731c5a16405b Mon Sep 17 00:00:00 2001 From: Sadra Barikbin Date: Sat, 16 Nov 2024 20:08:17 +0330 Subject: [PATCH 05/14] Apply review comments --- 
 src/lighteval/tasks/default_prompts.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/lighteval/tasks/default_prompts.py b/src/lighteval/tasks/default_prompts.py
index 37ed922ea..24affdd90 100644
--- a/src/lighteval/tasks/default_prompts.py
+++ b/src/lighteval/tasks/default_prompts.py
@@ -200,10 +200,11 @@ def bbh_lighteval(line, task_name: str = None):
 
 
 def bbh(line, instruction, choices, task_name: str = None):
+    is_few_shots = line.get("__few_shots", False)
     return Doc(
         task_name=task_name,
         query=f"{instruction}Q: {line['input']}\nA:",
-        choices=[(" " if line["__few_shots"] else "") + c for c in choices],
+        choices=[(" " if is_few_shots else "") + c for c in choices],
         gold_index=choices.index(line["target"]),
         instruction=instruction,
     )
@@ -790,9 +791,10 @@ def hellaswag_helm(line, task_name: str = None):
     return Doc(
         task_name=task_name,
         query=query,
-        choices=[" " + i for i in LETTER_INDICES[: len(line["endings"])]] + ([""] if line["__few_shot"] else []),
+        choices=[" " + i for i in LETTER_INDICES[: len(line["endings"])]],
         gold_index=gold_ix,  # -1 for test,
         instruction="The following are multiple choice questions (with answers) about common sense.\n\n",
+        target_for_fewshot_sorting=LETTER_INDICES[gold_ix] if gold_ix > -1 else "",
         specific={
             "label_to_choices": {f" {key}": choice for key, choice in zip(LETTER_INDICES, line["endings"])},
         },
@@ -1629,7 +1631,8 @@ def mmlu_helm(line, task_name: str = None):
     return Doc(
         task_name=task_name,
         query=query,
-        choices=[" A", " B", " C", " D"] if not is_few_shots else ["A", "B", "C", "D"],  # specific to HELM evals
+        choices=[" A", " B", " C", " D"] if not is_few_shots else line["choices"],
+        target_for_fewshot_sorting=line["answer"],
         gold_index=gold_ix,
         instruction=f"The following are multiple choice questions (with answers) about {subject.replace('_', ' ')}.\n\n",
     )

From 656a03fdd5c2f9c3c8addb5d6d7411afc1261358 Mon Sep 17 00:00:00 2001
From: Sadra Barikbin
Date: Sat, 16 Nov 2024 20:20:20 +0330
Subject: [PATCH 06/14] Update piqa_helm

---
 src/lighteval/tasks/default_prompts.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/lighteval/tasks/default_prompts.py b/src/lighteval/tasks/default_prompts.py
index 24affdd90..be8d5d432 100644
--- a/src/lighteval/tasks/default_prompts.py
+++ b/src/lighteval/tasks/default_prompts.py
@@ -1819,12 +1819,14 @@ def piqa_helm(line, task_name: str = None):
     query += "Answer: "
 
     gold_ix = int(line["label"])
+    is_few_shots = line.get("__few_shots", False)
     return Doc(
         task_name=task_name,
         query=query,
-        choices=["A", "B"],
+        choices=["A", "B"] if not is_few_shots else [line["sol1"], line["sol2"]],
         gold_index=gold_ix,
         instruction="The following are multiple choice questions (with answers) about common sense.\n",
+        target_for_fewshot_sorting=["A", "B"][gold_ix],
     )

From c5c9936abdc669e0ebbac2b9ee38828bf38be703 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9mentine=20Fourrier?= <22726840+clefourrier@users.noreply.github.com>
Date: Mon, 18 Nov 2024 10:58:07 +0100
Subject: [PATCH 07/14] Update requests.py

---
 src/lighteval/tasks/requests.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/lighteval/tasks/requests.py b/src/lighteval/tasks/requests.py
index a82521ed5..b08b0d049 100644
--- a/src/lighteval/tasks/requests.py
+++ b/src/lighteval/tasks/requests.py
@@ -177,7 +177,6 @@ class Doc:
 
     # For few-shot
     instruction: Optional[str] = ""
-    target_for_fewshot_sorting: Optional[str] = None
 
     # Filled when parsing and adding the few-shot context
     ctx: Optional[str] = ""
@@ -201,9 +200,6 @@ def get_golds(self):
         golds.extend(as_list(self.choices[gold_ix]))
         return golds
 
-    def get_target_for_fewshot_sorting(self) -> str:
-        return self.target_for_fewshot_sorting or as_list(self.get_golds())[0]
-
     def __repr__(self):
         doc_dict = asdict(self)
         return json.dumps(doc_dict)

From c54586331ab8231c2cbee60391c7ce14ac01ea37 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9mentine=20Fourrier?= <22726840+clefourrier@users.noreply.github.com>
Date: Mon, 18 Nov 2024 11:04:49 +0100
Subject: [PATCH 08/14] Update prompt_manager.py

---
 src/lighteval/tasks/prompt_manager.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lighteval/tasks/prompt_manager.py b/src/lighteval/tasks/prompt_manager.py
index d29f8360b..6483dd643 100644
--- a/src/lighteval/tasks/prompt_manager.py
+++ b/src/lighteval/tasks/prompt_manager.py
@@ -359,7 +359,7 @@ def _init_fewshot_sampling_balanced(
         # Sort by counts of labels
         label_to_instances = defaultdict(list)
         for instance in fewshotpool:
-            target = instance.get_target_for_fewshot_sorting()
+            target = PromptManager.doc_to_target(instance)
             label_to_instances[target].append(instance)
 
         counts_to_labels = defaultdict(list)

From 3d967340d0c5c888462c1df400bf32032039642c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9mentine=20Fourrier?= <22726840+clefourrier@users.noreply.github.com>
Date: Mon, 18 Nov 2024 11:06:12 +0100
Subject: [PATCH 09/14] removing doc_to_target from task as it's now in the prompt manager

---
 src/lighteval/tasks/lighteval_task.py | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py
index e64116078..9c1d18d41 100644
--- a/src/lighteval/tasks/lighteval_task.py
+++ b/src/lighteval/tasks/lighteval_task.py
@@ -340,19 +340,6 @@ def eval_docs(self) -> list[Doc]:
         self._docs = self.remove_duplicate_docs(self._docs)
         return self._docs
 
-    def doc_to_target(self, formatted_doc: Doc) -> str:
-        """
-        Returns the target of the given document.
-
-        Args:
-            formatted_doc (Doc): Formatted document.
-
-        Returns:
-            str: Target of the document, which is the correct answer for a document.
-        """
-        # likely we mostly need one example not all
-        return as_list(formatted_doc.get_golds())[0]
-
     def construct_requests(
         self, formatted_doc: Doc, context: str, document_id_seed: str, current_task_name: str
     ) -> Dict[RequestType, List[Request]]:

From 05217d012262364c48c3cbe0a3bd0de3cdc08fab Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9mentine?=
Date: Mon, 18 Nov 2024 12:24:54 +0100
Subject: [PATCH 10/14] added class for few shot back

---
 src/lighteval/tasks/prompt_manager.py | 25 ++++++++++++++++++++-----
 src/lighteval/tasks/requests.py       |  1 +
 2 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/src/lighteval/tasks/prompt_manager.py b/src/lighteval/tasks/prompt_manager.py
index c052e9188..7555b72a0 100644
--- a/src/lighteval/tasks/prompt_manager.py
+++ b/src/lighteval/tasks/prompt_manager.py
@@ -75,9 +75,24 @@ def doc_to_target(formatted_doc: Doc) -> str:
         Returns:
             str: Target of the document, which is the correct answer for a document.
         """
-        # likely we mostly need one example not all
         return as_list(formatted_doc.get_golds())[0]
 
+    @staticmethod
+    def doc_to_fewshot_sorting_class(formatted_doc: Doc) -> str:
+        """
+        In some cases, when selecting few-shot samples, we want to use specific document classes
+        which need to be specified separately from the target.
+        For example, a document where the gold is a json might want to use only one of the keys of
+        the json to define sorting classes in few shot samples. Else we take the gold.
+
+        Args:
+            formatted_doc (Doc): Formatted document.
+
+        Returns:
+            str: Class of the document.
+        """
+        return formatted_doc.fewshot_sorting_class or PromptManager.doc_to_target(formatted_doc)
+
     def add_context_to_doc(
         self,
         doc: Doc,
@@ -352,16 +367,16 @@ def _init_fewshot_sampling_balanced(
     ):
         fewshotpool = self.task.fewshot_docs()
 
-        # rnd = random.Random(variance_seed)
         random.seed(variance_seed)
 
-        # Build up balanced selection based on labels
-        # Sort by counts of labels
+        # Build up balanced selection based on fewshot_sorting_class
+        # (or the gold target, if the class is undefined)
         label_to_instances = defaultdict(list)
         for instance in fewshotpool:
-            target = PromptManager.doc_to_target(instance)
+            target = PromptManager.doc_to_fewshot_sorting_class(instance)
             label_to_instances[target].append(instance)
 
+        # Sort by counts of class labels
         counts_to_labels = defaultdict(list)
         for label, instances in sorted(label_to_instances.items()):
             counts_to_labels[len(instances)].append(label)
diff --git a/src/lighteval/tasks/requests.py b/src/lighteval/tasks/requests.py
index 6d999723a..cd75ad402 100644
--- a/src/lighteval/tasks/requests.py
+++ b/src/lighteval/tasks/requests.py
@@ -178,6 +178,7 @@ class Doc:
 
     # For few-shot
     instruction: Optional[str] = ""
+    fewshot_sorting_class: Optional[str] = None  # class to use to select balanced few-shot samples
 
     # Filled when parsing and adding the few-shot context
     ctx: Optional[str] = ""

From 85ddfa9e5985533a1ccbc7d74a5b74b8065b4a8b Mon Sep 17 00:00:00 2001
From: "clementine@huggingface.co"
Date: Mon, 18 Nov 2024 12:22:41 +0000
Subject: [PATCH 11/14] correct community tasks

---
 community_tasks/arabic_evals.py              |  9 --------
 community_tasks/serbian_eval.py              |  6 ------
 examples/model_configs/test.yaml             | 22 ++++++++++++++++++++
 examples/nanotron/custom_evaluation_tasks.py |  1 -
 examples/nanotron/custom_task.py             |  4 ++--
 5 files changed, 24 insertions(+), 18 deletions(-)
 create mode 100644 examples/model_configs/test.yaml

diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py
index 323120cd7..f575b5f07 100644
--- a/community_tasks/arabic_evals.py
+++ b/community_tasks/arabic_evals.py
@@ -72,7 +72,6 @@ def mmlu_arabic(line, task_name: str = None):
         choices=LETTER_INDICES_AR[:4],
         gold_index=gold_ix,
         instruction=instruction,
-        target_for_fewshot_sorting=LETTER_INDICES_AR[gold_ix],
     )
 
 
@@ -181,7 +180,6 @@ def arabic_exams(line, task_name: str = None):
         choices=LETTER_INDICES_AR[:4],
         gold_index=answer_index,
         instruction=instruction,
-        target_for_fewshot_sorting=choices[answer_index],
     )
 
 
@@ -231,7 +229,6 @@ def alghafa_prompt(line, task_name: str = None):
         choices=choices,
         gold_index=answer_index,
         instruction=instruction,
-        target_for_fewshot_sorting=choices[answer_index],
     )
 
 
@@ -371,7 +368,6 @@ def __init__(
 def boolq_prompt_arabic(line, task_name: str = None):
     question = line["question"]
     passage = line["passage"]
-    answer = "نعم" if line["answer"] else "لا"
     instruction = "بناء على المقطع التالي، أجب عن السؤال ب نعم أو لا"
     query = f"""{instruction}
 المقطع :
@@ -387,7 +383,6 @@ def boolq_prompt_arabic(line, task_name: str = None):
         choices=["نعم", "لا"],
         gold_index=0 if line["answer"] else 1,
         instruction=instruction,
-        target_for_fewshot_sorting=answer,
     )
 
 
@@ -423,7 +418,6 @@ def copa_prompt_arabic(line, task_name: str = None):
         choices=choices,
         gold_index=answer,
         instruction="",
-        target_for_fewshot_sorting=choices[answer],
     )
 
 
@@ -468,7 +462,6 @@ def hellaswag_prompt_arabic(line, task_name: str = None):
         choices=endings,
         gold_index=answer_index,
         instruction=instruction,
-        target_for_fewshot_sorting=endings[answer_index],
     )
 
 
@@ -506,7 +499,6 @@ def toxigen_prompt_arabic(line, task_name: str = None):
         choices=["لا", "نعم"],
         gold_index=label,
         instruction=instruction,
-        target_for_fewshot_sorting="نعم" if label == 1 else "لا",
     )
 
 
@@ -558,7 +550,6 @@ def sciq_prompt_arabic(line, task_name: str = None):
         choices=choices,
         gold_index=answer_index,
         instruction=instruction,
-        target_for_fewshot_sorting=choices[answer_index],
     )
 
diff --git a/community_tasks/serbian_eval.py b/community_tasks/serbian_eval.py
index e79485107..3b49c4cb0 100644
--- a/community_tasks/serbian_eval.py
+++ b/community_tasks/serbian_eval.py
@@ -200,8 +200,6 @@ def serbian_eval_prompt(line: dict, task_name: Optional[str] = None) -> Doc:
         - choices (list of str): The list of available answer choices.
         - gold_index (int): The index of the correct answer.
         - instruction (str): The instruction shown to the user in Serbian.
-        - target_for_fewshot_sorting (Union[str, list of str]): The correct answer, either as a
-          string (for regular tasks) or a list of strings (for MMLU tasks).
     """
 
     question = line["query"]
@@ -226,16 +224,12 @@ def serbian_eval_prompt(line: dict, task_name: Optional[str] = None) -> Doc:
 
     query += "\n\nKrajnji odgovor:"
 
-    # Finalize target_for_fewshot_sorting as we handle mmlu task group as string
-    target_for_fewshot_sorting = [choices[gold_index]] if task_name and "mmlu" in task_name else choices[gold_index]
-
     return Doc(
         task_name=task_name,
         query=query,
         choices=choices,
         gold_index=gold_index,
         instruction=instruction,
-        target_for_fewshot_sorting=target_for_fewshot_sorting,
     )
 
diff --git a/examples/model_configs/test.yaml b/examples/model_configs/test.yaml
new file mode 100644
index 000000000..9e68c5307
--- /dev/null
+++ b/examples/model_configs/test.yaml
@@ -0,0 +1,22 @@
+model:
+  type: "endpoint"
+  base_params:
+    endpoint_name: "smollm-360m-instruct-v0-2-q8-lvy" # needs to be lower case without special characters
+    model: HuggingFaceTB/SmolLM-360M-Instruct
+    revision: "main"
+    dtype: "default" # can be any of "awq", "eetq", "gptq", "4bit" or "8bit" (will use bitsandbytes), "bfloat16" or "float16"
+    reuse_existing: true # if true, ignore all params in instance, and don't delete the endpoint after evaluation
+  instance:
+    accelerator: "gpu"
+    region: "eu-west-1"
+    vendor: "aws"
+    instance_size: "medium"
+    instance_type: "g5.2xlarge"
+    framework: "pytorch"
+    endpoint_type: "protected"
+    namespace: null # The namespace under which to launch the endpoint. Defaults to the current user's namespace
+    image_url: null # Optionally specify the docker image to use when launching the endpoint model. E.g., launching models with later releases of the TGI container with support for newer models.
+    env_vars:
+      null # Optional environment variables to include when launching the endpoint. e.g., `MAX_INPUT_LENGTH: 2048`
+  generation:
+    add_special_tokens: true
diff --git a/examples/nanotron/custom_evaluation_tasks.py b/examples/nanotron/custom_evaluation_tasks.py
index 6d4edd62c..9ae066715 100644
--- a/examples/nanotron/custom_evaluation_tasks.py
+++ b/examples/nanotron/custom_evaluation_tasks.py
@@ -333,7 +333,6 @@ def mmlu_harness(line, task_name: str = None):
         task_name=task_name,
         query=prompt,
         choices=[" A", " B", " C", " D"],
-        target_for_fewshot_sorting=[" A", " B", " C", " D"][gold_ix],
         gold_index=gold_ix,
         instruction=f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n",
     )
diff --git a/examples/nanotron/custom_task.py b/examples/nanotron/custom_task.py
index 493323214..05cea969f 100644
--- a/examples/nanotron/custom_task.py
+++ b/examples/nanotron/custom_task.py
@@ -36,7 +36,7 @@ def mmlu_signs(line, topic):
     return {
         "query": prompt,
         "choices": [" +", " *", " =", " #"] if is_few_shots else ["+", "*", "=", "#"],
-        "target_for_fewshot_sorting": [" +", " *", " =", " #"][gold_ix],
+        "fewshot_sorting_class": [" +", " *", " =", " #"][gold_ix],
         "gold_index": gold_ix,
         "instruction": f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n",
     }
@@ -58,7 +58,7 @@ def mmlu_numbers(line, topic):
     return {
         "query": prompt,
         "choices": [" 1", " 2", " 3", " 4"] if is_few_shots else ["1", "2", "3", "4"],
-        "target_for_fewshot_sorting": [" 1", " 2", " 3", " 4"][gold_ix],
+        "fewshot_sorting_class": [" 1", " 2", " 3", " 4"][gold_ix],
        "gold_index": gold_ix,
         "instruction": f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n",
     }

From 0c651c01dd124ded3c32a16cae49c4899f7a1b23 Mon Sep 17 00:00:00 2001
From: "clementine@huggingface.co"
Date: Mon, 18 Nov 2024 12:23:57 +0000
Subject: [PATCH 12/14] updated tests

---
 src/lighteval/tasks/default_prompts.py      | 4 ++--
 tests/reference_scores/harness_prompts.json | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/lighteval/tasks/default_prompts.py b/src/lighteval/tasks/default_prompts.py
index 67d9c2c2c..e5c47e79a 100644
--- a/src/lighteval/tasks/default_prompts.py
+++ b/src/lighteval/tasks/default_prompts.py
@@ -1625,13 +1625,13 @@ def mmlu_helm(line, task_name: str = None):
     query += "\nAnswer:"
 
     gold_ix = LETTER_INDICES.index(line["answer"]) if isinstance(line["answer"], str) else line["answer"]
-    is_few_shots = line.get("__few_shots", False)  # We are adding few shots
 
     return Doc(
         task_name=task_name,
         query=query,
-        choices=[" A", " B", " C", " D"] if not is_few_shots else line["choices"],
+        choices=[" A", " B", " C", " D"],
         gold_index=gold_ix,
+        fewshot_sorting_class=line["choices"][gold_ix],
         instruction=f"The following are multiple choice questions (with answers) about {subject.replace('_', ' ')}.\n\n",
     )
diff --git a/tests/reference_scores/harness_prompts.json b/tests/reference_scores/harness_prompts.json
index b79a6637a..6cd942efa 100644
--- a/tests/reference_scores/harness_prompts.json
+++ b/tests/reference_scores/harness_prompts.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:02a5551e1137c799c9a1535112d221c7a77fd07b72c2b38b640164be7ea70828
-size 20246141
+oid sha256:4d7055452bb1f282b8b2c040a3a30856f51aa8d44fe80e2c391cbbc375a19b95
+size 20244716

From 63edb8e8f5bdde600db20f32ae740d227da9ced5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9mentine=20Fourrier?= <22726840+clefourrier@users.noreply.github.com>
Date: Mon, 18 Nov 2024 16:45:26 +0100
Subject: [PATCH 13/14] Code review

---
 src/lighteval/tasks/default_prompts.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lighteval/tasks/default_prompts.py b/src/lighteval/tasks/default_prompts.py
index e5c47e79a..0e2b4eb48 100644
--- a/src/lighteval/tasks/default_prompts.py
+++ b/src/lighteval/tasks/default_prompts.py
@@ -204,7 +204,7 @@ def bbh(line, instruction, choices, task_name: str = None):
     return Doc(
         task_name=task_name,
         query=f"{instruction}Q: {line['input']}\nA:",
-        choices=[(" " if is_few_shots else "") + c for c in choices],
+        choices=choices,
         gold_index=choices.index(line["target"]),
         instruction=instruction,
     )

From c72982a5c9caad262d378f4352801d002a38e53d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9mentine?=
Date: Tue, 19 Nov 2024 09:29:08 +0100
Subject: [PATCH 14/14] style

---
 src/lighteval/tasks/default_prompts.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/lighteval/tasks/default_prompts.py b/src/lighteval/tasks/default_prompts.py
index 0e2b4eb48..5b6a33123 100644
--- a/src/lighteval/tasks/default_prompts.py
+++ b/src/lighteval/tasks/default_prompts.py
@@ -200,7 +200,6 @@ def bbh_lighteval(line, task_name: str = None):
 
 
 def bbh(line, instruction, choices, task_name: str = None):
-    is_few_shots = line.get("__few_shots", False)
     return Doc(
         task_name=task_name,
         query=f"{instruction}Q: {line['input']}\nA:",