Skip to content
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 0 additions & 9 deletions community_tasks/arabic_evals.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,6 @@ def mmlu_arabic(line, task_name: str = None):
choices=LETTER_INDICES_AR[:4],
gold_index=gold_ix,
instruction=instruction,
target_for_fewshot_sorting=LETTER_INDICES_AR[gold_ix],
)


Expand Down Expand Up @@ -181,7 +180,6 @@ def arabic_exams(line, task_name: str = None):
choices=LETTER_INDICES_AR[:4],
gold_index=answer_index,
instruction=instruction,
target_for_fewshot_sorting=choices[answer_index],
)


Expand Down Expand Up @@ -231,7 +229,6 @@ def alghafa_prompt(line, task_name: str = None):
choices=choices,
gold_index=answer_index,
instruction=instruction,
target_for_fewshot_sorting=choices[answer_index],
)


Expand Down Expand Up @@ -371,7 +368,6 @@ def __init__(
def boolq_prompt_arabic(line, task_name: str = None):
question = line["question"]
passage = line["passage"]
answer = "نعم" if line["answer"] else "لا"
instruction = "بناء على المقطع التالي، أجب عن السؤال ب نعم أو لا"
query = f"""{instruction}
المقطع :
Expand All @@ -387,7 +383,6 @@ def boolq_prompt_arabic(line, task_name: str = None):
choices=["نعم", "لا"],
gold_index=0 if line["answer"] else 1,
instruction=instruction,
target_for_fewshot_sorting=answer,
)


Expand Down Expand Up @@ -423,7 +418,6 @@ def copa_prompt_arabic(line, task_name: str = None):
choices=choices,
gold_index=answer,
instruction="",
target_for_fewshot_sorting=choices[answer],
)


Expand Down Expand Up @@ -468,7 +462,6 @@ def hellaswag_prompt_arabic(line, task_name: str = None):
choices=endings,
gold_index=answer_index,
instruction=instruction,
target_for_fewshot_sorting=endings[answer_index],
)


Expand Down Expand Up @@ -506,7 +499,6 @@ def toxigen_prompt_arabic(line, task_name: str = None):
choices=["لا", "نعم"],
gold_index=label,
instruction=instruction,
target_for_fewshot_sorting="نعم" if label == 1 else "لا",
)


Expand Down Expand Up @@ -558,7 +550,6 @@ def sciq_prompt_arabic(line, task_name: str = None):
choices=choices,
gold_index=answer_index,
instruction=instruction,
target_for_fewshot_sorting=choices[answer_index],
)


Expand Down
6 changes: 0 additions & 6 deletions community_tasks/serbian_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,8 +200,6 @@ def serbian_eval_prompt(line: dict, task_name: Optional[str] = None) -> Doc:
- choices (list of str): The list of available answer choices.
- gold_index (int): The index of the correct answer.
- instruction (str): The instruction shown to the user in Serbian.
- target_for_fewshot_sorting (Union[str, list of str]): The correct answer, either as a
string (for regular tasks) or a list of strings (for MMLU tasks).
"""

question = line["query"]
Expand All @@ -226,16 +224,12 @@ def serbian_eval_prompt(line: dict, task_name: Optional[str] = None) -> Doc:

query += "\n\nKrajnji odgovor:"

# Finalize target_for_fewshot_sorting as we handle mmlu task group as string
target_for_fewshot_sorting = [choices[gold_index]] if task_name and "mmlu" in task_name else choices[gold_index]

return Doc(
task_name=task_name,
query=query,
choices=choices,
gold_index=gold_index,
instruction=instruction,
target_for_fewshot_sorting=target_for_fewshot_sorting,
)


Expand Down
22 changes: 22 additions & 0 deletions examples/model_configs/test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
model:
type: "endpoint"
base_params:
endpoint_name: "smollm-360m-instruct-v0-2-q8-lvy" # needs to be lower case without special characters
model: HuggingFaceTB/SmolLM-360M-Instruct
revision: "main"
dtype: "default" # can be any of "awq", "eetq", "gptq", "4bit" or "8bit" (will use bitsandbytes), "bfloat16" or "float16"
reuse_existing: true # if true, ignore all params in instance, and don't delete the endpoint after evaluation
instance:
accelerator: "gpu"
region: "eu-west-1"
vendor: "aws"
instance_size: "medium"
instance_type: "g5.2xlarge"
framework: "pytorch"
endpoint_type: "protected"
namespace: null # The namespace under which to launch the endpoint. Defaults to the current user's namespace
image_url: null # Optionally specify the docker image to use when launching the endpoint model. E.g., launching models with later releases of the TGI container with support for newer models.
env_vars:
null # Optional environment variables to include when launching the endpoint. e.g., `MAX_INPUT_LENGTH: 2048`
generation:
add_special_tokens: true
1 change: 0 additions & 1 deletion examples/nanotron/custom_evaluation_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,7 +333,6 @@ def mmlu_harness(line, task_name: str = None):
task_name=task_name,
query=prompt,
choices=[" A", " B", " C", " D"],
target_for_fewshot_sorting=[" A", " B", " C", " D"][gold_ix],
gold_index=gold_ix,
instruction=f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n",
)
Expand Down
4 changes: 2 additions & 2 deletions examples/nanotron/custom_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def mmlu_signs(line, topic):
return {
"query": prompt,
"choices": [" +", " *", " =", " #"] if is_few_shots else ["+", "*", "=", "#"],
"target_for_fewshot_sorting": [" +", " *", " =", " #"][gold_ix],
"fewshot_sorting_class": [" +", " *", " =", " #"][gold_ix],
"gold_index": gold_ix,
"instruction": f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n",
}
Expand All @@ -58,7 +58,7 @@ def mmlu_numbers(line, topic):
return {
"query": prompt,
"choices": [" 1", " 2", " 3", " 4"] if is_few_shots else ["1", "2", "3", "4"],
"target_for_fewshot_sorting": [" 1", " 2", " 3", " 4"][gold_ix],
"fewshot_sorting_class": [" 1", " 2", " 3", " 4"][gold_ix],
"gold_index": gold_ix,
"instruction": f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n",
}
Expand Down
23 changes: 5 additions & 18 deletions src/lighteval/tasks/default_prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,6 @@ def bbh_harness(line, task_name: str = None):
query=query,
choices=choices,
gold_index=correct_index,
target_for_fewshot_sorting=choices,
instruction=line.get("task_prefix", None),
)

Expand All @@ -196,18 +195,17 @@ def bbh_lighteval(line, task_name: str = None):
query=query,
choices=LETTER_INDICES[: len(line["choices"])],
gold_index=line["target_idx"],
target_for_fewshot_sorting=LETTER_INDICES[: len(line["choices"])],
instruction=line.get("task_prefix", None),
)


def bbh(line, instruction, choices, task_name: str = None):
is_few_shots = line.get("__few_shots", False)
return Doc(
task_name=task_name,
query=f"{instruction}Q: {line['input']}\nA:",
choices=choices,
choices=[(" " if is_few_shots else "") + c for c in choices],
gold_index=choices.index(line["target"]),
target_for_fewshot_sorting=[f" {c}" for c in choices],
instruction=instruction,
)

Expand Down Expand Up @@ -799,7 +797,6 @@ def hellaswag_generative(line, task_name: str = None):
choices=[" " + i for i in LETTER_INDICES[: len(line["endings"])]],
gold_index=gold_ix, # -1 for test,
instruction="The following are multiple choice questions (with answers) about common sense.\n\n",
target_for_fewshot_sorting=line["endings"][gold_ix] if gold_ix > -1 else "",
)


Expand Down Expand Up @@ -1352,7 +1349,6 @@ def mmlu(line, topic, task_name: str = None):
choices=[" A", " B", " C", " D"] if is_few_shots else ["A", "B", "C", "D"],
gold_index=gold_ix,
instruction=f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n",
target_for_fewshot_sorting=[" A", " B", " C", " D"][gold_ix],
)


Expand All @@ -1373,7 +1369,6 @@ def custom_mmlu_thom(line, task_name: str = None):
choices=[" A", " B", " C", " D"] if is_few_shots else ["A", "B", "C", "D"],
gold_index=gold_ix,
instruction=f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n",
target_for_fewshot_sorting=[" A", " B", " C", " D"][gold_ix],
)


Expand Down Expand Up @@ -1613,15 +1608,13 @@ def mmlu_harness(line, task_name: str = None):
query += "Answer:"

gold_ix = LETTER_INDICES.index(line["answer"]) if isinstance(line["answer"], str) else line["answer"]
"__few_shots" in line and line["__few_shots"] is True # We are adding few shots

return Doc(
task_name=task_name,
query=query,
choices=[" A", " B", " C", " D"],
gold_index=gold_ix,
instruction=f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n",
target_for_fewshot_sorting=[" A", " B", " C", " D"][gold_ix],
)


Expand All @@ -1638,8 +1631,8 @@ def mmlu_helm(line, task_name: str = None):
query=query,
choices=[" A", " B", " C", " D"],
gold_index=gold_ix,
fewshot_sorting_class=line["choices"][gold_ix],
instruction=f"The following are multiple choice questions (with answers) about {subject.replace('_', ' ')}.\n\n",
target_for_fewshot_sorting=line["choices"][gold_ix], # specific to HELM evals
)


Expand Down Expand Up @@ -1816,7 +1809,6 @@ def openbookqa_helm(line, task_name: str = None):
choices=["A", "B", "C", "D", "E"],
gold_index=gold_ix,
instruction="The following are multiple choice questions (with answers) about common sense.\n",
target_for_fewshot_sorting=line["choices"]["text"][gold_ix], # specific to HELM evals
)


Expand All @@ -1837,14 +1829,13 @@ def piqa_helm(line, task_name: str = None):
query += "Answer: "

gold_ix = int(line["label"])

is_few_shots = line.get("__few_shots", False)
return Doc(
task_name=task_name,
query=query,
choices=["A", "B"],
choices=["A", "B"] if not is_few_shots else [line["sol1"], line["sol2"]],
gold_index=gold_ix,
instruction="The following are multiple choice questions (with answers) about common sense.\n",
target_for_fewshot_sorting=[line["sol1"], line["sol2"]][gold_ix],
)


Expand Down Expand Up @@ -1877,13 +1868,11 @@ def pubmed_qa_helm(line, task_name: str = None):
)
query += f"\n\nQuestion: {line['question']}\nAnswer: "
gold_ix = ["yes", "no", "maybe"].index(line["final_decision"])

return Doc(
task_name=task_name,
query=query,
choices=["A", "B", "C"],
gold_index=gold_ix,
target_for_fewshot_sorting=["yes", "no", "maybe"][gold_ix],
)


Expand Down Expand Up @@ -2263,13 +2252,11 @@ def truthful_qa_helm(line, task_name: str = None):
query = f"Question: {line['question']}\n"
query += "".join([f"{key}. {choice}\n" for key, choice in zip(LETTER_INDICES, line["choices"])])
query += "Answer:"

return Doc(
task_name=task_name,
query=query,
choices=LETTER_INDICES[: len(line["choices"])],
gold_index=line["gold_index"],
target_for_fewshot_sorting=line["choices"][line["gold_index"]],
)


Expand Down
15 changes: 0 additions & 15 deletions src/lighteval/tasks/lighteval_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,21 +340,6 @@ def eval_docs(self) -> list[Doc]:
self._docs = self.remove_duplicate_docs(self._docs)
return self._docs

def doc_to_target(self, formatted_doc: Doc, few_shot: bool = False) -> str:
"""
Returns the target of the given document.

Args:
formatted_doc (Doc): Formatted document.
few_shot (bool, optional): Whether the document is used for few
shot examples. Defaults to False.

Returns:
str: Target of the document, which is the correct answer for a document.
"""
# likely we mostly need one example not all
return as_list(formatted_doc.get_golds(few_shot=few_shot))[0]

def construct_requests(
self, formatted_doc: Doc, context: str, document_id_seed: str, current_task_name: str
) -> Dict[RequestType, List[Request]]:
Expand Down
35 changes: 23 additions & 12 deletions src/lighteval/tasks/prompt_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,20 +65,33 @@ def doc_to_text(doc: Doc, return_instructions: bool = False) -> Union[str, Tuple
)

@staticmethod
def doc_to_target(formatted_doc: Doc, few_shot: bool = False) -> str:
def doc_to_target(formatted_doc: Doc) -> str:
"""
Returns the target of the given document.

Args:
formatted_doc (Doc): Formatted document.
few_shot (bool, optional): Whether the document is used for few
shot examples. Defaults to False.

Returns:
str: Target of the document, which is the correct answer for a document.
"""
# likely we mostly need one example not all
return as_list(formatted_doc.get_golds(few_shot=few_shot))[0]
return as_list(formatted_doc.get_golds())[0]

@staticmethod
def doc_to_fewshot_sorting_class(formatted_doc: Doc) -> str:
"""
In some cases, when selecting few-shot samples, we want to use specific document classes
which need to be specified separately from the target.
For example, a document where the gold is a json might want to use only one of the keys of
the json to define sorting classes in few shot samples. Else we take the gold.

Args:
formatted_doc (Doc): Formatted document.

Returns:
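str: Class of the document used to group few-shot samples: its fewshot_sorting_class if set, otherwise its gold target.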
"""
return formatted_doc.fewshot_sorting_class or PromptManager.doc_to_target(formatted_doc)

def add_context_to_doc(
self,
Expand Down Expand Up @@ -255,9 +268,7 @@ def get_examples(
class FewShotSelectionMethod:
sorting: str # sorting method for the overall few shot pool (balanced, random, sequential)
with_sampling: bool # samples item randomly from the few shot pool
fewshotpool_unique: (
bool
) # set to true if you are CERTAIN there is no intersection between the few shot pool and your evaluation set
fewshotpool_unique: bool # set to true if you are CERTAIN there is no intersection between the few shot pool and your evaluation set


class FewShotSelection(Enum):
Expand Down Expand Up @@ -356,16 +367,16 @@ def _init_fewshot_sampling_balanced(
):
fewshotpool = self.task.fewshot_docs()

# rnd = random.Random(variance_seed)
random.seed(variance_seed)

# Build up balanced selection based on labels
# Sort by counts of labels
# Build up balanced selection based on fewshot_sorting_class
# (or the gold target, if the class is undefined)
label_to_instances = defaultdict(list)
for instance in fewshotpool:
target = PromptManager.doc_to_target(instance, few_shot=True)
target = PromptManager.doc_to_fewshot_sorting_class(instance)
label_to_instances[target].append(instance)

# Sort by counts of class labels
counts_to_labels = defaultdict(list)
for label, instances in sorted(label_to_instances.items()):
counts_to_labels[len(instances)].append(label)
Expand Down
Loading
Loading