Merged
Commits
50 commits
17783b2
Added inference using litellm.
JoelNiklaus Nov 7, 2024
9e92150
Add Udmurt (udm) translation literals (#381)
codemurt Nov 8, 2024
30a624c
This PR adds translation literals for the Belarusian language. (#382)
Kryuski Nov 8, 2024
6e6fed6
fix: cache directory variable (#378)
NazimHAli Nov 8, 2024
d1d4c69
greedy_until() fix (#344)
vsabolcec Nov 8, 2024
f69811f
Fixed some params in completion call to enable more model providers.
JoelNiklaus Nov 11, 2024
dabb4a7
Added diskcache.
JoelNiklaus Nov 13, 2024
65f759c
Merge branch 'main' into add_litellm_inference
JoelNiklaus Nov 20, 2024
f74afd4
Merge branch 'main' into add_litellm_inference
JoelNiklaus Nov 22, 2024
88a9838
Fix issue for openai evaluation.
JoelNiklaus Nov 25, 2024
02ed461
Added support for stop sequences and generation size.
JoelNiklaus Nov 26, 2024
34596c2
Merge branch 'main' into add_litellm_inference
JoelNiklaus Nov 26, 2024
190738f
Fixed issue with too many concurrent calls to APIs.
JoelNiklaus Nov 27, 2024
2bb1917
Merge branch 'main' into add_litellm_inference
clefourrier Nov 28, 2024
81e4404
Merge branch 'main' into add_litellm_inference
JoelNiklaus Dec 4, 2024
ebdd900
Merge branch 'main' into add_litellm_inference
NathanHB Dec 5, 2024
251e181
few fixes
NathanHB Dec 6, 2024
47b1888
Fixed issues with stop_sequence, max_completion_tokens and system_pro…
JoelNiklaus Dec 9, 2024
20a1191
Merge branch 'main' into add_litellm_inference
JoelNiklaus Dec 9, 2024
ade8f0c
Revert weird change to __main__.py.
JoelNiklaus Dec 9, 2024
a2587d6
Made configuration simpler.
JoelNiklaus Dec 9, 2024
7c0856e
Merge branch 'main' into add_litellm_inference
JoelNiklaus Dec 12, 2024
932fd2c
Fixed import issues.
JoelNiklaus Dec 12, 2024
8fc9b13
Merge branch 'main' into add_litellm_inference
NathanHB Dec 16, 2024
45d6d1d
fix import location
NathanHB Dec 16, 2024
2a23836
Merge branch 'add_litellm_inference' of github.com:JoelNiklaus/lighte…
NathanHB Dec 16, 2024
cca1446
Merge branch 'main' into add_litellm_inference
JoelNiklaus Dec 16, 2024
1a10351
Enabled passing through system prompt to the models in the requests.
JoelNiklaus Dec 16, 2024
ff6d5de
Fixed some bugs.
JoelNiklaus Dec 17, 2024
8d831b8
Merge branch 'main' into add_litellm_inference
JoelNiklaus Dec 17, 2024
5115403
Made litellm inference robust to content management errors.
JoelNiklaus Dec 17, 2024
78789c1
allow better message management for litellm
NathanHB Dec 17, 2024
3ebff6c
Merge branch 'main' into add_litellm_inference
NathanHB Dec 17, 2024
be77b15
allow system prompt to be passed to litellm models
NathanHB Dec 17, 2024
21d6112
Merge branch 'main' into add_litellm_inference
JoelNiklaus Dec 17, 2024
d045d92
use system prompt from the request and use litellm encode function as…
NathanHB Dec 18, 2024
f1ed682
fixes from review
NathanHB Dec 18, 2024
ec306fd
Merge branch 'add_litellm_inference' of github.com:JoelNiklaus/lighte…
NathanHB Dec 18, 2024
bae4506
fix tests
NathanHB Dec 18, 2024
6b0cb60
fix tests
NathanHB Dec 18, 2024
c826b0e
Merge branch 'main' into add_litellm_inference
JoelNiklaus Dec 18, 2024
a6747f4
remove unnecessary doc
NathanHB Dec 19, 2024
5554787
Merge branch 'add_litellm_inference' of github.com:JoelNiklaus/lighte…
NathanHB Dec 19, 2024
5b2b72d
Update src/lighteval/models/litellm_model.py
NathanHB Dec 19, 2024
0265a74
Update src/lighteval/models/litellm_model.py
NathanHB Dec 19, 2024
4fa8311
Merge branch 'main' into add_litellm_inference
NathanHB Dec 19, 2024
86dd849
Support retrying of empty cached model responses.
JoelNiklaus Dec 21, 2024
db983e3
Merge branch 'main' into add_litellm_inference
JoelNiklaus Dec 22, 2024
221d5d5
Fixed error when stop sequence is None.
JoelNiklaus Dec 22, 2024
81f02ca
Added support for litellm as judge backend.
JoelNiklaus Dec 22, 2024
37 changes: 31 additions & 6 deletions src/lighteval/__main__.py
@@ -38,9 +38,22 @@
logging_config = dict( # noqa C408
version=1,
formatters={
"c": {
"json": {
"()": "lighteval.logger.JSONFormatter",
"fmt_keys": {
"level": "levelname",
"message": "message",
"timestamp": "timestamp",
"logger": "name",
"module": "module",
"function": "funcName",
"line": "lineno",
"thread_name": "threadName",
},
},
"colored": {
"()": colorlog.ColoredFormatter,
"format": "[%(asctime)s] [%(log_color)s%(levelname)8s%(reset)s]: %(message)s (%(filename)s:%(lineno)s)",
"format": "[%(asctime)s] [%(log_color)s%(levelname)8s%(reset)s]: %(message)s",
"log_colors": {
"DEBUG": "cyan",
"INFO": "green",
@@ -50,10 +63,22 @@
},
},
},
handlers={"h": {"class": "logging.StreamHandler", "formatter": "c", "level": logging.INFO}},
root={
"handlers": ["h"],
"level": logging.INFO,
handlers={
"stdout": {"class": "logging.StreamHandler", "formatter": "colored", "level": logging.INFO},
"file": {
"class": "logging.handlers.RotatingFileHandler",
"formatter": "json",
"level": logging.INFO,
"filename": "lighteval.log",
"maxBytes": 10485760,
"backupCount": 2,
},
},
loggers={
"root": {
"handlers": ["stdout", "file"],
"level": logging.INFO,
}
},
)
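The "json" formatter above points at lighteval.logger.JSONFormatter, which is not part of this diff. A minimal sketch of what such a fmt_keys-driven formatter could look like (hypothetical, for illustration only; the real implementation may differ):

import datetime
import json
import logging


class JSONFormatter(logging.Formatter):
    """Render each LogRecord as a JSON object, mapping output keys to record attributes via fmt_keys."""

    def __init__(self, fmt_keys=None):
        super().__init__()
        self.fmt_keys = fmt_keys or {}

    def format(self, record: logging.LogRecord) -> str:
        # "message" and "timestamp" are derived; everything else is read straight off the record.
        derived = {
            "message": record.getMessage(),
            "timestamp": datetime.datetime.fromtimestamp(
                record.created, tz=datetime.timezone.utc
            ).isoformat(),
        }
        payload = {
            key: derived.get(attr, getattr(record, attr, None))
            for key, attr in self.fmt_keys.items()
        }
        return json.dumps(payload, default=str)

With logging.config.dictConfig, the "()" key names the factory to instantiate and the remaining keys of the formatter entry (here fmt_keys) are passed to it as keyword arguments.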

108 changes: 108 additions & 0 deletions src/lighteval/main_endpoint.py
@@ -390,3 +390,111 @@ def tgi(
pipeline.save_and_push_results()

return results


@app.command(rich_help_panel="Evaluation Backends")
def litellm(
# === general ===
provider: Annotated[str, Argument(help="Provider to route requests through via LiteLLM.")],
model: Annotated[str, Argument(help="Model name to evaluate via LiteLLM.")],
tasks: Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")],
# === Common parameters ===
use_chat_template: Annotated[
bool, Option(help="Use chat template for evaluation.", rich_help_panel=HELP_PANNEL_NAME_4)
] = False,
system_prompt: Annotated[
Optional[str], Option(help="Use system prompt for evaluation.", rich_help_panel=HELP_PANNEL_NAME_4)
] = None,
dataset_loading_processes: Annotated[
int, Option(help="Number of processes to use for dataset loading.", rich_help_panel=HELP_PANNEL_NAME_1)
] = 1,
custom_tasks: Annotated[
Optional[str], Option(help="Path to custom tasks directory.", rich_help_panel=HELP_PANNEL_NAME_1)
] = None,
cache_dir: Annotated[
str, Option(help="Cache directory for datasets and models.", rich_help_panel=HELP_PANNEL_NAME_1)
] = CACHE_DIR,
num_fewshot_seeds: Annotated[
int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANNEL_NAME_1)
] = 1,
# === saving ===
output_dir: Annotated[
str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANNEL_NAME_2)
] = "results",
push_to_hub: Annotated[
bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANNEL_NAME_2)
] = False,
push_to_tensorboard: Annotated[
bool, Option(help="Push results to tensorboard.", rich_help_panel=HELP_PANNEL_NAME_2)
] = False,
public_run: Annotated[
bool, Option(help="Push results and details to a public repo.", rich_help_panel=HELP_PANNEL_NAME_2)
] = False,
results_org: Annotated[
Optional[str], Option(help="Organization to push results to.", rich_help_panel=HELP_PANNEL_NAME_2)
] = None,
save_details: Annotated[
bool, Option(help="Save detailed, sample per sample, results.", rich_help_panel=HELP_PANNEL_NAME_2)
] = False,
# === debug ===
max_samples: Annotated[
Optional[int], Option(help="Maximum number of samples to evaluate on.", rich_help_panel=HELP_PANNEL_NAME_3)
] = None,
override_batch_size: Annotated[
int, Option(help="Override batch size for evaluation.", rich_help_panel=HELP_PANNEL_NAME_3)
] = -1,
job_id: Annotated[
int, Option(help="Optional job id for future reference.", rich_help_panel=HELP_PANNEL_NAME_3)
] = 0,
):
"""
Evaluate models using LiteLLM as backend.
"""

from lighteval.logging.evaluation_tracker import EvaluationTracker
from lighteval.models.model_config import LiteLLMModelConfig
from lighteval.pipeline import EnvConfig, ParallelismManager, Pipeline, PipelineParameters

env_config = EnvConfig(token=TOKEN, cache_dir=cache_dir)
evaluation_tracker = EvaluationTracker(
output_dir=output_dir,
save_details=save_details,
push_to_hub=push_to_hub,
push_to_tensorboard=push_to_tensorboard,
public=public_run,
hub_results_org=results_org,
)

# TODO (nathan): better handling of model_args
parallelism_manager = ParallelismManager.NONE

model_config = LiteLLMModelConfig(provider=provider, model=model)

pipeline_params = PipelineParameters(
launcher_type=parallelism_manager,
env_config=env_config,
job_id=job_id,
dataset_loading_processes=dataset_loading_processes,
custom_tasks_directory=custom_tasks,
override_batch_size=override_batch_size,
num_fewshot_seeds=num_fewshot_seeds,
max_samples=max_samples,
use_chat_template=use_chat_template,
system_prompt=system_prompt,
)
pipeline = Pipeline(
tasks=tasks,
pipeline_parameters=pipeline_params,
evaluation_tracker=evaluation_tracker,
model_config=model_config,
)

pipeline.evaluate()

pipeline.show_results()

results = pipeline.get_results()

pipeline.save_and_push_results()

return results
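For reference, a minimal programmatic sketch of what this command wires together (provider, model, and task strings below are placeholders, and some constructor defaults are assumed rather than taken from this diff):

from lighteval.logging.evaluation_tracker import EvaluationTracker
from lighteval.models.model_config import LiteLLMModelConfig
from lighteval.pipeline import EnvConfig, ParallelismManager, Pipeline, PipelineParameters

# Placeholder values; in the CLI these come from the typer arguments above.
env_config = EnvConfig(token=None, cache_dir="~/.cache/huggingface")
evaluation_tracker = EvaluationTracker(output_dir="results", save_details=False)
model_config = LiteLLMModelConfig(provider="openai", model="gpt-4o-mini")

pipeline_params = PipelineParameters(
    launcher_type=ParallelismManager.NONE,
    env_config=env_config,
    use_chat_template=True,
)
pipeline = Pipeline(
    tasks="leaderboard|truthfulqa:mc|0|0",  # placeholder task string
    pipeline_parameters=pipeline_params,
    evaluation_tracker=evaluation_tracker,
    model_config=model_config,
)
pipeline.evaluate()
pipeline.show_results()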
30 changes: 12 additions & 18 deletions src/lighteval/models/litellm_model.py
@@ -20,6 +20,7 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import logging
import os
import time
from concurrent.futures import ThreadPoolExecutor
@@ -29,7 +30,6 @@
from transformers import AutoTokenizer

from lighteval.data import GenerativeTaskDataset
from lighteval.logging.hierarchical_logger import hlog_warn
from lighteval.models.abstract_model import LightevalModel
from lighteval.models.endpoint_model import ModelInfo
from lighteval.models.model_output import (
@@ -46,14 +46,14 @@
from lighteval.utils.imports import is_litellm_available


if is_litellm_available():
import logging
logger = logging.getLogger(__name__)

if is_litellm_available():
import litellm
from litellm.caching.caching import Cache

logging.getLogger("litellm").setLevel(logging.ERROR)
logging.getLogger("httpx").setLevel(logging.ERROR)
logging.getLogger("LiteLLM").setLevel(logging.WARNING)
logging.getLogger("LiteLLM").handlers.clear()

litellm.cache = Cache(type="disk")

@@ -84,6 +84,7 @@ def __init__(self, config, env_config) -> None:
self._tokenizer = AutoTokenizer.from_pretrained("gpt2") # Use a dummy tokenizer for compatibility
self.pairwise_tokenization = False
litellm.drop_params = True
litellm.verbose = True

def __call_api(self, prompt, return_logits, max_new_tokens, num_samples, stop_sequence, generation_size):
for attempt in range(self.API_MAX_RETRY):
@@ -98,25 +99,18 @@ def __call_api(self, prompt, return_logits, max_new_tokens, num_samples, stop_se
temperature=self.TEMPERATURE,
top_p=self.TOP_P,
stop=["\n"] if stop_sequence is None else stop_sequence,
max_completion_tokens=generation_size if generation_size > 0 else None,
# max_completion_tokens=generation_size if generation_size > 0 else None,
caching=True,
)
return response
except litellm.exceptions.RateLimitError:
if attempt == self.API_MAX_RETRY - 1:
raise
except Exception as e:
wait_time = min(64, self.API_RETRY_SLEEP * (2**attempt)) # Exponential backoff with max 64s
hlog_warn(
f"Rate limit hit. Waiting {wait_time} seconds before retry {attempt + 1}/{self.API_MAX_RETRY}"
logger.warning(
f"Error in API call: {e}, waiting {wait_time} seconds before retry {attempt + 1}/{self.API_MAX_RETRY}"
)
time.sleep(wait_time)
except Exception as e:
hlog_warn(f"{type(e), e}")
if attempt == self.API_MAX_RETRY - 1:
raise
wait_time = self.API_RETRY_SLEEP * (self.API_RETRY_MULTIPLIER**attempt)
hlog_warn(f"Retrying in {wait_time} seconds")
time.sleep(wait_time)

logger.error(f"API call failed after {self.API_MAX_RETRY} attempts, skipping entry.")

def __call_api_parallel(
self,
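Stripped of class state, the retry logic in __call_api above boils down to the following pattern (a sketch; the retry constants are class attributes not shown in this diff, so the values here are assumptions):

import logging
import time

import litellm

logger = logging.getLogger(__name__)

API_MAX_RETRY = 5    # assumed value; defined on the model class, not shown in this diff
API_RETRY_SLEEP = 3  # assumed value


def completion_with_backoff(**completion_kwargs):
    for attempt in range(API_MAX_RETRY):
        try:
            # caching=True lets litellm serve repeated prompts from the disk cache configured above.
            return litellm.completion(caching=True, **completion_kwargs)
        except Exception as e:
            wait_time = min(64, API_RETRY_SLEEP * (2**attempt))  # exponential backoff capped at 64s
            logger.warning(
                f"Error in API call: {e}, waiting {wait_time} seconds before retry {attempt + 1}/{API_MAX_RETRY}"
            )
            time.sleep(wait_time)
    logger.error(f"API call failed after {API_MAX_RETRY} attempts, skipping entry.")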
2 changes: 1 addition & 1 deletion src/lighteval/models/model_config.py
@@ -317,4 +317,4 @@ def get_dtype_args(self) -> Dict[str, str]:
return {}

def get_custom_env_vars(self) -> Dict[str, str]:
return {k: str(v) for k, v in self.env_vars.items()} if self.env_vars else {}
return {k: str(v) for k, v in self.env_vars.items()} if self.env_vars else {}
6 changes: 6 additions & 0 deletions src/lighteval/models/vllm_model.py
@@ -54,6 +54,12 @@
from vllm import LLM, SamplingParams
from vllm.distributed.parallel_state import destroy_distributed_environment, destroy_model_parallel
from vllm.transformers_utils.tokenizer import get_tokenizer

logging.getLogger("vllm").propagate = True
logging.getLogger("vllm").handlers.clear()

logging.getLogger("ray").propagate = True
logging.getLogger("ray").handlers.clear()
else:
LLM = None
SamplingParams = None
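The two blocks above apply the same routing pattern: drop the library's own handlers and let its records propagate up to the root handlers configured in __main__.py. As a standalone sketch:

import logging

for name in ("vllm", "ray"):
    third_party_logger = logging.getLogger(name)
    third_party_logger.handlers.clear()  # drop the library's own handlers
    third_party_logger.propagate = True  # bubble records up to the root handlers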
9 changes: 4 additions & 5 deletions src/lighteval/utils/imports.py
@@ -65,7 +65,9 @@ def is_tensorboardX_available() -> bool:
return importlib.util.find_spec("tensorboardX") is not None


NO_TENSORBOARDX_WARN_MSG = "You are trying to log using tensorboardX, which is not installed. Please install it using pip. Skipping."
NO_TENSORBOARDX_WARN_MSG = (
"You are trying to log using tensorboardX, which is not installed. Please install it using pip. Skipping."
)


def is_openai_available() -> bool:
@@ -83,10 +85,7 @@ def is_litellm_available() -> bool:


def is_vllm_available() -> bool:
return (
importlib.util.find_spec("vllm") is not None
and importlib.util.find_spec("ray") is not None
)
return importlib.util.find_spec("vllm") is not None and importlib.util.find_spec("ray") is not None


NO_VLLM_ERROR_MSG = "You are trying to use an VLLM model, for which you need `vllm` and `ray`, which are not available in your environment. Please install them using pip, `pip install vllm ray`."
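A typical call-site for these availability helpers pairs the check with its error message; a minimal sketch:

from lighteval.utils.imports import NO_VLLM_ERROR_MSG, is_vllm_available

if not is_vllm_available():
    raise ImportError(NO_VLLM_ERROR_MSG)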