diff --git a/Makefile b/Makefile
index ed3e2ddcc..a56b9a93d 100644
--- a/Makefile
+++ b/Makefile
@@ -27,7 +27,7 @@ clean: ## Clean all generated files
 run-ci: format lint type ## Running all CI checks
 run-benchmarks: ## Run benchmarks
 	@echo "Running benchmarks..."
-	@cd $(GIT_ROOT)/tests/benchmarks && python benchmark.py
+	@cd $(GIT_ROOT)/tests/benchmarks && python benchmark_eval.py
 test: ## Run tests
 	@echo "Running tests..."
 	@pytest tests/unit
diff --git a/README.md b/README.md
index e416fa7f7..d4d00598c 100644
--- a/README.md
+++ b/README.md
@@ -33,6 +33,7 @@
 Quickstart |
 Metrics |
 Community |
+ Open Analytics |
 FAQ |
 Hugging Face
@@ -86,28 +87,29 @@ Ragas measures your pipeline's performance against two dimensions
 Through repeated experiments, we have found that the quality of a RAG pipeline is highly dependent on these two dimensions. The final `ragas_score` is the harmonic mean of these two factors.
 To read more about our metrics, checkout [docs](/docs/metrics.md).
-## :question: How to use Ragas to improve your pipeline?
-*"Measurement is the first step that leads to control and eventually to improvement" - James Harrington*
+## 🫂 Community
+If you want to get more involved with Ragas, check out our [discord server](https://discord.gg/5djav8GGNZ). It's a fun community where we geek out about LLMs, retrieval, production issues, and more.
-Here we assume that you already have your RAG pipeline ready. When it comes to RAG pipelines, there are mainly two parts - Retriever and generator. A change in any of this should also impact your pipelines's quality.
+## 🔍 Open Analytics
+We track very basic usage metrics to help us figure out what our users want, what is working, and what's not. As a young startup, we have to be brutally honest about this, which is why we track these metrics. But as an Open Startup, we open-source all the data we collect. You can read more about this [here](https://github.com/explodinggradients/ragas/issues/49). If you want to take a look at exactly what we track, feel free to check the [code](./src/ragas/_analytics.py).
-1. First, decide one parameter that you're interested in adjusting. for example the number of retrieved documents, K.
-2. Collect a set of sample prompts (min 20) to form your test set.
-3. Run your pipeline using the test set before and after the change. Each time record the prompts with context and generated output.
-4. Run ragas evaluation for each of them to generate evaluation scores.
-5. Compare the scores and you will know how much the change has affected your pipelines' performance.
+You can disable usage tracking by setting the `RAGAS_DO_NOT_TRACK` environment variable to `true`.
-## 🫂 Community
-If you want to get more involved with Ragas, check out our [discord server](https://discord.gg/5djav8GGNZ). It's a fun community where we geek out about LLM, Retrieval, Production issues and more.
 ## :raising_hand_man: FAQ
 1. Why harmonic mean?
 Harmonic mean penalizes extreme values. For example, if your generated answer is fully factually consistent with the context (faithfulness = 1) but is not relevant to the question (relevancy = 0), a simple average would give you a score of 0.5 but a harmonic mean will give you 0.0
+2. How to use Ragas to improve your pipeline?
+*"Measurement is the first step that leads to control and eventually to improvement" - James Harrington*
+Here we assume that you already have your RAG pipeline ready. A RAG pipeline has two main parts - a retriever and a generator - and a change in either should affect your pipeline's quality.
-
-
+1. First, decide on one parameter that you're interested in adjusting, for example the number of retrieved documents, K.
+2. Collect a set of sample prompts (min 20) to form your test set.
+3. Run your pipeline using the test set before and after the change. Each time, record the prompts with context and generated output.
+4. Run the ragas evaluation for each of them to generate evaluation scores.
+5. Compare the scores to see how much the change has affected your pipeline's performance.
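The opt-out flag documented in the new Open Analytics section maps directly onto `do_not_track()` in `src/ragas/_analytics.py` below. A minimal sketch of disabling tracking from Python, assuming the variable is set before the first evaluation (the check is cached with `lru_cache`, so a later change is not picked up):

```python
import os

# Any casing of "true" opts out; do_not_track() reads this once and caches the
# result, so set it before importing/running ragas evaluations.
os.environ["RAGAS_DO_NOT_TRACK"] = "true"
```

Exporting `RAGAS_DO_NOT_TRACK=true` in the shell before running your script has the same effect.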
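To make the arithmetic in FAQ item 1 concrete, here is an illustrative sketch (not part of the diff; the helper function is hypothetical) reproducing the faithfulness = 1, relevancy = 0 example:

```python
def harmonic_mean(scores):
    # A single zero score drags the harmonic mean to zero,
    # which is why it penalizes extreme values.
    if any(s == 0 for s in scores):
        return 0.0
    return len(scores) / sum(1 / s for s in scores)


faithfulness, relevancy = 1.0, 0.0
print((faithfulness + relevancy) / 2)            # 0.5 - the simple average hides the failure
print(harmonic_mean([faithfulness, relevancy]))  # 0.0 - the harmonic mean surfaces it
```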
diff --git a/src/ragas/_analytics.py b/src/ragas/_analytics.py
new file mode 100644
index 000000000..74a5067fc
--- /dev/null
+++ b/src/ragas/_analytics.py
@@ -0,0 +1,84 @@
+from __future__ import annotations
+
+import logging
+import os
+import typing as t
+from dataclasses import asdict, dataclass
+from functools import lru_cache, wraps
+
+import requests
+
+from ragas.utils import get_debug_mode
+
+if t.TYPE_CHECKING:
+    P = t.ParamSpec("P")
+    T = t.TypeVar("T")
+    AsyncFunc = t.Callable[P, t.Coroutine[t.Any, t.Any, t.Any]]
+
+logger = logging.getLogger(__name__)
+
+
+USAGE_TRACKING_URL = "https://t.explodinggradients.com"
+RAGAS_DO_NOT_TRACK = "RAGAS_DO_NOT_TRACK"
+RAGAS_DEBUG_TRACKING = "__RAGAS_DEBUG_TRACKING"
+USAGE_REQUESTS_TIMEOUT_SEC = 1
+
+
+@lru_cache(maxsize=1)
+def do_not_track() -> bool:  # pragma: no cover
+    # Returns True if and only if the environment variable is defined and has value True.
+    # The function is cached for better performance.
+    return os.environ.get(RAGAS_DO_NOT_TRACK, str(False)).lower() == "true"
+
+
+@lru_cache(maxsize=1)
+def _usage_event_debugging() -> bool:
+    # For ragas developers only - debug and print the event payload if turned on
+    return os.environ.get(RAGAS_DEBUG_TRACKING, str(False)).lower() == "true"
+
+
+def silent(func: t.Callable[P, T]) -> t.Callable[P, T]:  # pragma: no cover
+    # Silence errors raised while tracking
+    @wraps(func)
+    def wrapper(*args: P.args, **kwargs: P.kwargs) -> t.Any:
+        try:
+            return func(*args, **kwargs)
+        except Exception as err:  # pylint: disable=broad-except
+            if _usage_event_debugging():
+                if get_debug_mode():
+                    logger.error(
+                        "Tracking Error: %s", err, stack_info=True, stacklevel=3
+                    )
+                else:
+                    logger.info("Tracking Error: %s", err)
+            else:
+                logger.debug("Tracking Error: %s", err)
+
+    return wrapper
+
+
+@dataclass
+class BaseEvent:
+    event_type: str
+
+
+@dataclass
+class EvaluationEvent(BaseEvent):
+    metrics: list[str]
+    evaluation_mode: str
+    num_rows: int
+
+
+@silent
+def track(event_properties: BaseEvent):
+    if do_not_track():
+        return
+
+    payload = asdict(event_properties)
+
+    if _usage_event_debugging():
+        # For internal debugging purposes
+        logger.info("Tracking Payload: %s", payload)
+        return
+
+    requests.post(USAGE_TRACKING_URL, json=payload, timeout=USAGE_REQUESTS_TIMEOUT_SEC)
diff --git a/src/ragas/evaluation.py b/src/ragas/evaluation.py
index dbb39b9c3..47447b7a5 100644
--- a/src/ragas/evaluation.py
+++ b/src/ragas/evaluation.py
@@ -6,6 +6,7 @@ import numpy as np
 from datasets import Dataset, concatenate_datasets
 
+from ragas._analytics import EvaluationEvent, track
 from ragas.metrics.base import Metric
 
 EvaluationMode = Enum("EvaluationMode", "generative retrieval grounded")
@@ -17,7 +18,7 @@ def get_evaluation_mode(ds: Dataset):
     possible evaluation types
     1. (q,a,c)
-    2. (q)
+    2. (q,a)
     3. (q,c)
     4. (g,a)
     """
@@ -87,6 +88,17 @@ def evaluate(
     for metric in metrics:
         scores.append(metric.score(dataset).select_columns(metric.name))
 
+    # log the evaluation event
+    metrics_names = [m.name for m in metrics]
+    track(
+        EvaluationEvent(
+            event_type="evaluation",
+            metrics=metrics_names,
+            evaluation_mode="",
+            num_rows=dataset.shape[0],
+        )
+    )
+
     return Result(scores=concatenate_datasets(scores, axis=1), dataset=dataset)
@@ -117,7 +129,9 @@ def to_pandas(self, batch_size: int | None = None, batched: bool = False):
     def __repr__(self) -> str:
         scores = self.copy()
-        ragas_score = scores.pop("ragas_score")
-        score_strs = [f"'ragas_score': {ragas_score:0.4f}"]
+        score_strs = []
+        if "ragas_score" in scores:
+            ragas_score = scores.pop("ragas_score")
+            score_strs.append(f"'ragas_score': {ragas_score:0.4f}")
         score_strs.extend([f"'{k}': {v:0.4f}" for k, v in scores.items()])
         return "{" + ", ".join(score_strs) + "}"
diff --git a/src/ragas/utils.py b/src/ragas/utils.py
index cc9eb84fa..4fc011089 100644
--- a/src/ragas/utils.py
+++ b/src/ragas/utils.py
@@ -1,12 +1,16 @@
 from __future__ import annotations
 
+import logging
+import os
 import typing as t
+from functools import lru_cache
 from warnings import warn
 
 import torch
 from torch import device as Device
 
 DEVICES = ["cpu", "cuda"]
+DEBUG_ENV_VAR = "RAGAS_DEBUG"
 
 
 def device_check(device: t.Literal["cpu", "cuda"] | Device) -> torch.device:
@@ -19,3 +23,12 @@ def device_check(device: t.Literal["cpu", "cuda"] | Device) -> torch.device:
         device = "cpu"
     return torch.device(device)
+
+
+@lru_cache(maxsize=1)
+def get_debug_mode() -> bool:
+    if os.environ.get(DEBUG_ENV_VAR, str(False)).lower() == "true":
+        logging.basicConfig(level=logging.DEBUG)
+        return True
+    else:
+        return False
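For reference, a minimal sketch of what the hook added at the end of `evaluate()` does (the metric names here are illustrative placeholders, not taken from the diff):

```python
from ragas._analytics import EvaluationEvent, track

# The event is a plain dataclass; track() converts it with asdict() and POSTs it
# to USAGE_TRACKING_URL with a 1-second timeout.
event = EvaluationEvent(
    event_type="evaluation",
    metrics=["faithfulness", "answer_relevancy"],  # placeholder metric names
    evaluation_mode="",
    num_rows=20,
)

# do_not_track() short-circuits the call when RAGAS_DO_NOT_TRACK=true,
# __RAGAS_DEBUG_TRACKING=true only logs the payload, and @silent swallows any error.
track(event)
```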