New RAG example

esnible · esnible · commit ca278e17dfdd · 2025-02-24T10:39:26.000-05:00
Signed-off-by: Ed Snible &lt;snible@us.ibm.com&gt;
diff --git a/.gitignore b/.gitignore
@@ -151,6 +151,9 @@ pdl-live/package-lock.json
 *_result.yaml
 *_trace.json
 
+# Demo files
+pdl-rag-demo.db
+
 # Built docs
 _site
 
diff --git a/examples/rag/README.md b/examples/rag/README.md
@@ -1,4 +1,30 @@
-This example requires you to install:
+This example uses [Ollama](../../tutorial/#using-ollama-models).  Fetch the models used in this example with:
+
+```bash
+ollama pull mxbai-embed-large
+ollama pull granite-code:8b
 ```
-pip install scikit-learn
-```
+
+This example requires you to install pypdf, langchain, langchain-community, and pymilvus.
+
+```bash
+pip install pypdf pymilvus langchain langchain-community
+```
+
+To run the demo, first load a PDF document into the vector database:
+
+```bash
+pdl examples/rag/pdf_index.pdl
+```
+
+After the data has loaded, the program prints "Success!"
+
+Next, query the vector database for relevant text and use that text in a query to an LLM:
+
+```bash
+pdl examples/rag/pdf_query.pdl
+```
+
+This PDL program computes a data structure containing all questions and answers.  It is printed at the end.
+
+To clean up, run `rm pdl-rag-demo.db`.
diff --git a/examples/rag/pdf_index.pdl b/examples/rag/pdf_index.pdl
@@ -0,0 +1,21 @@
+# Load PDF document into vector database
+#
+# Steps: include the RAG helper library, split the PDF into text chunks,
+# embed the chunks and store them in the vector database, then print
+# "Success!".
+
+description: Load document into vector database
+text:
+# rag_library1.pdl defines the pdf_parse and rag_index functions called below.
+- include: rag_library1.pdl
+# Split the PDF into chunks and bind the resulting list to input_data.
+# NOTE(review): the relative filename presumably assumes the program is run
+# from the repository root -- confirm.
+- call: ${ pdf_parse }
+  args:
+    filename: "docs/assets/pdl_quick_reference.pdf"
+    chunk_size: 400
+    chunk_overlap: 100
+  def: input_data
+  # NOTE(review): `contribute: []` presumably keeps this call's result out of
+  # the program output -- confirm against the PDL documentation.
+  contribute: []
+# Embed every chunk with the encoder model and store it in the collection.
+# NOTE(review): embed_dimension presumably has to match the vector size
+# produced by encoder_model -- confirm.
+- call: ${ rag_index }
+  args:
+    inp: ${ input_data }
+    encoder_model: "ollama/mxbai-embed-large"
+    embed_dimension: 1024
+    database_name: "./pdl-rag-demo.db"
+    collection_name: "pdl_rag_collection"  
+  contribute: []
+- "Success!"
diff --git a/examples/rag/pdf_query.pdl b/examples/rag/pdf_query.pdl
@@ -0,0 +1,52 @@
+# Query vector database for relevant passages; use passages to query LLM.
+
+# QUESTIONS is the list of questions to ask.
+# CONCLUSIONS is created by a Python snippet as an empty dict; the loop below
+# adds one entry per question.
+defs:
+  QUESTIONS:
+    data: [
+      "Does PDL have a contribute keyword?",
+      "Is Brooklyn the capital of New York?"
+    ]
+  CONCLUSIONS:
+    lang: python
+    code: "result = {}"
+text:
+  # rag_library1.pdl defines the rag_retrieve function called below.
+  - include: rag_library1.pdl
+  # For each question: retrieve passages, ask the model, record its answer.
+  - lastOf:
+    - for:
+        question: ${ QUESTIONS }
+      repeat:
+        array:
+          # Define MATCHING_PASSAGES as the text retrieved from the vector DB
+          - def: MATCHING_PASSAGES
+            call: ${ rag_retrieve }
+            args:
+              # I am passing the client in implicitly.  NOT WHAT I WANT
+              inp: ${ question }
+              encoder_model: "ollama/mxbai-embed-large"
+              limit: 3
+              collection_name: "pdl_rag_collection"  
+              database_name: "./pdl-rag-demo.db"
+          # - lang: python
+          #   code: |
+          #      print(f"MATCHING_PASSAGES='{MATCHING_PASSAGES}'")
+          #      result = None
+          # Ask the model the question, with the retrieved passages as context.
+          - model: ollama/granite-code:8b
+            def: CONCLUSION
+            input: >
+              Here is some information:
+              ${ MATCHING_PASSAGES }
+              Question: ${ question }
+              Answer:
+            parameters:
+              # I couldn't get this working
+              stop_sequences: ['Yes', 'No']
+              temperature: 0
+          # Store the first whitespace-separated word of the model's reply
+          # as the answer for this question.
+          - lang: python
+            code: |
+              # split()[0] needed because of stop_sequences not working
+              # print(f"CONCLUSION={CONCLUSION}")
+              CONCLUSIONS[question] = CONCLUSION.split()[0]
+              result = "dummy"
+    # NOTE(review): presumably suppresses the loop's own output so that only
+    # the JSON summary below is printed -- confirm.
+    contribute: []
+  # Emit the collected question -> answer mapping as JSON.
+  - text:
+    - "${ CONCLUSIONS | tojson }\n"
diff --git a/examples/rag/rag.py b/examples/rag/rag.py
@@ -0,0 +1,118 @@
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.document_loaders import PyPDFLoader
+from litellm import embedding
+from pymilvus import MilvusClient
+from pymilvus.exceptions import MilvusException
+
+
+def parse(filename: str, chunk_size: int, chunk_overlap: int) -> list[str]:
+    loader = PyPDFLoader(filename)
+
+    docs = loader.load()
+    # 'docs' will be a list[langchain_core.documents.base.Document],
+    # one entry per page.  We don't want to return this, because PDL only
+    # wants types that work in JSON schemas.
+
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=chunk_size,
+        chunk_overlap=chunk_overlap,
+        length_function=len,
+        is_separator_regex=False,
+    )
+
+    split_docs = text_splitter.split_documents(docs)
+
+    # Note that this throws away the metadata.
+    return [doc.page_content for doc in split_docs]
+
+
+def rag_index(
+    inp: list[str],
+    encoder_model: str,
+    embed_dimension: int,
+    database_name: str,
+    collection_name: str,
+):
+
+    # Have LiteLLM embed the passages
+    response = embedding(
+        model=encoder_model,
+        input=inp,
+    )
+
+    client = MilvusClient(
+        database_name
+    )  # Use URL if talking to remote Milvus (non-Lite)
+
+    if client.has_collection(collection_name=collection_name):
+        client.drop_collection(collection_name=collection_name)
+    client.create_collection(
+        collection_name=collection_name, dimension=embed_dimension, overwrite=True
+    )
+
+    mid = 0  # There is also an auto-id feature in Milvus, which we are not using
+    for text in inp:
+        vector = response.data[id]["embedding"]  # type: ignore
+        client.insert(
+            collection_name=collection_name,
+            data=[
+                {
+                    "id": mid,
+                    "text": text,
+                    "vector": vector,
+                    # We SHOULD set "source" and "url" based on the metadata we threw away in parse()
+                }
+            ],
+        )
+        mid = mid + 1
+
+    return True
+
+
+# Global cache of database clients.
+# (We do this so the PDL programmer doesn't need to explicitly maintain the client connection)
+DATABASE_CLIENTS: dict[str, MilvusClient] = {}
+
+
+def get_or_create_client(database_name: str):
+    if database_name in DATABASE_CLIENTS:
+        return DATABASE_CLIENTS[database_name]
+
+    client = MilvusClient(
+        database_name
+    )  # Use URL if talking to remote Milvus (non-Lite)
+    DATABASE_CLIENTS[database_name] = client
+    return client
+
+
+# Search vector database collection for input.
+# The output is 'limit' vectors, as strings, concatenated together
+def rag_retrieve(
+    inp: str, encoder_model: str, limit: int, database_name: str, collection_name: str
+) -> str:
+    # Embed the question as a vector
+    try:
+        response = embedding(
+            model=encoder_model,
+            input=[inp],
+        )
+    except MilvusException:  # This is usually a APIConnectionError
+        # Retry because of https://github.com/BerriAI/litellm/issues/7667
+        response = embedding(
+            model=encoder_model,
+            input=[inp],
+        )
+
+    data = response.data[0]["embedding"]
+
+    milvus_client = get_or_create_client(database_name)
+    search_res = milvus_client.search(
+        collection_name=collection_name,
+        data=[data],
+        limit=limit,  # Return top n results
+        search_params={"metric_type": "COSINE", "params": {}},
+        output_fields=["text"],  # Return the text field
+    )
+
+    # Note that this throws away document metadata (if any)
+    return "\n".join([res["entity"]["text"] for res in search_res[0]])
diff --git a/examples/rag/rag_library1.pdl b/examples/rag/rag_library1.pdl
@@ -0,0 +1,38 @@
+# This module can be included from a PDL program to bring in Python functions.
+#
+# Each function below is a thin wrapper: it imports the matching Python
+# function from examples/rag/rag.py and returns that function's result.
+# NOTE(review): the `from examples.rag.rag import ...` lines presumably
+# require running PDL from the repository root -- confirm.
+
+description: RAG library for PDL
+text:
+# pdf_parse: split a PDF file into text chunks (wraps rag.parse).
+- def: pdf_parse
+  function:
+    filename: str
+    chunk_size: int
+    chunk_overlap: int
+  return:
+    lang: python
+    code: |
+        from examples.rag.rag import parse
+        result = parse(filename, chunk_size, chunk_overlap)
+# rag_index: embed passages and store them in a vector database collection
+# (wraps rag.rag_index).
+- def: rag_index
+  function:
+    inp: list # This is a list[str], but PDL doesn't allow that type
+    encoder_model: str
+    embed_dimension: int
+    database_name: str # optional, could also be URL?
+    collection_name: str
+  return:
+    lang: python
+    code: |
+        from examples.rag.rag import rag_index
+        result = rag_index(inp, encoder_model, embed_dimension, database_name, collection_name)
+# rag_retrieve: return the stored passages that best match `inp`
+# (wraps rag.rag_retrieve).  The parameters are declared here with
+# collection_name before database_name, but the Python call below passes
+# database_name first, matching the Python signature.
+- def: rag_retrieve
+  function:
+    inp: str
+    encoder_model: str
+    limit: int
+    collection_name: str
+    database_name: str # optional, could also be URL?
+  return:
+    lang: python
+    code: |
+        from examples.rag.rag import rag_retrieve
+        result = rag_retrieve(inp, encoder_model, limit, database_name, collection_name)
diff --git a/examples/tfidf_rag/README.md b/examples/tfidf_rag/README.md
@@ -0,0 +1,4 @@
+This example requires you to install:
+```
+pip install scikit-learn
+```
diff --git a/examples/tfidf_rag/rag.pdl b/examples/tfidf_rag/rag.pdl
diff --git a/pyproject.toml b/pyproject.toml
@@ -39,6 +39,10 @@ dev = [
   "pydantic~=2.9"
 ]
 examples = [
+  "pymilvus~=2.5",
+  "langchain~=0.3",
+  "langchain-community~=0.3",
+  "pypdf~=5.2",
   "wikipedia~=1.0",
   "textdistance~=4.0",
   "datasets>2,<4",

-Original file line number
+Diff line change
@@ @@ -0,0 +1,4 @@ @@
 +This example requires you to install:
 +```
 +pip install scikit-learn
 +```