CatchTheTornado · choinek · Apr 29, 2025 · Jan 19, 2025 · Jan 22, 2025 · Jan 22, 2025
diff --git a/Makefile b/Makefile
@@ -31,7 +31,7 @@ install:
 	printf "%s\n" "$$padding"; \
 	printf "\e[1;34m   Do you want to run the application locally or with Docker?\e[0m\n"; \
 	printf "\e[1;33m   [L] \e[0m Local - Run the application locally\n"; \
-	printf "\e[1;33m   [D] \e[0m Docker - Run the axpplication in Docker\n"; \
+	printf "\e[1;33m   [D] \e[0m Docker - Run the application in Docker\n"; \
 	read -p "   > " choice; \
 	case "$$choice" in \
 		[lL]) echo -e "\033[1;32m   ✔ You chose: Local Setup\033[0m"; $(MAKE) setup-local ;; \
@@ -47,38 +47,46 @@ setup-local:
 	  	cp .env.localhost.example .env.localhost; \
 	fi
 	@while true; do \
-		printf  "\n\e[1;34m   Python setup environment...\e[0m"; \
+		printf  "\n\e[1;34m   Python setup environment \e[0m"; \
 		printf "\e[1;34m\n   Do you want to install requirements?\e[0m\n"; \
-		printf "\e[1;33m   [y] \e[0m Yes - Install and then run application locally\n"; \
-		printf "\e[1;33m   [n] \e[0m No  - Skip and run application locally \n"; \
-	read -p "   > " choice; \
-		case "$$choice" in \
-			[yY]) \
-				echo -e "\033[1;32m   ✔ Installing Python dependencies...\033[0m"; \
-				$(MAKE) install-requirements; \
-				$(MAKE) run; \
-				break; \
-				;; \
-			[nN]|[sS]) \
-				echo -e "\033[1;33m   Skipping requirement installation. Starting the local server instead...\033[0m"; \
-				$(MAKE) run; \
-				break; \
-				;; \
-			*) \
-				echo -e "\033[1;31m   Invalid input: Please enter 'y', 'n', or 's' to proceed.\033[0m"; \
-				;; \
-		esac; \
+		printf "\e[1;33m   [Y] \e[0m Yes - Install and then run application locally\n"; \
+		printf "\e[1;33m   [N] \e[0m No  - Skip and run application locally \n"; \
+		read -p "   > " choice; \
+			case "$$choice" in \
+				[yY]) \
+					echo -e "\033[1;32m   ✔ Installing Python dependencies...\033[0m"; \
+					$(MAKE) install-requirements; \
+					$(MAKE) run; \
+					break; \
+					;; \
+				[nN]|[sS]) \
+					echo -e "\033[1;32m   ✔ Skipping requirement installation. Starting the local server instead...\033[0m"; \
+					$(MAKE) run; \
+					break; \
+					;; \
+				*) \
+					echo -e "\033[1;31m   Invalid input: Please enter 'y', 'n', or 's' to proceed.\033[0m"; \
+					;; \
+			esac; \
 	done
 
 .PHONY: install-linux
 install-linux:
-	@echo -e "\033[1;34m   Installing Linux dependencies...\033[0m"; \
+	@echo -e "\033[1;32m   ✔ Installing Linux dependencies...\033[0m"; \
 	sudo apt update && sudo apt install -y libmagic1 poppler-utils pkg-config
 
 .PHONY: install-macos
 install-macos:
-	@echo -e "\033[1;34m   Installing macOS dependencies...\033[0m"; \
-	brew update && brew install libmagic poppler pkg-config ghostscript ffmpeg automake autoconf
+	@commandToRun="brew update && brew install libmagic poppler pkg-config ghostscript ffmpeg automake autoconf"; \
+	printf "\e[1;34m\n   The installer will execute the following command:\n      > $$commandToRun\033[0m\n"; \
+	printf "   Press \e[1;33m[ENTER]\e[0m to proceed with the installation, or \e[1;33m[N]\e[0m to skip (ensure these dependencies are installed manually):\n"; \
+	read -p "   > " choice; \
+	if [ "$$choice" != "n" ] && [ "$$choice" != "N" ]; then \
+		sh -c "$$commandToRun" || { echo -e "\033[1;31m   ✖ macOS dependency installation failed.\033[0m"; exit 1; }; \
+		echo -e "\033[1;32m   ✔ macOS dependencies installed successfully.\033[0m"; \
+	else \
+		echo -e "\033[2m   ➖ macOS dependency installation skipped.\033[0m"; \
+	fi
 
 .PHONY: install-requirements
 install-requirements:
@@ -89,7 +97,7 @@ install-requirements:
 .PHONY: run
 run:
 	@$(call load_env,.env.localhost)
-	@echo "Starting the local application server..."; \
+	@printf "\033[1;32m   ✔ Starting the local application server...\033[0m\n"; \
 	DISABLE_VENV=$(DISABLE_VENV) DISABLE_LOCAL_OLLAMA=$(DISABLE_LOCAL_OLLAMA) ./run.sh
 
 .PHONY: setup-docker
@@ -111,17 +119,17 @@ setup-docker:
 
 .PHONY: run-docker
 run-docker:
-	@echo -e "\033[1;34m   Starting Docker container with CPU support...\033[0m";
+	@echo -e "\033[1;32m   ✔ Starting Docker container with CPU support...\033[0m";
 	@docker-compose -f docker-compose.yml up --build
 
 .PHONY: run-docker-gpu
 run-docker-gpu:
-	@echo -e "\033[1;34m   Starting Docker container with GPU support...\033[0m";
+	@echo -e "\033[1;32m   ✔ Starting Docker container with GPU support...\033[0m";
 	@docker-compose -f docker-compose.gpu.yml -p text-extract-api-gpu up --build
 
 .PHONY: clean
 clean:
-	@echo "Cleaning project..."; \
+	@echo -e "\033[1;32m   ✔ Cleaning project...\033[0m"; \
 	docker-compose down -v; \
 	$(MAKE) clear-cache
 

diff --git a/config/strategies.yaml b/config/strategies.yaml
@@ -9,6 +9,10 @@ strategies:
       prompt: You are OCR. Convert image to markdown. Return only the markdown with no explanation text. Do not exclude any content from the page.
    easyocr:
       class: text_extract_api.extract.strategies.easyocr.EasyOCRStrategy
-   remote:
-      class: text_extract_api.extract.strategies.remote.RemoteStrategy
-      url:
+   docling:
+      class: text_extract_api.extract.strategies.docling.DoclingStrategy
+
+   # remote strategy example:
+   #remote:
+   #   class: text_extract_api.extract.strategies.remote.RemoteStrategy
+   #   url:
diff --git a/examples/example-word-lorem.docx b/examples/example-word-lorem.docx
diff --git a/pyproject.toml b/pyproject.toml
@@ -28,6 +28,7 @@ dependencies = [
     "google-auth-httplib2",
     "google-auth-oauthlib",
     "transformers",
+    "accelerate",
     "boto3",
     "Pillow",
     "python-magic==0.4.27",
@@ -40,6 +41,8 @@ dependencies = [
     "numpy",
     "pydantic",
     "python-dotenv",
+    "docling",
+    "docling-parse"
 ]
 [project.optional-dependencies]
 dev = [

diff --git a/text_extract_api/extract/strategies/docling.py b/text_extract_api/extract/strategies/docling.py
@@ -0,0 +1,73 @@
+import tempfile
+
+from docling.document_converter import DocumentConverter
+from docling_core.types.doc.document import (  # Assuming a compatible Docling library or module
+    DoclingDocument,
+)
+
+from text_extract_api.extract.extract_result import ExtractResult
+from text_extract_api.extract.strategies.strategy import Strategy
+from text_extract_api.files.file_formats import FileFormat, PdfFileFormat
+
+class DoclingStrategy(Strategy):
+    """
+    Extraction strategy that converts a document with Docling and exposes it as markdown.
+    """
+
+    def name(self) -> str:
+        return "docling"
+
+    def extract_text(
+        self, file_format: FileFormat, language: str = "en"
+    ) -> ExtractResult:
+        """
+        Extracts text from a file using Docling and returns an ExtractResult.
+
+        :param file_format: Instance of FileFormat (which supports most docling formats).
+        :param language: Language of the text (default is 'en'); not used by this strategy.
+        :return: ExtractResult wrapping the DoclingDocument, with text_gatherer as its gatherer.
+        """
+        import os  # local import: only needed to clean up the temporary file
+
+        temp_file_path = self._save_to_temp_file(file_format)
+        try:
+            docling_document = self._convert_to_docling(temp_file_path)
+        finally:
+            # The file is created with delete=False, so remove it here or it leaks per request.
+            os.remove(temp_file_path)
+        return ExtractResult(value=docling_document, text_gatherer=self.text_gatherer)
+
+    def text_gatherer(self, docling_document: DoclingDocument) -> str:
+        """
+        Gathers text content from a DoclingDocument in markdown format.
+
+        :param docling_document: Instance of DoclingDocument.
+        :return: Text content in markdown format.
+        """
+        return docling_document.export_to_markdown()
+
+    def _convert_to_docling(self, file_path: str) -> DoclingDocument:
+        """
+        Converts a file on disk into a DoclingDocument instance.
+
+        :param file_path: Path to the file to be converted.
+        :return: DoclingDocument instance.
+        :raises RuntimeError: If Docling fails; the original exception is chained as __cause__.
+        """
+        try:
+            return DocumentConverter().convert(file_path).document
+        except Exception as e:
+            raise RuntimeError(f"Failed to convert document using Docling: {e}") from e
+
+    def _save_to_temp_file(self, file_format: FileFormat) -> str:
+        """
+        Saves the content of a FileFormat instance to a temporary file.
+
+        :param file_format: Instance of FileFormat.
+        :return: Path to the temporary file; the caller is responsible for deleting it.
+        """
+        import mimetypes  # local import: picks a suffix that matches the real content type
+        suffix = mimetypes.guess_extension(file_format.mime_type or "") or ".pdf"  # ".pdf" was the old fixed value
+        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
+            temp_file.write(file_format.binary)
+            return temp_file.name
diff --git a/text_extract_api/extract/strategies/ollama.py b/text_extract_api/extract/strategies/ollama.py
@@ -2,7 +2,8 @@
 import tempfile
 import time
 
-import ollama
+import httpx
+from ollama import Client
 
 from extract.extract_result import ExtractResult
 from text_extract_api.extract.strategies.strategy import Strategy
@@ -26,7 +27,6 @@ def extract_text(self, file_format: FileFormat, language: str = 'en') -> Extract
             raise TypeError(
                 f"Ollama OCR - format {file_format.mime_type} is not supported (yet?)"
             )
-
         images = FileFormat.convert_to(file_format, ImageFileFormat)
         extracted_text = ""
         start_time = time.time()
@@ -38,9 +38,10 @@ def extract_text(self, file_format: FileFormat, language: str = 'en') -> Extract
                 temp_file.write(image.binary)
                 temp_filename = temp_file.name
 
-            print(self._strategy_config)
             # Generate text using the specified model
             try:
+                timeout = httpx.Timeout(connect=180.0, read=180.0, write=180.0, pool=180.0) # @todo move those values to .env
+                ollama = Client(timeout=timeout)
                 response = ollama.chat(self._strategy_config.get('model'), [{
                     'role': 'user',
                     'content': self._strategy_config.get('prompt'),

diff --git a/text_extract_api/files/file_formats/__init__.py b/text_extract_api/files/file_formats/__init__.py
@@ -0,0 +1,9 @@
+
+### WARNING
+###    This file is generated dynamically before git commit.
+###    Run ./scripts/dev/gen-file-format-init.sh from repository root.
+
+from .file_format import FileFormat
+from .docling import DoclingFileFormat
+from .pdf import PdfFileFormat
+from .image import ImageFileFormat
diff --git a/text_extract_api/files/file_formats/docling.py b/text_extract_api/files/file_formats/docling.py
@@ -0,0 +1,47 @@
+from typing import Type, Dict, Callable, Iterator
+from text_extract_api.files.file_formats.file_format import FileFormat
+
+
+class DoclingFileFormat(FileFormat):  # format whose conversion is left to the strategy (see convertible_to)
+    DEFAULT_FILENAME: str = "document.docling"
+    DEFAULT_MIME_TYPE: str = "application/vnd.docling"
+
+    @staticmethod
+    def accepted_mime_types() -> list[str]:
+        return [
+            "application/pdf",  # PDF documents; NOTE(review): presumably also accepted by PdfFileFormat - confirm
+            "application/vnd.docling",  # Docling documents
+            "text/plain",
+            "text/markdown",
+            "text/html",  # HTML documents
+            "application/msword",
+            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+            "application/vnd.oasis.opendocument.text",
+            "application/vnd.ms-excel",
+            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+            "application/vnd.ms-powerpoint",
+            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+            "image/jpeg",  # NOTE(review): FileFormat._get_file_format_class returns the FIRST subclass that matches,
+            "image/png",  # so any overlap with other formats resolves by subclass creation (import) order - verify
+            "text/csv",
+            "application/json",
+            "application/xml",
+        ]
+
+    @staticmethod
+    def is_pageable() -> bool:
+        return True  # always reported as pageable
+
+    @classmethod
+    def default_iterator_file_format(cls) -> Type[FileFormat]:
+        return cls  # iteration yields this same format class
+
+    @staticmethod
+    def convertible_to() -> Dict[Type["FileFormat"], Callable[[], Iterator["FileFormat"]]]:
+        # No specific converters needed as the strategy will handle conversion
+        return {}
+
+    @staticmethod
+    def validate(binary_file_content: bytes):
+        if not binary_file_content or len(binary_file_content) == 0:  # rejects None and b""; the len() test is redundant for bytes
+            raise ValueError("Empty file content")
diff --git a/text_extract_api/files/file_formats/file_format.py b/text_extract_api/files/file_formats/file_format.py
@@ -65,6 +65,8 @@ def from_binary(
             filename: Optional[str] = None,
             mime_type: Optional[str] = None
     ) -> Type["FileFormat"]:
+        if mime_type == "application/octet-stream":
+            mime_type = None
         mime_type = mime_type or FileFormat._guess_mime_type(binary_data=binary, filename=filename)
         from text_extract_api.files.file_formats.pdf import PdfFileFormat  # type: ignore
         file_format_class = cls._get_file_format_class(mime_type)
@@ -196,6 +198,7 @@ def unify(self) -> "FileFormat":
     def _get_file_format_class(mime_type: str) -> Type["FileFormat"]:
         import text_extract_api.files.file_formats.pdf  # noqa - its not unused import @todo autodiscover
         import text_extract_api.files.file_formats.image  # noqa - its not unused import @todo autodiscover
+        import text_extract_api.files.file_formats.docling  # noqa - its not unused import @todo autodiscover
         for subclass in FileFormat.__subclasses__():
             if mime_type in subclass.accepted_mime_types():
                 return subclass

diff --git a/text_extract_api/main.py b/text_extract_api/main.py
@@ -19,7 +19,6 @@
 # Define base path as text_extract_api - required for keeping absolute namespaces
 sys.path.insert(0, str(pathlib.Path(__file__).parent.resolve()))
 
-
 def storage_profile_exists(profile_name: str) -> bool:
     profile_path = os.path.abspath(
         os.path.join(os.getenv('STORAGE_PROFILE_PATH', './storage_profiles'), f'{profile_name}.yaml'))
@@ -29,13 +28,11 @@ def storage_profile_exists(profile_name: str) -> bool:
         return os.path.isfile(sub_profile_path)
     return True
 
-
 app = FastAPI()
 # Connect to Redis
 redis_url = os.getenv('REDIS_CACHE_URL', 'redis://redis:6379/1')
 redis_client = redis.StrictRedis.from_url(redis_url)
 
-
 @app.post("/ocr")
 async def ocr_endpoint(
         strategy: str = Form(...),