Skip to content
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 36 additions & 28 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ install:
printf "%s\n" "$$padding"; \
printf "\e[1;34m Do you want to run the application locally or with Docker?\e[0m\n"; \
printf "\e[1;33m [L] \e[0m Local - Run the application locally\n"; \
printf "\e[1;33m [D] \e[0m Docker - Run the axpplication in Docker\n"; \
printf "\e[1;33m [D] \e[0m Docker - Run the application in Docker\n"; \
read -p " > " choice; \
case "$$choice" in \
[lL]) echo -e "\033[1;32m ✔ You chose: Local Setup\033[0m"; $(MAKE) setup-local ;; \
Expand All @@ -47,38 +47,46 @@ setup-local:
cp .env.localhost.example .env.localhost; \
fi
@while true; do \
printf "\n\e[1;34m Python setup environment...\e[0m"; \
printf "\n\e[1;34m Python setup environment \e[0m"; \
printf "\e[1;34m\n Do you want to install requirements?\e[0m\n"; \
printf "\e[1;33m [y] \e[0m Yes - Install and then run application locally\n"; \
printf "\e[1;33m [n] \e[0m No - Skip and run application locally \n"; \
read -p " > " choice; \
case "$$choice" in \
[yY]) \
echo -e "\033[1;32m ✔ Installing Python dependencies...\033[0m"; \
$(MAKE) install-requirements; \
$(MAKE) run; \
break; \
;; \
[nN]|[sS]) \
echo -e "\033[1;33m Skipping requirement installation. Starting the local server instead...\033[0m"; \
$(MAKE) run; \
break; \
;; \
*) \
echo -e "\033[1;31m Invalid input: Please enter 'y', 'n', or 's' to proceed.\033[0m"; \
;; \
esac; \
printf "\e[1;33m [Y] \e[0m Yes - Install and then run application locally\n"; \
printf "\e[1;33m [N] \e[0m No - Skip and run application locally \n"; \
read -p " > " choice; \
case "$$choice" in \
[yY]) \
echo -e "\033[1;32m ✔ Installing Python dependencies...\033[0m"; \
$(MAKE) install-requirements; \
$(MAKE) run; \
break; \
;; \
[nN]|[sS]) \
echo -e "\033[1;32m ✔ Skipping requirement installation. Starting the local server instead...\033[0m"; \
$(MAKE) run; \
break; \
;; \
*) \
echo -e "\033[1;31m Invalid input: Please enter 'y', 'n', or 's' to proceed.\033[0m"; \
;; \
esac; \
done

.PHONY: install-linux
# Install the system packages required on Debian/Ubuntu hosts via apt.
# The banner uses printf (not `echo -e`) so the colour escapes render the same
# under any /bin/sh; the install commands are chained with && so a failed
# `apt update` stops the recipe instead of attempting the install anyway.
install-linux:
	@printf "\033[1;32m ✔ Installing Linux dependencies...\033[0m\n"; \
	sudo apt update && sudo apt install -y libmagic1 poppler-utils pkg-config

.PHONY: install-macos
# Install the Homebrew packages required on macOS, after asking for confirmation.
# The exact command is shown first; answering n/N skips it (the user is then
# expected to install the packages manually), any other answer runs it.
# The recipe fails with a non-zero status when the install command fails, so a
# broken installation is never reported as successful.
install-macos:
	@commandToRun="brew update && brew install libmagic poppler pkg-config ghostscript ffmpeg automake autoconf"; \
	printf "\033[1;34m\n The installer will execute the following command:\n > %s\033[0m\n" "$$commandToRun"; \
	printf " Press \033[1;33m[ANY KEY]\033[0m to proceed with the installation, or \033[1;33m[N]\033[0m to skip (ensure these dependencies are installed manually):\n"; \
	printf " > "; \
	read -r choice; \
	case "$$choice" in \
		[nN]) \
			printf "\033[2m ➖ macOS dependency installation skipped.\033[0m\n"; \
			;; \
		*) \
			if sh -c "$$commandToRun"; then \
				printf "\033[1;32m ✔ macOS dependencies installed successfully.\033[0m\n"; \
			else \
				printf "\033[1;31m ✖ macOS dependency installation failed.\033[0m\n" >&2; \
				exit 1; \
			fi; \
			;; \
	esac

.PHONY: install-requirements
install-requirements:
Expand All @@ -89,7 +97,7 @@ install-requirements:
.PHONY: run
# Start the application on the host by delegating to ./run.sh.
# DISABLE_VENV and DISABLE_LOCAL_OLLAMA are forwarded to the script as
# environment variables.
# NOTE(review): load_env is defined outside this excerpt; since each recipe line
# runs in its own shell, confirm that it exports variables in a way that is
# still visible to the ./run.sh line below.
run:
	@$(call load_env,.env.localhost)
	@printf "\033[1;32m ✔ Starting the local application server...\033[0m\n"; \
	DISABLE_VENV=$(DISABLE_VENV) DISABLE_LOCAL_OLLAMA=$(DISABLE_LOCAL_OLLAMA) ./run.sh

.PHONY: setup-docker
Expand All @@ -111,17 +119,17 @@ setup-docker:

.PHONY: run-docker
# Build and start the stack described by docker-compose.yml (CPU variant).
# The banner is printed once, with printf so the colour escapes do not depend
# on the shell's `echo` implementation.
run-docker:
	@printf "\033[1;32m ✔ Starting Docker container with CPU support...\033[0m\n"
	@docker-compose -f docker-compose.yml up --build

.PHONY: run-docker-gpu
# Build and start the stack described by docker-compose.gpu.yml (GPU variant)
# under the compose project name text-extract-api-gpu.
# The banner is printed once, with printf so the colour escapes do not depend
# on the shell's `echo` implementation.
run-docker-gpu:
	@printf "\033[1;32m ✔ Starting Docker container with GPU support...\033[0m\n"
	@docker-compose -f docker-compose.gpu.yml -p text-extract-api-gpu up --build

.PHONY: clean
# Tear down the compose stack (including its volumes) and then clear caches.
# The two steps stay joined with `;` rather than `&&` so that clear-cache still
# runs when `docker-compose down` fails, e.g. on a machine without Docker.
clean:
	@printf "\033[1;32m ✔ Cleaning project...\033[0m\n"; \
	docker-compose down -v; \
	$(MAKE) clear-cache

Expand Down
10 changes: 7 additions & 3 deletions config/strategies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@ strategies:
prompt: You are OCR. Convert image to markdown. Return only the markdown with no explanation text. Do not exclude any content from the page.
easyocr:
class: text_extract_api.extract.strategies.easyocr.EasyOCRStrategy
remote:
class: text_extract_api.extract.strategies.remote.RemoteStrategy
url:
docling:
class: text_extract_api.extract.strategies.docling.DoclingStrategy

# remote strategy example:
#remote:
# class: text_extract_api.extract.strategies.remote.RemoteStrategy
# url:
Binary file added examples/example-word-lorem.docx
Binary file not shown.
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ dependencies = [
"google-auth-httplib2",
"google-auth-oauthlib",
"transformers",
"accelerate",
"boto3",
"Pillow",
"python-magic==0.4.27",
Expand All @@ -40,6 +41,8 @@ dependencies = [
"numpy",
"pydantic",
"python-dotenv",
"docling",
"docling-parse"
]
[project.optional-dependencies]
dev = [
Expand Down
73 changes: 73 additions & 0 deletions text_extract_api/extract/strategies/docling.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import tempfile

from docling.document_converter import DocumentConverter
from docling_core.types.doc.document import ( # Assuming a compatible Docling library or module
DoclingDocument,
)

from text_extract_api.extract.extract_result import ExtractResult
from text_extract_api.extract.strategies.strategy import Strategy
from text_extract_api.files.file_formats import FileFormat, PdfFileFormat

class DoclingStrategy(Strategy):
    """
    Extraction strategy that converts an uploaded document with Docling.

    The file bytes are written to a temporary file, converted with
    ``DocumentConverter`` and wrapped in an ``ExtractResult`` whose text
    gatherer renders the document as markdown.
    """

    def name(self) -> str:
        """Return the identifier of this strategy (``"docling"``)."""
        return "docling"

    def extract_text(
        self, file_format: FileFormat, language: str = "en"
    ) -> ExtractResult:
        """
        Extracts text from a file using Docling and returns an ExtractResult.

        :param file_format: Instance of FileFormat whose ``binary`` content is converted.
        :param language: Language of the text (default is 'en'). Currently not
            forwarded to Docling.
        :return: ExtractResult containing the extracted DoclingDocument and
            ``text_gatherer`` as the callable that turns it into text.
        :raises RuntimeError: If Docling fails to convert the document.
        """
        # The scratch directory (and the temporary file inside it) is removed as
        # soon as the conversion finishes or fails, so requests no longer leave
        # orphaned files behind in the system temp directory.
        # NOTE(review): assumes the returned DoclingDocument is fully
        # materialised in memory and does not re-read the source file later.
        with tempfile.TemporaryDirectory() as scratch_dir:
            temp_file_path = self._save_to_temp_file(file_format, directory=scratch_dir)
            docling_document = self._convert_to_docling(temp_file_path)

        return ExtractResult(value=docling_document, text_gatherer=self.text_gatherer)

    def text_gatherer(self, docling_document: DoclingDocument) -> str:
        """
        Gathers text content from a DoclingDocument in markdown format.

        :param docling_document: Instance of DoclingDocument.
        :return: Text content in markdown format.
        """
        return docling_document.export_to_markdown()

    def _convert_to_docling(self, file_path: str) -> DoclingDocument:
        """
        Converts the file at ``file_path`` into a DoclingDocument instance.

        :param file_path: Path to the file to be converted.
        :return: DoclingDocument instance.
        :raises RuntimeError: If the conversion fails; the original exception is
            attached as ``__cause__``.
        """
        try:
            converter = DocumentConverter()
            return converter.convert(file_path).document
        except Exception as e:
            # Chain the original exception so the underlying traceback is kept.
            raise RuntimeError(f"Failed to convert document using Docling: {e}") from e

    def _save_to_temp_file(self, file_format: FileFormat, directory=None) -> str:
        """
        Saves the content of a FileFormat instance to a temporary file.

        :param file_format: Instance of FileFormat.
        :param directory: Optional directory in which to create the file. When
            omitted the system default temp directory is used and the caller is
            responsible for deleting the file.
        :return: Path to the temporary file containing the file content.
        """
        # NOTE(review): the ".pdf" suffix is applied to every input, including
        # non-PDF formats — confirm Docling's format detection does not rely on
        # the file extension.
        with tempfile.NamedTemporaryFile(
            delete=False, suffix=".pdf", dir=directory
        ) as temp_file:
            temp_file.write(file_format.binary)
            return temp_file.name
7 changes: 4 additions & 3 deletions text_extract_api/extract/strategies/ollama.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
import tempfile
import time

import ollama
import httpx
from ollama import Client

from extract.extract_result import ExtractResult
from text_extract_api.extract.strategies.strategy import Strategy
Expand All @@ -26,7 +27,6 @@ def extract_text(self, file_format: FileFormat, language: str = 'en') -> Extract
raise TypeError(
f"Ollama OCR - format {file_format.mime_type} is not supported (yet?)"
)

images = FileFormat.convert_to(file_format, ImageFileFormat)
extracted_text = ""
start_time = time.time()
Expand All @@ -38,9 +38,10 @@ def extract_text(self, file_format: FileFormat, language: str = 'en') -> Extract
temp_file.write(image.binary)
temp_filename = temp_file.name

print(self._strategy_config)
# Generate text using the specified model
try:
timeout = httpx.Timeout(connect=180.0, read=180.0, write=180.0, pool=180.0) # @todo move those values to .env
ollama = Client(timeout=timeout)
response = ollama.chat(self._strategy_config.get('model'), [{
'role': 'user',
'content': self._strategy_config.get('prompt'),
Expand Down
9 changes: 9 additions & 0 deletions text_extract_api/files/file_formats/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@

### WARNING
### This file is generated dynamically before git commit.
### Run ./scripts/dev/gen-file-format-init.sh from repository root.

from .file_format import FileFormat
from .docling import DoclingFileFormat
from .pdf import PdfFileFormat
from .image import ImageFileFormat
47 changes: 47 additions & 0 deletions text_extract_api/files/file_formats/docling.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
from typing import Type, Dict, Callable, Iterator
from text_extract_api.files.file_formats.file_format import FileFormat


class DoclingFileFormat(FileFormat):
    """File format covering the document types handed to the Docling strategy."""

    DEFAULT_FILENAME: str = "document.docling"
    DEFAULT_MIME_TYPE: str = "application/vnd.docling"

    @staticmethod
    def accepted_mime_types() -> list[str]:
        """Return the MIME types this format accepts, in matching order."""
        portable_documents = [
            "application/pdf",  # PDF documents
            "application/vnd.docling",  # Docling documents
        ]
        text_and_markup = [
            "text/plain",
            "text/markdown",
            "text/html",  # HTML documents
        ]
        office_documents = [
            "application/msword",
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            "application/vnd.oasis.opendocument.text",
            "application/vnd.ms-excel",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "application/vnd.ms-powerpoint",
            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        ]
        images = [
            "image/jpeg",
            "image/png",
        ]
        structured_data = [
            "text/csv",
            "application/json",
            "application/xml",
        ]
        return (
            portable_documents
            + text_and_markup
            + office_documents
            + images
            + structured_data
        )

    @staticmethod
    def is_pageable() -> bool:
        """Report this format as pageable."""
        return True

    @classmethod
    def default_iterator_file_format(cls) -> Type[FileFormat]:
        """Return this class itself as the format produced when iterating."""
        return cls

    @staticmethod
    def convertible_to() -> Dict[Type["FileFormat"], Callable[[], Iterator["FileFormat"]]]:
        """Return an empty converter map; the strategy handles conversion itself."""
        return {}

    @staticmethod
    def validate(binary_file_content: bytes):
        """
        Reject missing or zero-length content.

        :param binary_file_content: Raw bytes of the uploaded file.
        :raises ValueError: If the content is ``None`` or empty.
        """
        # Falsy covers both None and b"", matching the explicit length check.
        if not binary_file_content:
            raise ValueError("Empty file content")
3 changes: 3 additions & 0 deletions text_extract_api/files/file_formats/file_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ def from_binary(
filename: Optional[str] = None,
mime_type: Optional[str] = None
) -> Type["FileFormat"]:
if mime_type == "application/octet-stream":
mime_type = None
mime_type = mime_type or FileFormat._guess_mime_type(binary_data=binary, filename=filename)
from text_extract_api.files.file_formats.pdf import PdfFileFormat # type: ignore
file_format_class = cls._get_file_format_class(mime_type)
Expand Down Expand Up @@ -196,6 +198,7 @@ def unify(self) -> "FileFormat":
def _get_file_format_class(mime_type: str) -> Type["FileFormat"]:
import text_extract_api.files.file_formats.pdf # noqa - its not unused import @todo autodiscover
import text_extract_api.files.file_formats.image # noqa - its not unused import @todo autodiscover
import text_extract_api.files.file_formats.docling # noqa - its not unused import @todo autodiscover
for subclass in FileFormat.__subclasses__():
if mime_type in subclass.accepted_mime_types():
return subclass
Expand Down
3 changes: 0 additions & 3 deletions text_extract_api/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
# Define base path as text_extract_api - required for keeping absolute namespaces
sys.path.insert(0, str(pathlib.Path(__file__).parent.resolve()))


def storage_profile_exists(profile_name: str) -> bool:
profile_path = os.path.abspath(
os.path.join(os.getenv('STORAGE_PROFILE_PATH', './storage_profiles'), f'{profile_name}.yaml'))
Expand All @@ -29,13 +28,11 @@ def storage_profile_exists(profile_name: str) -> bool:
return os.path.isfile(sub_profile_path)
return True


app = FastAPI()
# Connect to Redis
redis_url = os.getenv('REDIS_CACHE_URL', 'redis://redis:6379/1')
redis_client = redis.StrictRedis.from_url(redis_url)


@app.post("/ocr")
async def ocr_endpoint(
strategy: str = Form(...),
Expand Down