Skip to content

Commit 9a253fc

Browse files
Merge pull request #197 from SyncfusionExamples/980096
980096: Offline Text Extraction from Scanned PDFs Using Tesseract 5 and Syncfusion
2 parents b5c8d74 + 55519cc commit 9a253fc

File tree

80 files changed

+75040
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

80 files changed

+75040
-0
lines changed
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
**/.classpath
2+
**/.dockerignore
3+
**/.env
4+
**/.git
5+
**/.gitignore
6+
**/.project
7+
**/.settings
8+
**/.toolstarget
9+
**/.vs
10+
**/.vscode
11+
**/*.*proj.user
12+
**/*.dbmdl
13+
**/*.jfm
14+
**/azds.yaml
15+
**/bin
16+
**/charts
17+
**/docker-compose*
18+
**/Dockerfile*
19+
**/node_modules
20+
**/npm-debug.log
21+
**/obj
22+
**/secrets.dev.yaml
23+
**/values.dev.yaml
24+
LICENSE
25+
README.md
26+
!**/.gitignore
27+
!.git/HEAD
28+
!.git/config
29+
!.git/packed-refs
30+
!.git/refs/heads/**
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
2+
Microsoft Visual Studio Solution File, Format Version 12.00
3+
# Visual Studio Version 17
4+
VisualStudioVersion = 17.14.36616.10 d17.14
5+
MinimumVisualStudioVersion = 10.0.40219.1
6+
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "OCR-with-Tesseract-in-Docker-on-Linux", "OCR-with-Tesseract-in-Docker-on-Linux\OCR-with-Tesseract-in-Docker-on-Linux.csproj", "{40EBF01A-F47E-433F-9C5F-1E118D6BE123}"
7+
EndProject
8+
Global
9+
GlobalSection(SolutionConfigurationPlatforms) = preSolution
10+
Debug|Any CPU = Debug|Any CPU
11+
Release|Any CPU = Release|Any CPU
12+
EndGlobalSection
13+
GlobalSection(ProjectConfigurationPlatforms) = postSolution
14+
{40EBF01A-F47E-433F-9C5F-1E118D6BE123}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
15+
{40EBF01A-F47E-433F-9C5F-1E118D6BE123}.Debug|Any CPU.Build.0 = Debug|Any CPU
16+
{40EBF01A-F47E-433F-9C5F-1E118D6BE123}.Release|Any CPU.ActiveCfg = Release|Any CPU
17+
{40EBF01A-F47E-433F-9C5F-1E118D6BE123}.Release|Any CPU.Build.0 = Release|Any CPU
18+
EndGlobalSection
19+
GlobalSection(SolutionProperties) = preSolution
20+
HideSolutionNode = FALSE
21+
EndGlobalSection
22+
GlobalSection(ExtensibilityGlobals) = postSolution
23+
SolutionGuid = {82BE2249-6CF6-4098-8CFB-FE96ABAEE376}
24+
EndGlobalSection
25+
EndGlobal
Lines changed: 184 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,184 @@
1+
using Microsoft.AspNetCore.Mvc;
2+
using OCR_with_Tesseract_in_Docker_on_Linux.Models;
3+
using Syncfusion.Drawing;
4+
using Syncfusion.OCRProcessor;
5+
using Syncfusion.Pdf.Graphics;
6+
using Syncfusion.Pdf.Parsing;
7+
using System.Diagnostics;
8+
using System.Xml.Linq;
9+
10+
namespace OCR_with_Tesseract_in_Docker_on_Linux.Controllers
11+
{
12+
/// <summary>
/// MVC controller for the sample site. Besides the template pages it exposes
/// <see cref="PerformOCR"/>, which runs OCR over a bundled PDF and returns the
/// processed document as a download.
/// </summary>
public class HomeController : Controller
{
    private readonly ILogger<HomeController> _logger;

    /// <summary>Creates the controller with the logger supplied by dependency injection.</summary>
    /// <param name="logger">Logger for this controller.</param>
    public HomeController(ILogger<HomeController> logger)
    {
        _logger = logger;
    }

    /// <summary>Renders the default view for the Index action.</summary>
    public IActionResult Index()
    {
        return View();
    }

    /// <summary>Renders the default view for the Privacy action.</summary>
    public IActionResult Privacy()
    {
        return View();
    }

    /// <summary>
    /// Loads <c>Data/Input.pdf</c>, runs OCR on it through <see cref="Tesseract5OCREngine"/>,
    /// and returns the resulting PDF as a file download named <c>Sample.pdf</c>.
    /// </summary>
    /// <returns>A <see cref="FileStreamResult"/> with content type <c>application/pdf</c>.</returns>
    public IActionResult PerformOCR()
    {
        string docPath = Path.GetFullPath(@"Data/Input.pdf");

        // The output stream must outlive this method: FileStreamResult reads it while the
        // response is being written (after the action has returned) and disposes it itself.
        // Wrapping it in a using block here would hand the framework a closed stream.
        MemoryStream stream = new MemoryStream();
        try
        {
            //Initialize the OCR processor.
            using (OCRProcessor processor = new OCRProcessor())
            using (FileStream fileStream = new FileStream(docPath, FileMode.Open, FileAccess.Read))
            {
                //Load a PDF document
                PdfLoadedDocument lDoc = new PdfLoadedDocument(fileStream);
                try
                {
                    //Set OCR language to process
                    processor.Settings.Language = Languages.English;
                    IOcrEngine tesseractEngine = new Tesseract5OCREngine();
                    processor.ExternalEngine = tesseractEngine;
                    //Process OCR by providing the PDF document.
                    processor.PerformOCR(lDoc);
                    //Save the document to memory stream
                    lDoc.Save(stream);
                }
                finally
                {
                    // Close the document even when OCR or saving fails.
                    lDoc.Close();
                }
            }
        }
        catch
        {
            // Nothing will consume the stream on failure, so release it here.
            stream.Dispose();
            throw;
        }

        //Set the position as '0'
        stream.Position = 0;
        //Download the PDF document in the browser
        FileStreamResult fileStreamResult = new FileStreamResult(stream, "application/pdf");
        fileStreamResult.FileDownloadName = "Sample.pdf";
        return fileStreamResult;
    }

    /// <summary>
    /// Renders the error view with the current activity id, or the HTTP trace identifier
    /// when no activity is active. Response caching is disabled for this action.
    /// </summary>
    [ResponseCache(Duration = 0, Location = ResponseCacheLocation.None, NoStore = true)]
    public IActionResult Error()
    {
        return View(new ErrorViewModel { RequestId = Activity.Current?.Id ?? HttpContext.TraceIdentifier });
    }
}
67+
// Tesseract5OcrEngine implementation
68+
/// <summary>
/// <see cref="IOcrEngine"/> implementation that shells out to the <c>tesseract</c>
/// command-line tool, asks it for hOCR output, and converts that output into an
/// <see cref="OCRLayoutResult"/>.
/// </summary>
class Tesseract5OCREngine : IOcrEngine
{
    private float imageHeight;
    private float imageWidth;

    /// <summary>
    /// Runs Tesseract over the image contained in <paramref name="stream"/> and returns
    /// the recognized pages, lines and words with their bounding boxes.
    /// </summary>
    /// <param name="stream">Readable image stream; it is rewound with <c>Position = 0</c>, so it must support seeking.</param>
    /// <returns>The layout result, with <c>ImageWidth</c>/<c>ImageHeight</c> set from the decoded image.</returns>
    /// <exception cref="ArgumentException">The stream is null or not readable.</exception>
    /// <exception cref="InvalidOperationException">Tesseract failed, produced no hOCR file, or produced empty output.</exception>
    public OCRLayoutResult PerformOCR(Stream stream)
    {
        if (stream == null || !stream.CanRead)
        {
            throw new ArgumentException("Input stream is null or not readable for OCR.", nameof(stream));
        }

        stream.Position = 0;

        // Decode a private copy of the image just to learn its dimensions.
        using (MemoryStream tempMemStream = new MemoryStream())
        {
            stream.CopyTo(tempMemStream);
            tempMemStream.Position = 0;
            PdfTiffImage pdfTiffImage = new PdfTiffImage(tempMemStream);
            imageHeight = pdfTiffImage.Height;
            imageWidth = pdfTiffImage.Width;
        }

        string tempImageFile = Path.GetTempFileName();
        // The image path is also passed to tesseract as the output base, and the
        // result is expected at "<base>.hocr".
        string tempHocrFile = tempImageFile + ".hocr";
        string hocrText;

        try
        {
            // Write stream to temp image file
            using (FileStream tempFileStream = new FileStream(tempImageFile, FileMode.Create, FileAccess.Write))
            {
                stream.Position = 0;
                stream.CopyTo(tempFileStream);
            }

            hocrText = RunTesseract(tempImageFile, tempHocrFile);
        }
        finally
        {
            // Clean up temp files on success and on every failure path.
            // File.Delete does not throw when the file does not exist.
            File.Delete(tempImageFile);
            File.Delete(tempHocrFile);
        }

        if (string.IsNullOrEmpty(hocrText))
        {
            throw new InvalidOperationException("HOCR text could not be generated or was empty.");
        }

        var ocrLayoutResult = new OCRLayoutResult();
        BuildOCRLayoutResult(ocrLayoutResult, hocrText, imageWidth, imageHeight);
        ocrLayoutResult.ImageWidth = imageWidth;
        ocrLayoutResult.ImageHeight = imageHeight;

        return ocrLayoutResult;
    }

    // Runs "tesseract <image> <image> -l eng hocr" and returns the text of the hOCR file.
    // hocrPath must be the image path with ".hocr" appended, since the image path is the output base.
    private static string RunTesseract(string imagePath, string hocrPath)
    {
        ProcessStartInfo startInfo = new ProcessStartInfo
        {
            FileName = "tesseract",
            Arguments = $"\"{imagePath}\" \"{imagePath}\" -l eng hocr",
            RedirectStandardError = true,
            UseShellExecute = false,
            CreateNoWindow = true
        };

        using (Process process = new Process { StartInfo = startInfo })
        {
            process.Start();
            // Drain stderr before waiting so a full pipe cannot block the child process.
            string errorOutput = process.StandardError.ReadToEnd();
            process.WaitForExit();

            if (process.ExitCode != 0)
            {
                throw new InvalidOperationException($"Tesseract process failed with exit code {process.ExitCode}. Error: {errorOutput}");
            }

            if (!File.Exists(hocrPath))
            {
                throw new InvalidOperationException("HOCR output file not found. Tesseract might have failed or not produced output.");
            }

            return File.ReadAllText(hocrPath);
        }
    }

    // Walks the hOCR XHTML (ocr_page -> ocr_line/ocr_header -> ocrx_word) and appends one
    // Page per ocr_page element to 'ocr'. Word rectangles come from the "bbox x0 y0 x1 y1"
    // entry at the start of each word's title attribute.
    void BuildOCRLayoutResult(OCRLayoutResult ocr, string hOcrText, float imageWidth, float imageHeight)
    {
        var doc = XDocument.Parse(hOcrText, LoadOptions.None);
        // Must be an XNamespace, not a string: XNamespace + string yields a namespace-qualified
        // XName, whereas string + string yields "http://...xhtmldiv", which is not a valid
        // element name and makes the XName conversion throw.
        XNamespace ns = "http://www.w3.org/1999/xhtml";

        foreach (var pageElement in doc.Descendants(ns + "div").Where(d => d.Attribute("class")?.Value == "ocr_page"))
        {
            Page ocrPage = new Page();

            foreach (var lineElement in pageElement.Descendants(ns + "span")
                .Where(s => s.Attribute("class")?.Value == "ocr_line" || s.Attribute("class")?.Value == "ocr_header"))
            {
                Line ocrLine = new Line();

                foreach (var wordElement in lineElement.Descendants(ns + "span")
                    .Where(s => s.Attribute("class")?.Value == "ocrx_word"))
                {
                    Word ocrWord = new Word { Text = wordElement.Value };
                    string? title = wordElement.Attribute("title")?.Value;

                    if (title != null)
                    {
                        string bboxString = title.Split(';')[0].Replace("bbox", "").Trim();
                        // hOCR coordinates are machine-written integers; parse them culture-independently.
                        int[] coords = bboxString
                            .Split(' ', StringSplitOptions.RemoveEmptyEntries)
                            .Select(c => int.Parse(c, System.Globalization.CultureInfo.InvariantCulture))
                            .ToArray();

                        if (coords.Length == 4)
                        {
                            // bbox is (left, top, right, bottom); convert to x, y, width, height.
                            float x = coords[0];
                            float y = coords[1];
                            float width = coords[2] - coords[0];
                            float height = coords[3] - coords[1];
                            ocrWord.Rectangle = new RectangleF(x, y, width, height);
                        }
                    }

                    ocrLine.Add(ocrWord);
                }

                ocrPage.Add(ocrLine);
            }

            ocr.Add(ocrPage);
        }
    }
}
184+
}
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# See https://aka.ms/customizecontainer to learn how to customize your debug container and how Visual Studio uses this Dockerfile to build your images for faster debugging.
2+
3+
4+
# This stage is used when running from VS in fast mode (Default for Debug configuration)
5+
# The stage must be named "base" because the final stage starts with "FROM base".
# Use the aspnet image: the project is a Microsoft.NET.Sdk.Web app published without an
# app host, so it needs the ASP.NET Core shared framework, which the plain runtime image lacks.
FROM mcr.microsoft.com/dotnet/aspnet:8.0 AS base
# Install the tesseract CLI (invoked by the app) while still root, then drop the apt cache.
RUN apt-get update && apt-get install -y tesseract-ocr && rm -rf /var/lib/apt/lists/*
USER $APP_UID
WORKDIR /app
9+
10+
11+
# This stage is used to build the service project
12+
FROM mcr.microsoft.com/dotnet/sdk:8.0 AS build
13+
ARG BUILD_CONFIGURATION=Release
14+
WORKDIR /src
15+
COPY ["OCR-with-Tesseract-in-Docker-on-Linux/OCR-with-Tesseract-in-Docker-on-Linux.csproj", "OCR-with-Tesseract-in-Docker-on-Linux/"]
16+
RUN dotnet restore "./OCR-with-Tesseract-in-Docker-on-Linux/OCR-with-Tesseract-in-Docker-on-Linux.csproj"
17+
COPY . .
18+
WORKDIR "/src/OCR-with-Tesseract-in-Docker-on-Linux"
19+
RUN dotnet build "./OCR-with-Tesseract-in-Docker-on-Linux.csproj" -c $BUILD_CONFIGURATION -o /app/build
20+
21+
# This stage is used to publish the service project to be copied to the final stage
22+
FROM build AS publish
23+
ARG BUILD_CONFIGURATION=Release
24+
RUN dotnet publish "./OCR-with-Tesseract-in-Docker-on-Linux.csproj" -c $BUILD_CONFIGURATION -o /app/publish /p:UseAppHost=false
25+
26+
# This stage is used in production or when running from VS in regular mode (Default when not using the Debug configuration)
27+
FROM base AS final
28+
WORKDIR /app
29+
COPY --from=publish /app/publish .
30+
ENTRYPOINT ["dotnet", "OCR-with-Tesseract-in-Docker-on-Linux.dll"]
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
namespace OCR_with_Tesseract_in_Docker_on_Linux.Models
2+
{
3+
/// <summary>View model for the error page.</summary>
public class ErrorViewModel
{
    /// <summary>Identifier of the failed request, if one is available.</summary>
    public string? RequestId { get; set; }

    /// <summary>True when <see cref="RequestId"/> is neither null nor empty.</summary>
    public bool ShowRequestId
    {
        get
        {
            if (RequestId is null)
            {
                return false;
            }

            return RequestId.Length != 0;
        }
    }
}
9+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
<Project Sdk="Microsoft.NET.Sdk.Web">
2+
3+
<PropertyGroup>
4+
<TargetFramework>net8.0</TargetFramework>
5+
<Nullable>enable</Nullable>
6+
<ImplicitUsings>enable</ImplicitUsings>
7+
<RootNamespace>OCR_with_Tesseract_in_Docker_on_Linux</RootNamespace>
8+
<UserSecretsId>16743565-eaf2-4e18-8eb6-e6ba08388c1f</UserSecretsId>
9+
<DockerDefaultTargetOS>Linux</DockerDefaultTargetOS>
10+
</PropertyGroup>
11+
12+
<ItemGroup>
13+
<PackageReference Include="Microsoft.VisualStudio.Azure.Containers.Tools.Targets" Version="1.22.1" />
14+
<PackageReference Include="Syncfusion.PDF.OCR.Net.Core" Version="31.2.3" />
15+
</ItemGroup>
16+
17+
</Project>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
<?xml version="1.0" encoding="utf-8"?>
2+
<Project ToolsVersion="Current" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
3+
<PropertyGroup>
4+
<ActiveDebugProfile>IIS Express</ActiveDebugProfile>
5+
</PropertyGroup>
6+
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|AnyCPU'">
7+
<DebuggerFlavor>ProjectDebugger</DebuggerFlavor>
8+
</PropertyGroup>
9+
</Project>
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
// Application entry point: builds the MVC host and wires the request pipeline.
WebApplicationBuilder builder = WebApplication.CreateBuilder(args);

// Add services to the container.
builder.Services.AddControllersWithViews();

WebApplication app = builder.Build();

// Configure the HTTP request pipeline.
bool isDevelopment = app.Environment.IsDevelopment();
if (!isDevelopment)
{
    // Outside Development, route unhandled exceptions to the Home/Error action.
    app.UseExceptionHandler("/Home/Error");
    // The default HSTS value is 30 days. You may want to change this for production scenarios, see https://aka.ms/aspnetcore-hsts.
    app.UseHsts();
}

app.UseHttpsRedirection();
app.UseStaticFiles();

app.UseRouting();

app.UseAuthorization();

// Conventional route: /{controller}/{action}/{id?}, defaulting to Home/Index.
app.MapControllerRoute("default", "{controller=Home}/{action=Index}/{id?}");

app.Run();
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
{
2+
"profiles": {
3+
"http": {
4+
"commandName": "Project",
5+
"launchBrowser": true,
6+
"environmentVariables": {
7+
"ASPNETCORE_ENVIRONMENT": "Development"
8+
},
9+
"dotnetRunMessages": true,
10+
"applicationUrl": "http://localhost:5294"
11+
},
12+
"https": {
13+
"commandName": "Project",
14+
"launchBrowser": true,
15+
"environmentVariables": {
16+
"ASPNETCORE_ENVIRONMENT": "Development"
17+
},
18+
"dotnetRunMessages": true,
19+
"applicationUrl": "https://localhost:7239;http://localhost:5294"
20+
},
21+
"IIS Express": {
22+
"commandName": "IISExpress",
23+
"launchBrowser": true,
24+
"environmentVariables": {
25+
"ASPNETCORE_ENVIRONMENT": "Development"
26+
}
27+
},
28+
"Container (Dockerfile)": {
29+
"commandName": "Docker",
30+
"launchBrowser": true,
31+
"launchUrl": "{Scheme}://{ServiceHost}:{ServicePort}",
32+
"environmentVariables": {
33+
"ASPNETCORE_HTTPS_PORTS": "8081",
34+
"ASPNETCORE_HTTP_PORTS": "8080"
35+
},
36+
"publishAllPorts": true,
37+
"useSSL": true
38+
}
39+
},
40+
"$schema": "http://json.schemastore.org/launchsettings.json",
41+
"iisSettings": {
42+
"windowsAuthentication": false,
43+
"anonymousAuthentication": true,
44+
"iisExpress": {
45+
"applicationUrl": "http://localhost:25397",
46+
"sslPort": 44375
47+
}
48+
}
49+
}

0 commit comments

Comments
 (0)