Skip to content

Commit

Permalink
Sdk: Make OCR sdk-friendly
Browse files Browse the repository at this point in the history
  • Loading branch information
cyanfish committed Dec 29, 2023
1 parent 9a6fb36 commit dd83057
Show file tree
Hide file tree
Showing 10 changed files with 100 additions and 24 deletions.
7 changes: 1 addition & 6 deletions NAPS2.Lib/Modules/CommonModule.cs
Original file line number Diff line number Diff line change
Expand Up @@ -112,12 +112,7 @@ protected override void Load(ContainerBuilder builder)
}).SingleInstance();
builder.Register<IOcrEngine>(ctx =>
{
var tesseractPath = PlatformCompat.System.UseSystemTesseract
? "tesseract"
: NativeLibrary.FindExePath(PlatformCompat.System.TesseractExecutableName);
var engine = new TesseractOcrEngine(
tesseractPath,
ctx.Resolve<TesseractLanguageManager>().TessdataBasePath);
var engine = TesseractOcrEngine.BundledWithModes(ctx.Resolve<TesseractLanguageManager>().TessdataBasePath);
var errorOutput = ctx.Resolve<ErrorOutput>();
engine.OcrError += (_, args) => errorOutput.DisplayError(SdkResources.OcrError, args.Exception);
engine.OcrTimeout += (_, _) => errorOutput.DisplayError(SdkResources.OcrTimeout);
Expand Down
43 changes: 43 additions & 0 deletions NAPS2.Sdk.Samples/OcrSample.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
using NAPS2.Images.Gdi;
using NAPS2.Ocr;
using NAPS2.Pdf;
using NAPS2.Scan;

namespace NAPS2.Sdk.Samples;

public class OcrSample
{
    /// <summary>
    /// Scans from the first device returned by the scan controller and exports the scanned images to "doc.pdf"
    /// with OCR enabled for the "eng" language code.
    /// </summary>
    public static async Task OcrAndExportPdf()
    {
        // OCR during PDF export needs a Tesseract executable. Install the optional NAPS2.Tesseract.Binaries NuGet
        // package, rely on a system-installed Tesseract, or point at a Tesseract EXE of your own.

        using var scanningContext = new ScanningContext(new GdiImageContext());

        // The three assignments below are alternatives shown side by side. Each one replaces the previous value,
        // so in real code keep only the option you need.

        // Option 1: the bundled executable. NAPS2.Tesseract.Binaries does not ship the language data itself
        // (1GB+ for 100+ languages). Download the .traineddata files you need from one of these repos:
        // - https://github.com/tesseract-ocr/tessdata_fast
        // - https://github.com/tesseract-ocr/tessdata_best
        // and pass the folder that contains them.
        scanningContext.OcrEngine = TesseractOcrEngine.Bundled(@"C:\path\to\my\traineddata\files\");

        // Option 2: Tesseract is already installed and on the system PATH, so no extra packages or downloads
        // are needed.
        scanningContext.OcrEngine = TesseractOcrEngine.System();

        // Option 3: an explicit path to the tesseract EXE.
        scanningContext.OcrEngine = TesseractOcrEngine.Custom(@"C:\path\to\tesseract.exe");

        // Scan with the first device that is found.
        var scanController = new ScanController(scanningContext);
        var availableDevices = await scanController.GetDeviceList();
        var firstDevice = availableDevices.First();
        var scanOptions = new ScanOptions { Device = firstDevice };
        var scannedImages = await scanController.Scan(scanOptions).ToListAsync();

        // Write the PDF with OCR.
        var exporter = new PdfExporter(scanningContext);
        // The OCR language code comes from the name of the .traineddata file; the available codes are listed at
        // https://tesseract-ocr.github.io/tessdoc/Data-Files#data-files-for-version-400-november-29-2016
        var englishOcr = new OcrParams("eng");
        await exporter.Export("doc.pdf", scannedImages, ocrParams: englishOcr);
    }
}
3 changes: 1 addition & 2 deletions NAPS2.Sdk.Tests/ContextualTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,7 @@ public void SetUpOcr()
var tesseractPath = NativeLibrary.FindExePath(PlatformCompat.System.TesseractExecutableName, depsRoot);
CopyResourceToFile(BinaryResources.eng_traineddata, fast, "eng.traineddata");
CopyResourceToFile(BinaryResources.heb_traineddata, fast, "heb.traineddata");
ScanningContext.OcrEngine =
new TesseractOcrEngine(tesseractPath, FolderPath);
ScanningContext.OcrEngine = TesseractOcrEngine.CustomWithModes(tesseractPath, FolderPath);
}

// Convenience overload: forwards to SetUpFakeOcr with a default-constructed argument.
public void SetUpFakeOcr()
{
    SetUpFakeOcr(new());
}
Expand Down
2 changes: 1 addition & 1 deletion NAPS2.Sdk/Ocr/OcrParams.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
/// For language codes, see
/// https://tesseract-ocr.github.io/tessdoc/Data-Files#data-files-for-version-400-november-29-2016
/// </summary>
public record OcrParams(string? LanguageCode, OcrMode Mode, double TimeoutInSeconds)
public record OcrParams(string? LanguageCode, OcrMode Mode = OcrMode.Default, double TimeoutInSeconds = 0)
{
private OcrParams()
: this(null, OcrMode.Default, 0)
Expand Down
56 changes: 51 additions & 5 deletions NAPS2.Sdk/Ocr/TesseractOcrEngine.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,62 @@
using System.Xml;
using Microsoft.Extensions.Logging;
using NAPS2.Scan;
using NAPS2.Unmanaged;

namespace NAPS2.Ocr;

public class TesseractOcrEngine : IOcrEngine
{
private readonly string _tesseractPath;
private readonly string? _languageDataBasePath;
private readonly bool _withModes;

public TesseractOcrEngine(string tesseractPath, string? languageDataBasePath = null)
/// <summary>
/// Creates a TesseractOcrEngine that runs the "tesseract" executable found on the system PATH and uses the
/// language data installed alongside it (no language data folder is supplied).
/// </summary>
public static TesseractOcrEngine System()
{
    return new TesseractOcrEngine("tesseract");
}

/// <summary>
/// Creates a TesseractOcrEngine that runs the Tesseract executable shipped in the NAPS2.Tesseract.Binaries
/// NuGet package and reads .traineddata language files directly from the given folder.
/// </summary>
public static TesseractOcrEngine Bundled(string languageDataPath)
{
    return new TesseractOcrEngine(BundlePath, languageDataPath, withModes: false);
}

/// <summary>
/// Creates a TesseractOcrEngine that runs the Tesseract executable shipped in the NAPS2.Tesseract.Binaries
/// NuGet package. The given folder must contain "best" and "fast" subfolders; the .traineddata files are read
/// from whichever subfolder matches the OcrMode in use.
/// </summary>
public static TesseractOcrEngine BundledWithModes(string languageDataBasePath)
{
    return new TesseractOcrEngine(BundlePath, languageDataBasePath, withModes: true);
}

/// <summary>
/// Creates a TesseractOcrEngine that runs the Tesseract executable at the given path. If a language data folder
/// is supplied, .traineddata files are read directly from it.
/// </summary>
public static TesseractOcrEngine Custom(string tesseractExePath, string? languageDataPath = null)
{
    return new TesseractOcrEngine(tesseractExePath, languageDataPath, withModes: false);
}

/// <summary>
/// Creates a TesseractOcrEngine that runs the Tesseract executable at the given path. The given language data
/// folder must contain "best" and "fast" subfolders; the .traineddata files are read from whichever subfolder
/// matches the OcrMode in use.
/// </summary>
public static TesseractOcrEngine CustomWithModes(string tesseractExePath, string languageDataBasePath)
{
    return new TesseractOcrEngine(tesseractExePath, languageDataBasePath, withModes: true);
}

// Path of the bundled Tesseract executable, looked up through NativeLibrary.FindExePath on every access
// using the executable name reported by PlatformCompat.System.
private static string BundlePath
{
    get { return NativeLibrary.FindExePath(PlatformCompat.System.TesseractExecutableName); }
}

// Private so that instances are only created through the static factory methods. Stores the executable path,
// the optional language data folder, and whether that folder is split into mode-specific subfolders.
private TesseractOcrEngine(string tesseractPath, string? languageDataBasePath = null, bool withModes = true)
{
    (_tesseractPath, _languageDataBasePath, _withModes) = (tesseractPath, languageDataBasePath, withModes);
}

public async Task<OcrResult?> ProcessImage(ScanningContext scanningContext, string imagePath, OcrParams ocrParams,
CancellationToken cancelToken)
{
Expand All @@ -36,8 +78,12 @@ public TesseractOcrEngine(string tesseractPath, string? languageDataBasePath = n
};
if (_languageDataBasePath != null)
{
string subfolder = ocrParams.Mode == OcrMode.Best ? "best" : "fast";
string languageDataPath = Path.Combine(_languageDataBasePath, subfolder);
string languageDataPath = _languageDataBasePath;
if (_withModes)
{
string subfolder = ocrParams.Mode == OcrMode.Best ? "best" : "fast";
languageDataPath = Path.Combine(languageDataPath, subfolder);
}
startInfo.EnvironmentVariables["TESSDATA_PREFIX"] = languageDataPath;
var tessdata = new DirectoryInfo(languageDataPath);
EnsureHocrConfigExists(tessdata);
Expand Down Expand Up @@ -193,7 +239,7 @@ private void EnsureHocrConfigExists(DirectoryInfo tessdata)
}
return bounds;
}

// TODO: Consider adding back CanProcess, or otherwise using this code to get the languages from a system engine
// private void CheckIfInstalled()
// {
Expand Down
2 changes: 0 additions & 2 deletions NAPS2.Sdk/Platform/ISystemCompat.cs
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,6 @@ internal interface ISystemCompat

bool ShouldRememberBackgroundOperations { get; }

bool UseSystemTesseract { get; }

bool RenderInWorker { get; }

bool SupportsWinX86Worker { get; }
Expand Down
2 changes: 0 additions & 2 deletions NAPS2.Sdk/Platform/LinuxSystemCompat.cs
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,6 @@ internal class LinuxSystemCompat : ISystemCompat

public bool ShouldRememberBackgroundOperations => true;

public bool UseSystemTesseract => false;

public bool RenderInWorker => false;

public bool SupportsWinX86Worker => false;
Expand Down
2 changes: 0 additions & 2 deletions NAPS2.Sdk/Platform/MacSystemCompat.cs
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,6 @@ internal class MacSystemCompat : ISystemCompat

public bool ShouldRememberBackgroundOperations => true;

public bool UseSystemTesseract => false;

public bool RenderInWorker => false;

public bool SupportsWinX86Worker => false;
Expand Down
2 changes: 0 additions & 2 deletions NAPS2.Sdk/Platform/WindowsSystemCompat.cs
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,6 @@ internal abstract class WindowsSystemCompat : ISystemCompat

public bool ShouldRememberBackgroundOperations => true;

public bool UseSystemTesseract => false;

public bool RenderInWorker => true;

public bool SupportsWinX86Worker => true;
Expand Down
5 changes: 3 additions & 2 deletions NAPS2.Sdk/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,11 @@ NAPS2.Sdk is modular, and depending on your needs you may have to reference a di
- **[NAPS2.Sdk.Worker.Win32](https://www.nuget.org/packages/NAPS2.Sdk.Worker.Win32/)**
- For scanning with [TWAIN on Windows](https://github.com/cyanfish/naps2/blob/master/NAPS2.Sdk.Samples/TwainSample.cs).
- **[NAPS2.Pdfium.Binaries](https://www.nuget.org/packages/NAPS2.Pdfium.Binaries/)**
- For [importing PDFs]().
- For [importing PDFs](https://github.com/cyanfish/naps2/blob/master/NAPS2.Sdk.Samples/PdfImportSample.cs).
- **[NAPS2.Sane.Binaries](https://www.nuget.org/packages/NAPS2.Sane.Binaries/)**
- For [using SANE drivers]() on Mac. (Linux has them pre-installed, and Windows isn't supported.)
- **[NAPS2.Tesseract.Binaries](https://www.nuget.org/packages/NAPS2.Tesseract.Binaries/)**
- For [running OCR](). (You can also use a separate Tesseract installation if you like.)
- For [running OCR](https://github.com/cyanfish/naps2/blob/master/NAPS2.Sdk.Samples/OcrSample.cs). (You can also use a separate Tesseract installation if you like.)
- **[NAPS2.Escl.Server](https://www.nuget.org/packages/NAPS2.Escl.Server/)**
- For [sharing scanners](https://github.com/cyanfish/naps2/blob/master/NAPS2.Sdk.Samples/NetworkSharingSample.cs) across the local network.

Expand Down Expand Up @@ -73,6 +73,7 @@ More [samples](https://github.com/cyanfish/naps2/tree/master/NAPS2.Sdk.Samples):
- [Scan with TWAIN drivers](https://github.com/cyanfish/naps2/blob/master/NAPS2.Sdk.Samples/TwainSample.cs)
- [Scan to System.Drawing.Bitmap](https://github.com/cyanfish/naps2/blob/master/NAPS2.Sdk.Samples/ScanToBitmapSample.cs)
- [Import and export PDFs](https://github.com/cyanfish/naps2/blob/master/NAPS2.Sdk.Samples/PdfImportSample.cs)
- [Export PDFs with OCR](https://github.com/cyanfish/naps2/blob/master/NAPS2.Sdk.Samples/OcrSample.cs)
- [Store image data on the filesystem](https://github.com/cyanfish/naps2/blob/master/NAPS2.Sdk.Samples/FileStorageSample.cs)
- [Share scanners on the local network](https://github.com/cyanfish/naps2/blob/master/NAPS2.Sdk.Samples/NetworkSharingSample.cs)

Expand Down

0 comments on commit dd83057

Please sign in to comment.