diff --git a/CHANGELOG.md b/CHANGELOG.md index 8a087fd..71f5271 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,20 @@ # Changelog +### [3.2.16] - 2026-04-24 +- Added global `Utils.MuPDFLock` and synchronized MuPDF native calls for improved thread safety. +- Improved Tesseract OCR stability in the `PDF4LLM` OCR pipeline and hardened OCR helper behavior. +- Fixed a regression in Llama `LoadData` and added a new `TableExtract` demo sample. +- Updated `PDF4LLM` package metadata and NuGet project files. + +### [3.2.15] - 2026-04-17 +- Migrated the helper package from `MuPDF.NET4LLM` to `PDF4LLM` and refreshed the package layout, demos, and documentation. +- Added file-path overloads for `ToMarkdown`, `ToJson`, and `ToText` helpers. +- Updated `PDF4LLM` package support for the latest MuPDF bindings and metadata. + +### [3.2.14] - 2026-03-23 +- Fixed issue #234 in page/text utilities and added a regression test in `UtilsTest`. +- Minor `PDF4LLM` documentation and comment updates. + ### [3.2.13] - 2026-03-18 - Added **MuPDF.NET4LLM** as a separate NuGet package: LLM/RAG helpers for PDF-to-Markdown conversion, layout parsing, document structure analysis, and LlamaIndex integration. Install via `dotnet add package MuPDF.NET4LLM`; depends on MuPDF.NET. - Fixed `DocumentWriter` leak in `Story.WriteWithLinks` and `Story.WriteStabilizedWithLinks` (dispose via `using`). diff --git a/Demo/Program.cs b/Demo/Program.cs index 765247c..c5d29c0 100644 --- a/Demo/Program.cs +++ b/Demo/Program.cs @@ -1,4 +1,6 @@ -namespace Demo +using System.Threading.Tasks; + +namespace Demo { /// /// GitHub samples entry point. With no arguments, all samples run; see . 
diff --git a/Demo/SampleMenu.cs b/Demo/SampleMenu.cs index 25c73f2..3a4b1af 100644 --- a/Demo/SampleMenu.cs +++ b/Demo/SampleMenu.cs @@ -80,6 +80,7 @@ private sealed record Sample(string Category, string Name, string Description, A new("Regression & diagnostics", "issue-213", "Repro: drawing paths / line width", _ => Program.TestIssue213()), new("Regression & diagnostics", "issue-1880", "Repro: read Data Matrix barcodes", _ => Program.TestIssue1880()), new("Regression & diagnostics", "issue-234", "Repro: pixmap scale + insert image", _ => Program.TestIssue234()), + new("Regression & diagnostics", "pixmap-parallel", "Repro: parallel Pixmap.ToBytes rendering", _ => Program.TestPixmapParallel()), new("Regression & diagnostics", "jbig2", "Rewrite images with FAX recompression", _ => Program.TestRecompressJBIG2()), }; diff --git a/Demo/Samples/Regression/Program.Regression.cs b/Demo/Samples/Regression/Program.Regression.cs index e9b10b8..26842f5 100644 --- a/Demo/Samples/Regression/Program.Regression.cs +++ b/Demo/Samples/Regression/Program.Regression.cs @@ -1,3 +1,5 @@ +using System.Threading.Tasks; + namespace Demo { internal partial class Program @@ -16,6 +18,8 @@ internal static void TestIssue234() page.Dispose(); doc.Save("issue_234.pdf"); doc.Close(); + + Console.WriteLine("Saved issue_234.pdf"); } internal static void TestRecompressJBIG2() @@ -35,6 +39,8 @@ internal static void TestRecompressJBIG2() doc.Save(@"e:\TestRecompressJBIG2.pdf"); doc.Close(); + + Console.WriteLine("Saved e:\\TestRecompressJBIG2.pdf"); } internal static void TestIssue1880() @@ -164,5 +170,34 @@ internal static void TestIssue213() //writer.Close(); } + internal static void TestPixmapParallel() + { + const int iterations = 300; + const int degreeOfParallelism = 10; + + var pdfPath = Path.Combine(@"..\..\..\TestDocuments\TestPdf1.pdf"); + var pdf = File.ReadAllBytes(pdfPath); + + Console.WriteLine($"MuPDF.NET parallel Pixmap.ToBytes repro"); + Console.WriteLine($"PDF: {pdfPath}"); + 
Console.WriteLine($"Iterations: {iterations}"); + Console.WriteLine($"Degree of parallelism: {degreeOfParallelism}"); + Console.WriteLine(); + + Parallel.ForEach( + Enumerable.Range(0, iterations), + new ParallelOptions { MaxDegreeOfParallelism = degreeOfParallelism }, + iteration => + { + using var document = new Document(stream: pdf, fileType: "pdf"); + using var page = document[0]; + using var pixmap = page.GetPixmap(new Matrix(2, 2)); + + var png = pixmap.ToBytes("png"); + Console.WriteLine($"Iteration {iteration + 1}: rendered {png.Length} bytes"); + }); + + Console.WriteLine("Completed without crashing."); + } } } diff --git a/Demo/TestDocuments/TestPdf1.pdf b/Demo/TestDocuments/TestPdf1.pdf new file mode 100644 index 0000000..ae0f872 Binary files /dev/null and b/Demo/TestDocuments/TestPdf1.pdf differ diff --git a/MuPDF.NET.Test/PixmapTest.cs b/MuPDF.NET.Test/PixmapTest.cs index 49faa2b..546916d 100644 --- a/MuPDF.NET.Test/PixmapTest.cs +++ b/MuPDF.NET.Test/PixmapTest.cs @@ -124,5 +124,65 @@ public void InvertIrect1() //Assert.Pass(); } + [Test] + public void TestPixmapToBytes() + { + // Test single pixmap creation and PNG conversion with matrix scaling + Document doc = new Document("../../../resources/cython.pdf"); + Page page = doc[0]; + Pixmap pixmap = page.GetPixmap(new Matrix(2, 2)); + + byte[] png = pixmap.ToBytes("png"); + + // Verify PNG bytes are generated and valid + Assert.That(png.Length, Is.GreaterThan(0)); + // PNG magic bytes: 137, 80, 78, 71 + Assert.That(png[0], Is.EqualTo(137)); + Assert.That(png[1], Is.EqualTo(80)); + Assert.That(png[2], Is.EqualTo(78)); + Assert.That(png[3], Is.EqualTo(71)); + + pixmap.Dispose(); + page.Dispose(); + doc.Close(); + } + + [Test] + public void TestPixmapParallel() + { + // Test parallel pixmap rendering with PNG conversion (simulating TestPixmap()) + const int iterations = 50; // Reduced from 500 for unit test performance + const int degreeOfParallelism = 4; // Reduced from 10 for unit test performance + + 
using (var document = new Document("../../../resources/cython.pdf")) + { + var renderResults = new System.Collections.Concurrent.ConcurrentBag(); + var errors = new System.Collections.Concurrent.ConcurrentBag(); + + Parallel.ForEach( + Enumerable.Range(0, iterations), + new ParallelOptions { MaxDegreeOfParallelism = degreeOfParallelism }, + iteration => + { + try + { + using var page = document[0]; + using var pixmap = page.GetPixmap(new Matrix(2, 2)); + var png = pixmap.ToBytes("png"); + renderResults.Add(png.Length); + } + catch (Exception ex) + { + errors.Add(ex); + } + }); + + // Verify all iterations completed successfully + Assert.That(errors.Count, Is.EqualTo(0), $"Parallel rendering encountered errors: {string.Join(", ", errors.Select(e => e.Message))}"); + Assert.That(renderResults.Count, Is.EqualTo(iterations), "Not all iterations completed"); + Assert.That(renderResults.All(size => size > 0), Is.True, "All PNG bytes should be valid"); + } + } + } } diff --git a/MuPDF.NET/DisplayList.cs b/MuPDF.NET/DisplayList.cs index 3e89f9e..4571980 100644 --- a/MuPDF.NET/DisplayList.cs +++ b/MuPDF.NET/DisplayList.cs @@ -33,38 +33,52 @@ public Rect Rect /// The page's rectangle. 
public DisplayList(Rect rect) { - _nativeDisplayList = new FzDisplayList(rect.ToFzRect()); + lock (Utils.MuPDFLock) + { + _nativeDisplayList = new FzDisplayList(rect.ToFzRect()); + } ThisOwn = true; } public DisplayList(DisplayList displayList) { - _nativeDisplayList = displayList.ToFzDisplayList(); + lock (Utils.MuPDFLock) + { + _nativeDisplayList = displayList.ToFzDisplayList(); + } } public DisplayList(PdfPage pdfPage, int annots = 1) { - _fzPage = new FzPage(pdfPage); - if (annots != 0) - _nativeDisplayList = mupdf.mupdf.fz_new_display_list_from_page(_fzPage); - else - _nativeDisplayList = mupdf.mupdf.fz_new_display_list_from_page_contents(_fzPage); + lock (Utils.MuPDFLock) + { + _fzPage = new FzPage(pdfPage); + if (annots != 0) + _nativeDisplayList = mupdf.mupdf.fz_new_display_list_from_page(_fzPage); + else + _nativeDisplayList = mupdf.mupdf.fz_new_display_list_from_page_contents(_fzPage); + } } public void Dispose() { if (_nativeDisplayList != null) { - _nativeDisplayList.Dispose(); + lock (Utils.MuPDFLock) + { + _nativeDisplayList.Dispose(); + } _nativeDisplayList = null; ThisOwn = false; } if (_fzPage != null) { - _fzPage.Dispose(); + lock (Utils.MuPDFLock) + { + _fzPage.Dispose(); + } _fzPage = null; - ThisOwn = false; } } diff --git a/MuPDF.NET/Document.cs b/MuPDF.NET/Document.cs index a775bfb..ce1dbe4 100644 --- a/MuPDF.NET/Document.cs +++ b/MuPDF.NET/Document.cs @@ -122,7 +122,10 @@ public int PageCount if (IsClosed) throw new Exception("document closed"); - return _nativeDocument.fz_count_pages(); + lock (Utils.MuPDFLock) + { + return _nativeDocument.fz_count_pages(); + } } } @@ -152,7 +155,10 @@ public int ChapterCount if (IsClosed) throw new Exception("document closed"); - return _nativeDocument.fz_count_chapters(); + lock (Utils.MuPDFLock) + { + return _nativeDocument.fz_count_chapters(); + } } } @@ -498,103 +504,106 @@ public Document( } FzDocument doc = null; - if (stream != null) + lock (Utils.MuPDFLock) { - IntPtr dataPtr = 
Marshal.AllocHGlobal(stream.Length); - Marshal.Copy(stream, 0, dataPtr, stream.Length); - SWIGTYPE_p_unsigned_char swigData = new SWIGTYPE_p_unsigned_char(dataPtr, true); - FzStream data = mupdf.mupdf.fz_open_memory(swigData, (uint)stream.Length); - if (string.IsNullOrEmpty(fileName) || string.IsNullOrEmpty(fileType)) - fileName = "pdf"; - - string magic = fileName; - if (magic == null) - magic = fileType; - try + if (stream != null) { - doc = mupdf.mupdf.fz_open_document_with_stream(magic, data); - } - catch(Exception e) - { - throw new Exception("Failed to open stream : " + e.Message); + IntPtr dataPtr = Marshal.AllocHGlobal(stream.Length); + Marshal.Copy(stream, 0, dataPtr, stream.Length); + SWIGTYPE_p_unsigned_char swigData = new SWIGTYPE_p_unsigned_char(dataPtr, true); + FzStream data = mupdf.mupdf.fz_open_memory(swigData, (uint)stream.Length); + if (string.IsNullOrEmpty(fileName) || string.IsNullOrEmpty(fileType)) + fileName = "pdf"; + + string magic = fileName; + if (magic == null) + magic = fileType; + try + { + doc = mupdf.mupdf.fz_open_document_with_stream(magic, data); + } + catch(Exception e) + { + throw new Exception("Failed to open stream : " + e.Message); + } + data.Dispose(); } - data.Dispose(); - } - else - { - if (!string.IsNullOrEmpty(fileName)) + else { - if (string.IsNullOrEmpty(fileType)) + if (!string.IsNullOrEmpty(fileName)) { - - try + if (string.IsNullOrEmpty(fileType)) { - doc = mupdf.mupdf.fz_open_document(fileName); - } - catch(Exception) - { - throw new Exception("Failed to open document"); + + try + { + doc = mupdf.mupdf.fz_open_document(fileName); + } + catch(Exception) + { + throw new Exception("Failed to open document"); + } } - } - else - { - fz_document_handler handler = mupdf.mupdf.ll_fz_recognize_document( - fileType - ); - if (handler != null) + else { - if (handler.open != null) + fz_document_handler handler = mupdf.mupdf.ll_fz_recognize_document( + fileType + ); + if (handler != null) { - try + if (handler.open != null) { - 
FzStream _stream = new FzStream(fileName); - FzStream accel = new FzStream(); - FzArchive archive = new FzArchive(); - // mupdf version greater than 1.25.0 - /*{ - doc = new FzDocument( - mupdf.mupdf.ll_fz_document_handler_open(handler, _stream.m_internal, accel.m_internal, archive.m_internal, null) - ); - }*/ + try { - doc = new FzDocument(mupdf.mupdf.ll_fz_document_handler_open(handler, _stream.m_internal, accel.m_internal, archive.m_internal, null)); + FzStream _stream = new FzStream(fileName); + FzStream accel = new FzStream(); + FzArchive archive = new FzArchive(); + // mupdf version greater than 1.25.0 + /*{ + doc = new FzDocument( + mupdf.mupdf.ll_fz_document_handler_open(handler, _stream.m_internal, accel.m_internal, archive.m_internal, null) + ); + }*/ + { + doc = new FzDocument(mupdf.mupdf.ll_fz_document_handler_open(handler, _stream.m_internal, accel.m_internal, archive.m_internal, null)); + } + } + catch (Exception) + { + throw new Exception( + Utils.ErrorMessages["MSG_BAD_DOCUMENT"] + ); } } - catch (Exception) + else if ( + mupdf.mupdf.FZ_VERSION_MAJOR >= 1 + && mupdf.mupdf.FZ_VERSION_MINOR >= 24 + ) { - throw new Exception( - Utils.ErrorMessages["MSG_BAD_DOCUMENT"] - ); + Debug.Assert(false); + ///////////////////////// in less than version 1.24 + /*data = mupdf.mupdf.fz_open_file(filename); + doc.m_internal = mupdf.mupdf.ll_fz_document_open_with_stream_fn_call(handler.open_with_stream, data.m_internal);*/ } } - else if ( - mupdf.mupdf.FZ_VERSION_MAJOR >= 1 - && mupdf.mupdf.FZ_VERSION_MINOR >= 24 - ) + else { - Debug.Assert(false); - ///////////////////////// in less than version 1.24 - /*data = mupdf.mupdf.fz_open_file(filename); - doc.m_internal = mupdf.mupdf.ll_fz_document_open_with_stream_fn_call(handler.open_with_stream, data.m_internal);*/ + throw new Exception(Utils.ErrorMessages["MSG_BAD_FILETYPE"]); } } - else - { - throw new Exception(Utils.ErrorMessages["MSG_BAD_FILETYPE"]); - } + } + else + { + PdfDocument pdf = new PdfDocument(); + doc = 
new FzDocument(pdf); } } - else - { - PdfDocument pdf = new PdfDocument(); - doc = new FzDocument(pdf); - } + if (w > 0 && h > 0) + doc.fz_layout_document(w, h, fontSize); + else if (doc.fz_is_document_reflowable() != 0) + doc.fz_layout_document(400, 600, 11); + _nativeDocument = doc; } - if (w > 0 && h > 0) - doc.fz_layout_document(w, h, fontSize); - else if (doc.fz_is_document_reflowable() != 0) - doc.fz_layout_document(400, 600, 11); - _nativeDocument = doc; ThisOwn = true; @@ -1318,7 +1327,11 @@ public Page LoadPage(int pageId) if (Utils.INRANGE(pageId, 0, PageCount - 1) == false) throw new Exception("document page count is not enough"); - FzPage page = _nativeDocument.fz_load_page(pageId); + FzPage page; + lock (Utils.MuPDFLock) + { + page = _nativeDocument.fz_load_page(pageId); + } Page val = new Page(page, this); val.ThisOwn = true; @@ -1342,7 +1355,11 @@ public Page LoadPage(int chapter, int pagenum) if (IsClosed || IsEncrypted) throw new Exception("document closed or encrypted"); - FzPage page = _nativeDocument.fz_load_chapter_page(chapter, pagenum); + FzPage page; + lock (Utils.MuPDFLock) + { + page = _nativeDocument.fz_load_chapter_page(chapter, pagenum); + } Page val = new Page(page, this); val.ThisOwn = true; @@ -6026,7 +6043,10 @@ public void Dispose() ResetPageRefs(); IsClosed = true; GraftMaps = new Dictionary(); - _nativeDocument.Dispose(); + lock (Utils.MuPDFLock) + { + _nativeDocument.Dispose(); + } _nativeDocument = null; } diff --git a/MuPDF.NET/MuPDF.NET.nuspec b/MuPDF.NET/MuPDF.NET.nuspec index 5360ffd..245c644 100644 --- a/MuPDF.NET/MuPDF.NET.nuspec +++ b/MuPDF.NET/MuPDF.NET.nuspec @@ -2,7 +2,7 @@ MuPDF.NET - 3.2.14 + 3.2.16 Artifex Software Inc. 
true LICENSE.md diff --git a/MuPDF.NET/Page.cs b/MuPDF.NET/Page.cs index 74ac682..2262cf0 100644 --- a/MuPDF.NET/Page.cs +++ b/MuPDF.NET/Page.cs @@ -389,33 +389,42 @@ public override string ToString() public Page(PdfPage pdfPage, Document parent) { - _pdfPage = pdfPage; - _nativePage = pdfPage.super(); - Parent = parent; + lock (Utils.MuPDFLock) + { + _pdfPage = pdfPage; + _nativePage = pdfPage.super(); + Parent = parent; - if (_pdfPage.m_internal == null) - Number = 0; - else - Number = _pdfPage.m_internal.super.number; + if (_pdfPage.m_internal == null) + Number = 0; + else + Number = _pdfPage.m_internal.super.number; + } } public Page(FzPage fzPage, Document parent) { - _pdfPage = fzPage.pdf_page_from_fz_page(); - _nativePage = fzPage; - Parent = parent; + lock (Utils.MuPDFLock) + { + _pdfPage = fzPage.pdf_page_from_fz_page(); + _nativePage = fzPage; + Parent = parent; - if (_pdfPage.m_internal == null) - Number = 0; - else - Number = _pdfPage.m_internal.super.number; + if (_pdfPage.m_internal == null) + Number = 0; + else + Number = _pdfPage.m_internal.super.number; + } } public void Dispose() { if (_pdfPage != null) { - _pdfPage.Dispose(); + lock (Utils.MuPDFLock) + { + _pdfPage.Dispose(); + } _pdfPage = null; } if (_nativePage != null) @@ -3344,42 +3353,45 @@ public Pixmap GetPixmap( bool annots = true ) { - if (matrix == null) - matrix = new Matrix(1.0f, 1.0f); - - float zoom; - if (dpi != 0) + lock (Utils.MuPDFLock) { - zoom = dpi / 72f; - matrix = new Matrix(zoom, zoom); - } + if (matrix == null) + matrix = new Matrix(1.0f, 1.0f); - ColorSpace _colorSpace; - if (string.IsNullOrEmpty(colorSpace)) - _colorSpace = new ColorSpace(Utils.CS_RGB); - else if (colorSpace.ToUpper() == "GRAY") - _colorSpace = new ColorSpace(Utils.CS_GRAY); - else if (colorSpace.ToUpper() == "CMYK") - _colorSpace = new ColorSpace(Utils.CS_CMYK); - else - _colorSpace = new ColorSpace(Utils.CS_RGB); + float zoom; + if (dpi != 0) + { + zoom = dpi / 72f; + matrix = new Matrix(zoom, zoom); 
+ } - if (!(new List() { 1, 3, 4 }).Contains(_colorSpace.N)) - throw new Exception("unsupported colorspace"); + ColorSpace _colorSpace; + if (string.IsNullOrEmpty(colorSpace)) + _colorSpace = new ColorSpace(Utils.CS_RGB); + else if (colorSpace.ToUpper() == "GRAY") + _colorSpace = new ColorSpace(Utils.CS_GRAY); + else if (colorSpace.ToUpper() == "CMYK") + _colorSpace = new ColorSpace(Utils.CS_CMYK); + else + _colorSpace = new ColorSpace(Utils.CS_RGB); - DisplayList dl = GetDisplayList(annots ? 1 : 0); - Pixmap pix = dl.GetPixmap( - matrix, - colorSpace: _colorSpace, - alpha: alpha ? 1 : 0, - clip: clip - ); - dl.Dispose(); + if (!(new List() { 1, 3, 4 }).Contains(_colorSpace.N)) + throw new Exception("unsupported colorspace"); - if (dpi != 0) - pix.SetDpi(dpi, dpi); + DisplayList dl = GetDisplayList(annots ? 1 : 0); + Pixmap pix = dl.GetPixmap( + matrix, + colorSpace: _colorSpace, + alpha: alpha ? 1 : 0, + clip: clip + ); + dl.Dispose(); - return pix; + if (dpi != 0) + pix.SetDpi(dpi, dpi); + + return pix; + } } public DisplayList GetDisplayList(int annots = 1) @@ -4414,6 +4426,10 @@ public string GetTextbox(Rect rect = null, TextPage textPage = null) /// whether to OCR the full page image, or only its images (default) /// /// Optional preprocessing filters applied to raster images before OCR. + /// + /// Maximum size of the full-page pixmap sample buffer; when a probe render exceeds it, the OCR DPI is lowered (up to three attempts) to fit. + /// Default 3,000,000 bytes (~2.9 MiB). Set lower on machines with limited RAM. 
+ /// /// public TextPage GetTextPageOcr( int flags = 0, @@ -4421,30 +4437,79 @@ public TextPage GetTextPageOcr( int dpi = 72, bool full = false, string tessdata = null, - ImageFilterPipeline imageFilters = null + ImageFilterPipeline imageFilters = null, + long maxOcrPixmapBytes = 3000000 ) { if (string.IsNullOrEmpty(Utils.TESSDATA_PREFIX) && string.IsNullOrEmpty(tessdata)) throw new Exception("No OCR support: TESSDATA_PREFIX not set"); + if (maxOcrPixmapBytes <= 0) + throw new ArgumentOutOfRangeException(nameof(maxOcrPixmapBytes), "maxOcrPixmapBytes must be greater than zero."); TextPage FullOcr(Page page, int _dpi, string _language, int _flags, ImageFilterPipeline filters) { - float zoom = _dpi / 72.0f; + if (_dpi <= 0) + throw new ArgumentOutOfRangeException(nameof(_dpi), "dpi must be greater than zero."); + + long GetPixmapBytes(Pixmap p) + { + byte[] samples = p.SAMPLES; + if (samples != null && samples.Length > 0) + return samples.Length; + return (long)p.Stride * p.H; + } + + int effectiveDpi = _dpi; + for (int i = 0; i < 3; i++) + { + float probeZoom = effectiveDpi / 72.0f; + using (Pixmap probe = page.GetPixmap(matrix: new Matrix(probeZoom, probeZoom))) + { + long probeBytes = GetPixmapBytes(probe); + if (probeBytes <= maxOcrPixmapBytes) + break; + + double scale = Math.Sqrt((double)maxOcrPixmapBytes / probeBytes); + int nextDpi = Math.Max(1, (int)Math.Floor(effectiveDpi * scale)); + if (nextDpi >= effectiveDpi) + nextDpi = effectiveDpi - 1; + if (nextDpi < 1) + nextDpi = 1; + effectiveDpi = nextDpi; + } + } + + float zoom = effectiveDpi / 72.0f; Matrix mat = new Matrix(zoom, zoom); - Pixmap pix = page.GetPixmap(matrix: mat); - if (filters != null) - pix = Pixmap.ApplyImageFilters(pix, filters); - Document ocrPdf = new Document("pdf", pix.PdfOCR2Bytes(true, _language, tessdata)); - - Page ocrPage = ocrPdf.LoadPage(0); - float unZoom = page.Rect.Width / ocrPage.Rect.Width; - Matrix ctm = new Matrix(unZoom, unZoom) * page.DerotationMatrix; - TextPage _tp = 
ocrPage.GetTextPage(flags: _flags, matrix: ctm); - ocrPdf.Close(); - - pix.Dispose(); - _tp.Parent = this; - return _tp; + using (Pixmap basePix = page.GetPixmap(matrix: mat)) + { + Pixmap workingPix = basePix; + Pixmap filteredPix = null; + try + { + if (filters != null) + { + filteredPix = Pixmap.ApplyImageFilters(basePix, filters); + if (filteredPix != null) + workingPix = filteredPix; + } + + using (Document ocrPdf = new Document("pdf", workingPix.PdfOCR2Bytes(true, _language, tessdata))) + using (Page ocrPage = ocrPdf.LoadPage(0)) + { + float unZoom = page.Rect.Width / ocrPage.Rect.Width; + Matrix ctm = new Matrix(unZoom, unZoom) * page.DerotationMatrix; + TextPage ocrTp = ocrPage.GetTextPage(flags: _flags, matrix: ctm); + ocrTp.Parent = this; + return ocrTp; + } + } + finally + { + if (filteredPix != null && !object.ReferenceEquals(filteredPix, basePix)) + filteredPix.Dispose(); + } + } } if (full) diff --git a/MuPDF.NET/Pixmap.cs b/MuPDF.NET/Pixmap.cs index 20b4e3f..81f0845 100644 --- a/MuPDF.NET/Pixmap.cs +++ b/MuPDF.NET/Pixmap.cs @@ -21,48 +21,57 @@ static Pixmap() public Pixmap(ColorSpace cs, IRect irect, int alpha = 0) { - _nativePixmap = mupdf.mupdf.fz_new_pixmap_with_bbox(cs.ToFzColorspace(), irect.ToFzIrect(), new FzSeparations(0), alpha); + lock (Utils.MuPDFLock) + { + _nativePixmap = mupdf.mupdf.fz_new_pixmap_with_bbox(cs.ToFzColorspace(), irect.ToFzIrect(), new FzSeparations(0), alpha); + } } public Pixmap(Document doc, int xref) { - PdfDocument pdf = Document.AsPdfDocument(doc); - int xrefLen = pdf.pdf_xref_len(); - if (!Utils.INRANGE(xref, 1, xrefLen - 1)) + lock (Utils.MuPDFLock) { - pdf.Dispose(); - throw new Exception(Utils.ErrorMessages["MSG_BAD_XREF"]); - } + PdfDocument pdf = Document.AsPdfDocument(doc); + int xrefLen = pdf.pdf_xref_len(); + if (!Utils.INRANGE(xref, 1, xrefLen - 1)) + { + pdf.Dispose(); + throw new Exception(Utils.ErrorMessages["MSG_BAD_XREF"]); + } - PdfObj r = pdf.pdf_new_indirect(xref, 0); - PdfObj type = 
r.pdf_dict_get(new PdfObj("Subtype")); - if (type.pdf_name_eq(new PdfObj("Image")) == 0 && - type.pdf_name_eq(new PdfObj("Alpha")) == 0 && - type.pdf_name_eq(new PdfObj("Luminosity")) == 0) - throw new Exception(Utils.ErrorMessages["MSG_IS_NO_IMAGE"]); - FzImage img = pdf.pdf_load_image(r); - FzPixmap pix = img.fz_get_pixmap_from_image(new FzIrect(Utils.FZ_MIN_INF_RECT, Utils.FZ_MIN_INF_RECT, Utils.FZ_MAX_INF_RECT, Utils.FZ_MAX_INF_RECT), - new FzMatrix(img.w(), 0, 0, img.h(), 0, 0), - null, - null - ); - _nativePixmap = pix; + PdfObj r = pdf.pdf_new_indirect(xref, 0); + PdfObj type = r.pdf_dict_get(new PdfObj("Subtype")); + if (type.pdf_name_eq(new PdfObj("Image")) == 0 && + type.pdf_name_eq(new PdfObj("Alpha")) == 0 && + type.pdf_name_eq(new PdfObj("Luminosity")) == 0) + throw new Exception(Utils.ErrorMessages["MSG_IS_NO_IMAGE"]); + FzImage img = pdf.pdf_load_image(r); + FzPixmap pix = img.fz_get_pixmap_from_image(new FzIrect(Utils.FZ_MIN_INF_RECT, Utils.FZ_MIN_INF_RECT, Utils.FZ_MAX_INF_RECT, Utils.FZ_MAX_INF_RECT), + new FzMatrix(img.w(), 0, 0, img.h(), 0, 0), + null, + null + ); + _nativePixmap = pix; - pdf.Dispose(); + pdf.Dispose(); + } } public Pixmap(Pixmap spix, float w, float h) { - FzIrect bbox = new FzIrect(mupdf.mupdf.fz_infinite_irect); - if (spix == null) - throw new Exception("bad pixmap"); - FzPixmap srcPix = spix.ToFzPixmap(); - FzPixmap pm = null; - if (bbox.fz_is_infinite_irect() == 0) - pm = srcPix.fz_scale_pixmap(srcPix.x(), srcPix.y(), w, h, bbox); - else - pm = srcPix.fz_scale_pixmap(srcPix.x(), srcPix.y(), w, h, new FzIrect(mupdf.mupdf.fz_infinite_irect)); - _nativePixmap = pm; + lock (Utils.MuPDFLock) + { + FzIrect bbox = new FzIrect(mupdf.mupdf.fz_infinite_irect); + if (spix == null) + throw new Exception("bad pixmap"); + FzPixmap srcPix = spix.ToFzPixmap(); + FzPixmap pm = null; + if (bbox.fz_is_infinite_irect() == 0) + pm = srcPix.fz_scale_pixmap(srcPix.x(), srcPix.y(), w, h, bbox); + else + pm = srcPix.fz_scale_pixmap(srcPix.x(), 
srcPix.y(), w, h, new FzIrect(mupdf.mupdf.fz_infinite_irect)); + _nativePixmap = pm; + } } /// @@ -77,8 +86,11 @@ public IRect IRect { return _irect; } - FzIrect val = _nativePixmap.fz_pixmap_bbox(); - _irect = new IRect(val); + lock (Utils.MuPDFLock) + { + FzIrect val = _nativePixmap.fz_pixmap_bbox(); + _irect = new IRect(val); + } return _irect; } @@ -91,7 +103,10 @@ public int Alpha { get { - return _nativePixmap.alpha(); + lock (Utils.MuPDFLock) + { + return _nativePixmap.alpha(); + } } } @@ -105,7 +120,13 @@ public ColorSpace ColorSpace { if (_colorSpace == null) { - _colorSpace = new ColorSpace(_nativePixmap.fz_pixmap_colorspace()); + lock (Utils.MuPDFLock) + { + if (_colorSpace == null) // Double-check after acquiring lock + { + _colorSpace = new ColorSpace(_nativePixmap.fz_pixmap_colorspace()); + } + } } return _colorSpace; } @@ -118,10 +139,13 @@ public byte[] Digest { get { - vectoruc res = _nativePixmap.fz_md5_pixmap2(); - byte[] ret = new byte[res.Count]; - res.CopyTo(ret, 0); - return ret; + lock (Utils.MuPDFLock) + { + vectoruc res = _nativePixmap.fz_md5_pixmap2(); + byte[] ret = new byte[res.Count]; + res.CopyTo(ret, 0); + return ret; + } } } @@ -132,7 +156,10 @@ public int H { get { - return _nativePixmap.fz_pixmap_height(); + lock (Utils.MuPDFLock) + { + return _nativePixmap.fz_pixmap_height(); + } } } @@ -143,7 +170,10 @@ public bool IsMonoChrome { get { - return Convert.ToBoolean(_nativePixmap.fz_is_pixmap_monochrome()); + lock (Utils.MuPDFLock) + { + return Convert.ToBoolean(_nativePixmap.fz_is_pixmap_monochrome()); + } } } @@ -154,19 +184,22 @@ public bool IsUniColor { get { - FzPixmap pm = _nativePixmap; - byte n = pm.n(); - int count = pm.w() * pm.h(); - List sample0 = PixmapReadSamples(0, n); - List sample = null; - for (int i = n; i < count; i += n) + lock (Utils.MuPDFLock) { - sample = PixmapReadSamples(i, n); - if (!sample.SequenceEqual(sample0)) - return false; + FzPixmap pm = _nativePixmap; + byte n = pm.n(); + int count = pm.w() * 
pm.h(); + List sample0 = PixmapReadSamples(0, n); + List sample = null; + for (int i = n; i < count; i += n) + { + sample = PixmapReadSamples(i, n); + if (!sample.SequenceEqual(sample0)) + return false; + } + + return true; } - - return true; } } @@ -179,7 +212,13 @@ public int N get { if (_n < 0) - _n = _nativePixmap.fz_pixmap_components(); + { + lock (Utils.MuPDFLock) + { + if (_n < 0) // Double-check after acquiring lock + _n = _nativePixmap.fz_pixmap_components(); + } + } return _n; } @@ -231,7 +270,10 @@ public long SamplesPtr { get { - return _nativePixmap.fz_pixmap_samples_int(); + lock (Utils.MuPDFLock) + { + return _nativePixmap.fz_pixmap_samples_int(); + } } } @@ -242,7 +284,10 @@ public int Size { get { - return _nativePixmap.n() * _nativePixmap.w() * _nativePixmap.h(); + lock (Utils.MuPDFLock) + { + return _nativePixmap.n() * _nativePixmap.w() * _nativePixmap.h(); + } } } @@ -255,7 +300,13 @@ public int Stride get { if (_stride < 0) - _stride = _nativePixmap.fz_pixmap_stride(); + { + lock (Utils.MuPDFLock) + { + if (_stride < 0) // Double-check after acquiring lock + _stride = _nativePixmap.fz_pixmap_stride(); + } + } return _stride; } } @@ -267,7 +318,10 @@ public int W { get { - return _nativePixmap.fz_pixmap_width(); + lock (Utils.MuPDFLock) + { + return _nativePixmap.fz_pixmap_width(); + } } } @@ -281,7 +335,13 @@ public int X { if (_x < 0) { - _x = _nativePixmap.fz_pixmap_x(); + lock (Utils.MuPDFLock) + { + if (_x < 0) // Double-check after acquiring lock + { + _x = _nativePixmap.fz_pixmap_x(); + } + } } return _x; } @@ -294,7 +354,10 @@ public int Xres { get { - return _nativePixmap.xres(); + lock (Utils.MuPDFLock) + { + return _nativePixmap.xres(); + } } } @@ -307,7 +370,13 @@ public int Y get { if (_y < 0) - _y = _nativePixmap.fz_pixmap_y(); + { + lock (Utils.MuPDFLock) + { + if (_y < 0) // Double-check after acquiring lock + _y = _nativePixmap.fz_pixmap_y(); + } + } return _y; } } @@ -319,7 +388,10 @@ public int Yres { get { - return 
_nativePixmap.yres(); + lock (Utils.MuPDFLock) + { + return _nativePixmap.yres(); + } } } @@ -428,13 +500,17 @@ public Pixmap(string arg0, Pixmap arg1) if (arg0 == "raw" && arg1 != null) { _nativePixmap = arg1.ToFzPixmap(); - int n = N; - byte[] samples = SAMPLES; - ColorSpace colorSpace = ColorSpace; - IRect irect = IRect; - int stride = Stride; - int x = X; - int y = Y; + GC.SuppressFinalize(arg1); + lock (Utils.MuPDFLock) + { + int n = N; + byte[] samples = SAMPLES; + ColorSpace colorSpace = ColorSpace; + IRect irect = IRect; + int stride = Stride; + int x = X; + int y = Y; + } } else throw new Exception("arg0 must be `raw` or arg1 must be not null."); @@ -630,37 +706,43 @@ public Pixmap(Pixmap pix, int alpha) /// internal byte[] ToBytes(int format, int jpgQuality) { - FzPixmap pixmap = _nativePixmap; - int size = pixmap.fz_pixmap_stride() * pixmap.h(); - FzBuffer res = new FzBuffer((uint)size); - FzOutput output = new FzOutput(res); + lock (Utils.MuPDFLock) + { + FzPixmap pixmap = _nativePixmap; + int size = pixmap.fz_pixmap_stride() * pixmap.h(); + FzBuffer res = new FzBuffer((uint)size); + FzOutput output = new FzOutput(res); - if (format == 1) mupdf.mupdf.fz_write_pixmap_as_png(output, pixmap); - else if (format == 2) mupdf.mupdf.fz_write_pixmap_as_pnm(output, pixmap); - else if (format == 3) mupdf.mupdf.fz_write_pixmap_as_pam(output, pixmap); - else if (format == 5) mupdf.mupdf.fz_write_pixmap_as_psd(output, pixmap); - else if (format == 6) mupdf.mupdf.fz_write_pixmap_as_ps(output, pixmap); - else if (format == 7) output.fz_write_pixmap_as_jpeg(pixmap, jpgQuality, 0); // v1.24 later - else mupdf.mupdf.fz_write_pixmap_as_png(output, pixmap); + if (format == 1) mupdf.mupdf.fz_write_pixmap_as_png(output, pixmap); + else if (format == 2) mupdf.mupdf.fz_write_pixmap_as_pnm(output, pixmap); + else if (format == 3) mupdf.mupdf.fz_write_pixmap_as_pam(output, pixmap); + else if (format == 5) mupdf.mupdf.fz_write_pixmap_as_psd(output, pixmap); + else if (format == 6) 
mupdf.mupdf.fz_write_pixmap_as_ps(output, pixmap); + else if (format == 7) output.fz_write_pixmap_as_jpeg(pixmap, jpgQuality, 0); // v1.24 later + else mupdf.mupdf.fz_write_pixmap_as_png(output, pixmap); - byte[] barray = Utils.BinFromBuffer(res); - output.fz_close_output(); - output.Dispose(); - res.Dispose(); - return barray; + byte[] barray = Utils.BinFromBuffer(res); + output.fz_close_output(); + output.Dispose(); + res.Dispose(); + return barray; + } } private void WriteImage(string filename, int format, int jpgQuality) { - FzPixmap pixmap = _nativePixmap; + lock (Utils.MuPDFLock) + { + FzPixmap pixmap = _nativePixmap; - if (format == 1) mupdf.mupdf.fz_save_pixmap_as_png(pixmap, filename); - else if (format == 2) mupdf.mupdf.fz_save_pixmap_as_pnm(pixmap, filename); - else if (format == 3) mupdf.mupdf.fz_save_pixmap_as_pam(pixmap, filename); - else if (format == 5) mupdf.mupdf.fz_save_pixmap_as_psd(pixmap, filename); - else if (format == 6) mupdf.mupdf.fz_save_pixmap_as_ps(pixmap, filename, 0); - else if (format == 7) mupdf.mupdf.fz_save_pixmap_as_jpeg(pixmap, filename, jpgQuality); - else mupdf.mupdf.fz_save_pixmap_as_png(pixmap, filename); + if (format == 1) mupdf.mupdf.fz_save_pixmap_as_png(pixmap, filename); + else if (format == 2) mupdf.mupdf.fz_save_pixmap_as_pnm(pixmap, filename); + else if (format == 3) mupdf.mupdf.fz_save_pixmap_as_pam(pixmap, filename); + else if (format == 5) mupdf.mupdf.fz_save_pixmap_as_psd(pixmap, filename); + else if (format == 6) mupdf.mupdf.fz_save_pixmap_as_ps(pixmap, filename, 0); + else if (format == 7) mupdf.mupdf.fz_save_pixmap_as_jpeg(pixmap, filename, jpgQuality); + else mupdf.mupdf.fz_save_pixmap_as_png(pixmap, filename); + } } public static int ClearPixmap_RectWithValue(Pixmap pixmap, int v = 0, FzIrect bbox = null) @@ -1223,7 +1305,7 @@ public byte[] ToBytes(string output = "png", int jpgQuality = 95) if (idx == 7) SetDpi(Xres, Yres); - + return ToBytes(idx, jpgQuality); } diff --git a/MuPDF.NET/Utils.cs 
b/MuPDF.NET/Utils.cs index 4096cac..1feab8f 100644 --- a/MuPDF.NET/Utils.cs +++ b/MuPDF.NET/Utils.cs @@ -33,6 +33,12 @@ public static class Utils public static bool IsInitialized = false; + /// + /// Global lock for thread-safe access to native MuPDF library. + /// MuPDF is not thread-safe, so all P/Invoke calls must be synchronized with this lock. + /// + public static readonly object MuPDFLock = new object(); + public static string ANNOT_ID_STEM = "fitz"; public static int SigFlag_SignaturesExist = 1; @@ -4278,53 +4284,56 @@ internal static Pixmap GetPixmapFromDisplaylist( try { - if (seps == null) + lock (Utils.MuPDFLock) { - seps = new FzSeparations(); - disposables.Add(seps); - } + if (seps == null) + { + seps = new FzSeparations(); + disposables.Add(seps); + } - FzRect rect = mupdf.mupdf.fz_bound_display_list(list); - //disposables.Add(rect); + FzRect rect = mupdf.mupdf.fz_bound_display_list(list); + //disposables.Add(rect); - FzMatrix matrix = new FzMatrix(ctm.A, ctm.B, ctm.C, ctm.D, ctm.E, ctm.F); - //disposables.Add(matrix); + FzMatrix matrix = new FzMatrix(ctm.A, ctm.B, ctm.C, ctm.D, ctm.E, ctm.F); + //disposables.Add(matrix); - FzRect rclip = clip == null ? new FzRect(FzRect.Fixed.Fixed_INFINITE) : clip.ToFzRect(); - //disposables.Add(rclip); - rect = FzRect.fz_intersect_rect(rect, rclip); + FzRect rclip = clip == null ? 
new FzRect(FzRect.Fixed.Fixed_INFINITE) : clip.ToFzRect(); + //disposables.Add(rclip); + rect = FzRect.fz_intersect_rect(rect, rclip); - rect = rect.fz_transform_rect(matrix); - FzIrect irect = rect.fz_round_rect(); - //disposables.Add(irect); + rect = rect.fz_transform_rect(matrix); + FzIrect irect = rect.fz_round_rect(); + //disposables.Add(irect); - FzPixmap pix = mupdf.mupdf.fz_new_pixmap_with_bbox(cs, irect, seps, alpha); - if (alpha != 0) - pix.fz_clear_pixmap(); - else - pix.fz_clear_pixmap_with_value(0xFF); + FzPixmap pix = mupdf.mupdf.fz_new_pixmap_with_bbox(cs, irect, seps, alpha); + if (alpha != 0) + pix.fz_clear_pixmap(); + else + pix.fz_clear_pixmap_with_value(0xFF); - FzDevice dev; - if (rclip.fz_is_infinite_rect() == 0) - { - dev = mupdf.mupdf.fz_new_draw_device_with_bbox(matrix, pix, irect); - list.fz_run_display_list(dev, new FzMatrix(), rclip, new FzCookie()); - } - else - { - dev = mupdf.mupdf.fz_new_draw_device(matrix, pix); - list.fz_run_display_list( - dev, - new FzMatrix(), - new FzRect(FzRect.Fixed.Fixed_INFINITE), - new FzCookie() - ); - } - disposables.Add(dev); + FzDevice dev; + if (rclip.fz_is_infinite_rect() == 0) + { + dev = mupdf.mupdf.fz_new_draw_device_with_bbox(matrix, pix, irect); + list.fz_run_display_list(dev, new FzMatrix(), rclip, new FzCookie()); + } + else + { + dev = mupdf.mupdf.fz_new_draw_device(matrix, pix); + list.fz_run_display_list( + dev, + new FzMatrix(), + new FzRect(FzRect.Fixed.Fixed_INFINITE), + new FzCookie() + ); + } + disposables.Add(dev); - dev.fz_close_device(); + dev.fz_close_device(); - return new Pixmap("raw", new Pixmap(pix)); + return new Pixmap(pix); + } } finally { @@ -8048,27 +8057,28 @@ private static void LoadEmbeddedLeptonicaDll() public static void InitApp() { - if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) + lock (MuPDFLock) { if (Utils.IsInitialized) return; - Utils.SetDotCultureForNumber(); - //Utils.LoadEmbeddedLeptonicaDll(); - } - else if 
(RuntimeInformation.IsOSPlatform(OSPlatform.Linux)) - { - if (Utils.IsInitialized) + if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) + { + Utils.SetDotCultureForNumber(); + //Utils.LoadEmbeddedLeptonicaDll(); + } + else if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux)) + { + Utils.SetDotCultureForNumber(); + //Utils.LoadEmbeddedLeptonicaDll(); + } + else + { return; + } - Utils.SetDotCultureForNumber(); - //Utils.LoadEmbeddedLeptonicaDll(); - } - else - { - return; + Utils.IsInitialized = true; } - Utils.IsInitialized = true; } /// diff --git a/PDF4LLM/CHANGELOG.md b/PDF4LLM/CHANGELOG.md index 3d18382..016f8c8 100644 --- a/PDF4LLM/CHANGELOG.md +++ b/PDF4LLM/CHANGELOG.md @@ -2,6 +2,9 @@ All notable changes for `PDF4LLM` are documented in this file. +## [1.27.2.8] +- Improved Tesseract OCR stability by auto-adjusting OCR DPI to keep page pixmap memory under `maxOcrPixmapBytes`. + ## [1.27.2.4] - Fixed `PDFMarkdownReader` to keep page `extraInfo` isolated per page. diff --git a/PDF4LLM/PDF4LLM.csproj b/PDF4LLM/PDF4LLM.csproj index 7c36979..96dfb4f 100644 --- a/PDF4LLM/PDF4LLM.csproj +++ b/PDF4LLM/PDF4LLM.csproj @@ -1,4 +1,4 @@ - + PDF4LLM @@ -7,7 +7,7 @@ True $(Platform) . - 1.27.2.4 + 1.27.2.9 $(MSBuildProjectDirectory)\PDF4LLM.nuspec Configuration=$(Configuration);version=$(Version);PlatformFolder=$(PlatformFolder) diff --git a/PDF4LLM/PDF4LLM.nuspec b/PDF4LLM/PDF4LLM.nuspec index e979caf..55d88f7 100644 --- a/PDF4LLM/PDF4LLM.nuspec +++ b/PDF4LLM/PDF4LLM.nuspec @@ -16,28 +16,28 @@ - + - + - + - + - + - + - + - + diff --git a/PDF4LLM/PdfExtractor.cs b/PDF4LLM/PdfExtractor.cs index 19c6ccc..d137bf4 100644 --- a/PDF4LLM/PdfExtractor.cs +++ b/PDF4LLM/PdfExtractor.cs @@ -68,7 +68,7 @@ public static string ToMarkdown( float? 
pageHeight = null, bool ignoreCode = false, bool showProgress = false, - bool useOcr = true, + bool useOcr = false, string ocrLanguage = "eng", bool forceOcr = false, OcrPageFunction ocrFunction = null) @@ -158,7 +158,7 @@ public static string ToMarkdown( float? pageHeight = null, bool ignoreCode = false, bool showProgress = false, - bool useOcr = true, + bool useOcr = false, string ocrLanguage = "eng", bool forceOcr = false, OcrPageFunction ocrFunction = null) @@ -203,7 +203,7 @@ public static string ToJson( bool embedImages = false, bool showProgress = false, bool forceText = true, - bool useOcr = true, + bool useOcr = false, string ocrLanguage = "eng", bool forceOcr = false, OcrPageFunction ocrFunction = null) @@ -246,7 +246,7 @@ public static string ToJson( bool embedImages = false, bool showProgress = false, bool forceText = true, - bool useOcr = true, + bool useOcr = false, string ocrLanguage = "eng", bool forceOcr = false, OcrPageFunction ocrFunction = null) @@ -282,7 +282,7 @@ public static string ToText( bool showProgress = false, bool forceText = true, int ocrDpi = 300, - bool useOcr = true, + bool useOcr = false, string ocrLanguage = "eng", string tableFormat = "grid", bool pageChunks = false, @@ -333,7 +333,7 @@ public static string ToText( bool showProgress = false, bool forceText = true, int ocrDpi = 300, - bool useOcr = true, + bool useOcr = false, string ocrLanguage = "eng", string tableFormat = "grid", bool pageChunks = false, @@ -378,7 +378,7 @@ public static ParsedDocument ParseDocument( bool embedImages = false, bool showProgress = false, bool forceText = true, - bool useOcr = true, + bool useOcr = false, string ocrLanguage = "eng", bool forceOcr = false, bool keepOcrText = false, diff --git a/PDF4LLM/VersionInfo.cs b/PDF4LLM/VersionInfo.cs index 208030a..d76f195 100644 --- a/PDF4LLM/VersionInfo.cs +++ b/PDF4LLM/VersionInfo.cs @@ -7,6 +7,6 @@ namespace PDF4LLM public static class VersionInfo { public static readonly (int Major, int Minor, int 
Patch) MinimumMuPDFVersion = (1, 27, 0); - public const string Version = "1.27.2.4"; + public const string Version = "1.27.2.9"; } } diff --git a/PDF4LLM/helpers/DocumentLayout.cs b/PDF4LLM/helpers/DocumentLayout.cs index e4deea5..8f9f86a 100644 --- a/PDF4LLM/helpers/DocumentLayout.cs +++ b/PDF4LLM/helpers/DocumentLayout.cs @@ -11,13 +11,13 @@ namespace PDF4LLM.Helpers { /// /// Optional hook for per-page OCR (ocr_function-style signature: page, dpi, language, keep OCR text). - /// When supplied and OCR runs, the callback is invoked and text is re-extracted from the page. + /// When supplied and OCR runs, the callback returns an OCR-generated . /// /// Page to OCR. /// Resolution hint (same role as ocr_dpi in the reference API). /// Tesseract / engine language code. /// When true, preserve existing OCR text (keep_ocr_text flag). - public delegate void OcrPageFunction(Page page, int ocrDpi, string ocrLanguage, bool keepOcrText); + public delegate TextPage OcrPageFunction(Page page, int ocrDpi, string ocrLanguage, bool keepOcrText); /// /// Layout box representing a content region on a page @@ -1134,7 +1134,7 @@ public static ParsedDocument ParseDocument( bool embedImages = false, bool showProgress = false, bool forceText = false, - bool useOcr = true, + bool useOcr = false, string ocrLanguage = "eng", bool forceOcr = false, bool keepOcrText = false, @@ -1209,7 +1209,7 @@ public static ParsedDocument ParseDocument( try { page.RemoveRotation(); - + bool pageFullOcred = false; bool pageTextOcred = false; @@ -1238,17 +1238,23 @@ public static ParsedDocument ParseDocument( && pageAnalysis.TryGetValue("needs_ocr", out object n) && n is bool nb && nb) runOcr = true; - if (runOcr && ocrImpl != null) + if (runOcr) { - ocrImpl(page, ocrDpi, ocrLanguage, keepOcrText: keepOcrTextRun); textPage.Dispose(); - textPage = page.GetTextPage( - clip: new Rect(float.NegativeInfinity, float.NegativeInfinity, - float.PositiveInfinity, float.PositiveInfinity), - flags: Utils.FLAGS); - 
pageInfo = textPage.ExtractDict(null, false); - blocks = pageInfo.Blocks ?? new List(); - pageFullOcred = true; + textPage = null; + + if (ocrImpl != null) + { + //TextPage tp = page.GetTextPageOcr((int)TextFlags.TEXT_PRESERVE_SPANS, dpi: ocrDpi, language: ocrLanguage, full: true); + textPage = ocrImpl(page, ocrDpi, ocrLanguage, keepOcrText: keepOcrTextRun); + } + + if (textPage != null) + { + pageInfo = textPage.ExtractDict(null, false); + blocks = pageInfo.Blocks ?? new List(); + pageFullOcred = true; + } } List fulltext = blocks.Where(b => b.Type == 0).ToList(); diff --git a/PDF4LLM/helpers/LayoutParseHelpers.cs b/PDF4LLM/helpers/LayoutParseHelpers.cs index f5664e2..db0c119 100644 --- a/PDF4LLM/helpers/LayoutParseHelpers.cs +++ b/PDF4LLM/helpers/LayoutParseHelpers.cs @@ -94,10 +94,11 @@ public static void TryRemovePdfStructTreeRoot(Document doc) } } - /// Default OCR backend selection: full-page OCR is supplied via when configured. + /// Default OCR backend selection. public static OcrPageFunction SelectOcrFunction() { - return null; + // Tesseract is the default OCR backend for this .NET port. + return global::PDF4LLM.Ocr.TesseractApi.ExecOcr; } /// Build layout from stext blocks, then merge regions as table (approximates MuPdf.layout + table hints). 
diff --git a/PDF4LLM/helpers/Utils.cs b/PDF4LLM/helpers/Utils.cs index d65da00..8f925ca 100644 --- a/PDF4LLM/helpers/Utils.cs +++ b/PDF4LLM/helpers/Utils.cs @@ -680,8 +680,47 @@ public static (float x0, float y0, float x1, float y1) ExpandBboxByPoints( /// /// Analyze the page for OCR decision /// + private static (float variance, float edgeEnergy) GetPixmapStats(Page sourcePage, Rect clip, int dpi = 72) + { + using (Pixmap pix = sourcePage.GetPixmap(clip: clip, dpi: dpi)) + { + if (pix == null || pix.N < 1 || pix.W * pix.H == 0) + return (0f, 0f); + + byte[] samples = pix.SAMPLES; + if (samples == null || samples.Length == 0) + return (0f, 0f); + + int n = samples.Length; + double sum = 0; + for (int i = 0; i < n; i++) + sum += samples[i]; + double mean = sum / n; + + double varAcc = 0; + for (int i = 0; i < n; i++) + { + double d = samples[i] - mean; + varAcc += d * d; + } + double variance = varAcc / n; + + double edgeAcc = 0; + for (int i = 1; i < n; i++) + edgeAcc += Math.Abs(samples[i] - samples[i - 1]); + double edge = edgeAcc / n; + + return ((float)variance, (float)edge); + } + } + public static Dictionary AnalyzePage(Page page, List blocks = null) { + const float BAD_CHAR_THRESHOLD = 0.1f; + const float VEC_AREA_THRESHOLD = 0.05f; + const float IMG_VAR_THRESHOLD_HIGH = 50.0f; + const float IMG_EDGE_THRESHOLD_HIGH = 20.0f; + int charsTotal = 0; int charsBad = 0; @@ -703,6 +742,9 @@ public static Dictionary AnalyzePage(Page page, List bloc float txtArea = 0; float vecArea = 0; int ocrSpans = 0; + int vecSuspicious = 0; + float imgVarWeighted = 0; + float imgEdgeWeighted = 0; foreach (var b in blocks) { @@ -718,6 +760,9 @@ public static Dictionary AnalyzePage(Page page, List bloc { imgRect = JoinRects(new List { imgRect, bbox }); imgArea += area; + var (varScore, edgeScore) = GetPixmapStats(page, bbox, dpi: 72); + imgVarWeighted += varScore * area; + imgEdgeWeighted += edgeScore * area; } else if (b.Type == 0) // Text block { @@ -744,7 +789,7 @@ public 
static Dictionary AnalyzePage(Page page, List bloc continue; if (span.Font == "GlyphLessFont" - || ((span.CharFlags & 8U) == 0 && (span.CharFlags & 16U) == 0)) + || global::PDF4LLM.Ocr.TesseractApi.OcrText(span)) { ocrSpans++; continue; @@ -765,13 +810,12 @@ public static Dictionary AnalyzePage(Page page, List bloc else if ( true && b.Type == 3 // Vector block - // && b.Stroked // Note: Stroked and IsRect may not be available - && 2 < bbox.Width && bbox.Width <= 20 // Width limit for typical characters - && 2 < bbox.Height && bbox.Height <= 20 // Height limit for typical characters - // && !b.IsRect // Contains curves + && 3 <= bbox.Width && bbox.Width <= 20 // Width limit for typical characters + && 3 <= bbox.Height && bbox.Height <= 20 // Height limit for typical characters ) { // Potential character-like vector block + vecSuspicious += 1; vecRect = JoinRects(new List { vecRect, bbox }); vecArea += area; } @@ -779,16 +823,33 @@ public static Dictionary AnalyzePage(Page page, List bloc // The rectangle on page covered by some content Rect covered = JoinRects(new List { imgRect, txtRect, vecRect }); - float coverArea = Math.Abs(covered.Width * covered.Height); + if (BboxIsEmpty(covered)) + { + return new Dictionary + { + ["covered"] = covered, + ["img_joins"] = 0f, + ["img_area"] = 0f, + ["txt_joins"] = 0f, + ["txt_area"] = 0f, + ["vec_joins"] = 0f, + ["vec_area"] = 0f, + ["chars_total"] = 0, + ["chars_bad"] = 0, + ["ocr_spans"] = 0, + ["img_var"] = 0f, + ["img_edges"] = 0f, + ["vec_suspicious"] = 0, + ["needs_ocr"] = false, + ["reason"] = null, + }; + } - bool needsOcr = false; - if (charsTotal > 0 && (float)charsBad / charsTotal > 0.1f) - needsOcr = true; - else if (ocrSpans > 0) - needsOcr = true; + float coverArea = Math.Abs((covered.X1 - covered.X0) * (covered.Y1 - covered.Y0)); + float imgVar = (imgArea > 0 && coverArea > 0) ? (imgVarWeighted / imgArea) : 0f; + float imgEdges = (imgArea > 0 && coverArea > 0) ? 
(imgEdgeWeighted / imgArea) : 0f; - // The area-related float values are computed as fractions of the total covered area. - return new Dictionary + var analysis = new Dictionary { ["covered"] = covered, // Page area covered by content ["img_joins"] = coverArea > 0 ? Math.Abs(imgRect.Width * imgRect.Height) / coverArea : 0, // Fraction of area of the joined images @@ -800,8 +861,24 @@ public static Dictionary AnalyzePage(Page page, List bloc ["chars_total"] = charsTotal, // Count of visible characters ["chars_bad"] = charsBad, // Count of Replacement Unicode characters ["ocr_spans"] = ocrSpans, // Count: text spans with ignored text (render mode 3) - ["needs_ocr"] = needsOcr, + ["img_var"] = imgVar, + ["img_edges"] = imgEdges, + ["vec_suspicious"] = vecSuspicious, }; + + if (charsTotal > 0 && (float)charsBad / charsTotal > BAD_CHAR_THRESHOLD) + return new Dictionary(analysis) { ["needs_ocr"] = true, ["reason"] = "chars_bad" }; + + if (ocrSpans > 0) + return new Dictionary(analysis) { ["needs_ocr"] = true, ["reason"] = "ocr_spans" }; + + if (vecSuspicious > 3 && coverArea > 0 && (vecArea / coverArea) >= VEC_AREA_THRESHOLD) + return new Dictionary(analysis) { ["needs_ocr"] = true, ["reason"] = "vec_text" }; + + if (imgArea > 0 && (imgVar > IMG_VAR_THRESHOLD_HIGH || imgEdges > IMG_EDGE_THRESHOLD_HIGH)) + return new Dictionary(analysis) { ["needs_ocr"] = true, ["reason"] = "img_text" }; + + return new Dictionary(analysis) { ["needs_ocr"] = false, ["reason"] = null }; } } } diff --git a/PDF4LLM/ocr/tesseract_api.cs b/PDF4LLM/ocr/tesseract_api.cs index d418bd8..a273baa 100644 --- a/PDF4LLM/ocr/tesseract_api.cs +++ b/PDF4LLM/ocr/tesseract_api.cs @@ -28,12 +28,20 @@ public static bool OcrText(Span span) /// /// Full-page OCR callback from PDF4LLM (redaction + pdfocr_tobytes pipeline). - /// Not ported; use for span repair and OCR decisions. + /// Executes Tesseract OCR through MuPDF.NET and returns the OCR text page. 
/// - public static void ExecOcr(Page page, int dpi = 300, Pixmap pixmap = null, string language = "eng", bool keepOcrText = false) + public static TextPage ExecOcr(Page page, int dpi, string language, bool keepOcrText) { - throw new NotImplementedException( - "TesseractApi.ExecOcr (PDF4LLM.ocr.tesseract_api.exec_ocr) is not implemented for MuPDF.NET; use CheckOcr for span-level repair."); + return ExecOcr(page, dpi: dpi, pixmap: null, language: language, keepOcrText: keepOcrText); + } + + /// + /// Compatibility overload retained for existing OCR helper call sites. + /// + public static TextPage ExecOcr(Page page, int dpi = 300, Pixmap pixmap = null, string language = "eng", bool keepOcrText = false) + { + // keepOcrText is currently not configurable through GetTextPageOcr. + return page.GetTextPageOcr(flags: CheckOcr.FLAGS, dpi: dpi, language: language, full: true); } }