From dc7f2aa5f4e6f5275c589fc861f5b181eff21fb1 Mon Sep 17 00:00:00 2001 From: "Jorj X. McKie" Date: Sun, 1 Mar 2026 06:10:49 -0400 Subject: [PATCH] Rework textpage_ocr For partial OCR, we previously added text content from OCR'd images on the page. We now redact legible text and let the OCR engine recognize the remaining page content - which includes images as before but also vectors simulating text. --- docs/page.rst | 23 ++-- src/utils.py | 168 +++++++++++++++++--------- tests/resources/test_3842_partial.txt | 43 +++++++ tests/test_tesseract.py | 45 +------ 4 files changed, 173 insertions(+), 106 deletions(-) create mode 100644 tests/resources/test_3842_partial.txt diff --git a/docs/page.rst b/docs/page.rst index ff34592df..c842c3aaa 100644 --- a/docs/page.rst +++ b/docs/page.rst @@ -1515,34 +1515,37 @@ In a nutshell, this is what you can do with PyMuPDF: .. method:: get_textpage_ocr(flags=3, language="eng", dpi=72, full=False, tessdata=None) - **Optical Character Recognition** (**OCR**) technology can be used to extract text data for documents where text is in a raster image format throughout the page. Use this method to **OCR** a page for text extraction. + **Optical Character Recognition** (**OCR**) technology can be used to extract text data for pages where text is in raster image or vector graphic format. Use this method to **OCR** a page for subsequent text extraction. - This method returns a :ref:`TextPage` for the page that includes OCRed text. MuPDF will invoke Tesseract-OCR if this method is used. Otherwise this is a normal :ref:`TextPage` object. + This method returns a :ref:`TextPage` for the page that includes OCRed text. MuPDF will invoke Tesseract-OCR if this method is used. :arg int flags: indicator bits controlling the content available for subsequent test extractions and searches -- see the parameter of :meth:`Page.get_text`. :arg str language: the expected language(s). 
Use "+"-separated values if multiple languages are expected, "eng+spa" for English and Spanish. :arg int dpi: the desired resolution in dots per inch. Influences recognition quality (and execution time). - :arg bool full: whether to OCR the full page, or just the displayed images. - :arg str tessdata: The name of Tesseract's language support folder `tessdata`. If omitted, this information must be present as environment variable `TESSDATA_PREFIX`. Can be determined by function :meth:`get_tessdata`. + :arg bool full: whether to OCR the full page, or only page areas that contain no legible text. + :arg str tessdata: The name of Tesseract's language support folder `tessdata`. If omitted, the name is determined using function :meth:`get_tessdata`. - .. note:: This method does **not** support a clip parameter -- OCR will always happen for the complete page rectangle. + .. note:: This method does **not** support a clip parameter -- OCR (full or partial) will always happen for the complete page rectangle. :returns: a :ref:`TextPage`. Execution may be significantly longer than :meth:`Page.get_textpage`. - For a full page OCR, **all text** will have the font "GlyphlessFont" from Tesseract. In case of partial OCR, normal text will keep its properties, and only text coming from images will have the GlyphlessFont. + For ``full=True`` OCR, **all text** will have the font "GlyphLessFont" from Tesseract. In case of partial OCR (``full=False``), legible normal text will keep its properties, and only recognized text will have the GlyphLessFont. - .. note:: - - **OCRed text is only available** to PyMuPDF's text extractions and searches if their `textpage` parameter specifies the output of this method. + Recognized / OCR text will follow (legible) normal text for partial OCR and will thus not be in reading order. Establishing reading order is -- as always -- your responsibility. + + .. note:: + + Text extraction results, including any OCR, are stored in the returned :ref:`TextPage`. 
To access them, you must use the ``textpage`` parameter in all subsequent text extraction and search methods. - `This Jupyter notebook `_ walks through an example for using OCR textpages. + `This Jupyter notebook `_ walks through an example for using OCR textpages. |history_begin| * New in v.1.19.0 * Changed in v1.19.1: support full and partial OCRing a page. + * Changed in v1.27.2: For partial OCR, **all** page areas outside legible text are now OCRed, not just those within images. This means that OCR will now also be performed for vector graphics, and for text containing illegible characters. |history_end| diff --git a/src/utils.py b/src/utils.py index 8e295989a..7d3271ea8 100644 --- a/src/utils.py +++ b/src/utils.py @@ -14,10 +14,6 @@ from . import pymupdf except Exception: import pymupdf -try: - from . import mupdf -except Exception: - import mupdf _format_g = pymupdf.format_g @@ -322,80 +318,142 @@ def get_textpage_ocr( full: bool = False, tessdata: str = None, ) -> pymupdf.TextPage: - """Create a Textpage from combined results of normal and OCR text parsing. + """Create a Textpage from the OCR version of the page. + + OCR can be executed for the full page image, or (the default) only + for areas that are not covered by readable digital text. Args: flags: (int) control content becoming part of the result. language: (str) specify expected language(s). Default is "eng" (English). dpi: (int) resolution in dpi, default 72. - full: (bool) whether to OCR the full page image, or only its images (default) + full: (bool) whether to OCR the full page, or to keep legible text + tessdata: (str) path to Tesseract language data files. If None, the + built-in function is used to find the path. 
""" pymupdf.CheckParent(page) tessdata = pymupdf.get_tessdata(tessdata) + # Ensure 0xFFFD is not suppressed + flags = ( + flags + & ~pymupdf.TEXT_USE_CID_FOR_UNKNOWN_UNICODE # pylint: disable=no-member + & ~pymupdf.TEXT_USE_GID_FOR_UNKNOWN_UNICODE # pylint: disable=no-member + ) + def full_ocr(page, dpi, language, flags): - zoom = dpi / 72 - mat = pymupdf.Matrix(zoom, zoom) - pix = page.get_pixmap(matrix=mat) + """Perform OCR for the full page image.""" + pix = page.get_pixmap(dpi=dpi) + # create a 1-page PDF with an OCR text layer. ocr_pdf = pymupdf.Document( - "pdf", - pix.pdfocr_tobytes( - compress=False, - language=language, - tessdata=tessdata, - ), - ) + stream=pix.pdfocr_tobytes( + compress=False, + language=language, + tessdata=tessdata, + ), + ) ocr_page = ocr_pdf.load_page(0) unzoom = page.rect.width / ocr_page.rect.width ctm = pymupdf.Matrix(unzoom, unzoom) * page.derotation_matrix tpage = ocr_page.get_textpage(flags=flags, matrix=ctm) - ocr_pdf.close() - pix = None + + # associate the textpage with the original page tpage.parent = weakref.proxy(page) return tpage + def partial_ocr(page, dpi, language, flags): + """Perform OCR for parts of the page without legible text. + + We create a temporary PDF for which we can freely redact text. + """ + doc = page.parent + + # make temporary PDF with the passed-in page + temp_pdf = pymupdf.open() + temp_pdf.insert_pdf(doc, from_page=page.number, to_page=page.number) + temp_page = temp_pdf.load_page(0) + temp_page.remove_rotation() # avoid OCR problems with rotated pages + + # extract text bboxes from the page + tp = temp_page.get_textpage(flags=flags) + blocks = tp.extractDICT()["blocks"] + + """ + For partial OCR we need a TextPage that contains legible text only. + Illegible text must be passed to the OCR engine. + """ + # Select spans with illegible text. If present, remove them first. 
+ fffd_spans = [ + s["bbox"] + for b in blocks + if b["type"] == 0 + for l in b["lines"] + for s in l["spans"] + if chr(0xFFFD) in s["text"] + ] + if fffd_spans: + for bbox in fffd_spans: + temp_page.add_redact_annot(bbox) + temp_page.apply_redactions( + images=pymupdf.PDF_REDACT_IMAGE_NONE, # pylint: disable=no-member + graphics=pymupdf.PDF_REDACT_LINE_ART_NONE, # pylint: disable=no-member + text=pymupdf.PDF_REDACT_TEXT_REMOVE, # pylint: disable=no-member + ) + # Extract text again, now without the unreadable spans. + tp = temp_page.get_textpage(flags=flags) + blocks = tp.extractDICT()["blocks"] + # We also need a fresh copy of the original page. + temp_pdf.insert_pdf(doc, from_page=page.number, to_page=page.number) + temp_page = temp_pdf.load_page(-1) + temp_page.remove_rotation() # avoid OCR problems with rotated pages + + span_bboxes = [ + s["bbox"] + for b in blocks + if b["type"] == 0 + for l in b["lines"] + for s in l["spans"] + if not chr(0xFFFD) in s["text"] + ] + + # Remove digital text by redacting the span bboxes. + # Then OCR the remainder of the page. + for bbox in span_bboxes: + temp_page.add_redact_annot(bbox) + + # only remove text, no images, no vectors + temp_page.apply_redactions( + images=pymupdf.PDF_REDACT_IMAGE_NONE, # pylint: disable=no-member + graphics=pymupdf.PDF_REDACT_LINE_ART_NONE, # pylint: disable=no-member + text=pymupdf.PDF_REDACT_TEXT_REMOVE, # pylint: disable=no-member + ) + pix = temp_page.get_pixmap(dpi=dpi) + # matrix = pymupdf.Rect(pix.irect).torect(page.rect) + + # OCR the redacted page + ocr_pdf = pymupdf.open( + stream=pix.pdfocr_tobytes( + compress=False, + language=language, + tessdata=tessdata, + ), + ) + ocr_page = ocr_pdf[0] + + # Extend the original textpage with OCR-ed text. 
+ ocr_page.extend_textpage(tp, flags=pymupdf.TEXT_ACCURATE_BBOXES) + + # associate the textpage with the original page + tp.parent = weakref.proxy(page) + return tp + # if OCR for the full page, OCR its pixmap @ desired dpi if full: return full_ocr(page, dpi, language, flags) # For partial OCR, make a normal textpage, then extend it with text that - # is OCRed from each image. - # Because of this, we need the images flag bit set ON. - tpage = page.get_textpage(flags=flags) - for block in page.get_text("dict", flags=pymupdf.TEXT_PRESERVE_IMAGES)["blocks"]: - if block["type"] != 1: # only look at images - continue - bbox = pymupdf.Rect(block["bbox"]) - if bbox.width <= 3 or bbox.height <= 3: # ignore tiny stuff - continue - try: - pix = pymupdf.Pixmap(block["image"]) # get image pixmap - if pix.n - pix.alpha != 3: # we need to convert this to RGB! - pix = pymupdf.Pixmap(pymupdf.csRGB, pix) - if pix.alpha: # must remove alpha channel - pix = pymupdf.Pixmap(pix, 0) - imgdoc = pymupdf.Document( - "pdf", - pix.pdfocr_tobytes(language=language, tessdata=tessdata), - ) # pdf with OCRed page - imgpage = imgdoc.load_page(0) # read image as a page - pix = None - # compute matrix to transform coordinates back to that of 'page' - imgrect = imgpage.rect # page size of image PDF - shrink = pymupdf.Matrix(1 / imgrect.width, 1 / imgrect.height) - mat = shrink * block["transform"] - imgpage.extend_textpage(tpage, flags=0, matrix=mat) - imgdoc.close() - except (RuntimeError, mupdf.FzErrorBase): - if 0 and g_exceptions_verbose: - # Don't show exception info here because it can happen in - # normal operation (see test_3842b). - pymupdf.exception_info() - tpage = None - pymupdf.message("Falling back to full page OCR") - return full_ocr(page, dpi, language, flags) - - return tpage + # is OCRed from the rest of the page. 
+ return partial_ocr(page, dpi, language, flags) def get_text( diff --git a/tests/resources/test_3842_partial.txt b/tests/resources/test_3842_partial.txt new file mode 100644 index 000000000..6b97dd0cb --- /dev/null +++ b/tests/resources/test_3842_partial.txt @@ -0,0 +1,43 @@ +NIST SP 800-223 + +High-Performance Computing Security +February 2024 + + +iii +Table of Contents +1. Introduction ...................................................................................................................................1 +2. HPC System Reference Architecture and Main Components ............................................................2 +2.1.1. Components of the High-Performance Computing Zone ............................................................. 3 +2.1.2. Components of the Data Storage Zone ........................................................................................ 4 +2.1.3. Parallel File System ....................................................................................................................... 4 +2.1.4. Archival and Campaign Storage .................................................................................................... 5 +2.1.5. Burst Buffer .................................................................................................................................. 5 +2.1.6. Components of the Access Zone .................................................................................................. 6 +2.1.7. Components of the Management Zone ....................................................................................... 6 +2.1.8. General Architecture and Characteristics .................................................................................... 6 +2.1.9. Basic Services ................................................................................................................................ 7 +2.1.10. 
Configuration Management ....................................................................................................... 7 +2.1.11. HPC Scheduler and Workflow Management .............................................................................. 7 +2.1.12. HPC Software .............................................................................................................................. 8 +2.1.13. User Software ............................................................................................................................. 8 +2.1.14. Site-Provided Software and Vendor Software ........................................................................... 8 +2.1.15. Containerized Software in HPC .................................................................................................. 9 +3. HPC Threat Analysis...................................................................................................................... 10 +3.2.1. Access Zone Threats ................................................................................................................... 11 +3.2.2. Management Zone Threats ........................................................................................................ 11 +3.2.3. High-Performance Computing Zone Threats .............................................................................. 12 +3.2.4. Data Storage Zone Threats ......................................................................................................... 12 +4. HPC Security Posture, Challenges, and Recommendations ............................................................. 14 +5. Conclusions .................................................................................................................................. 19 +2.1. Main COMPONENNS..........cccccssccccssssccccssssccccssnsecccsssseeccessseeecsessseecssaseecsessseeceessseecseeaseecsessseeesessseeessstseeesD +3.1. 
Key HPC Security Characteristics and Use REquireMent............cccsscccessscesseceessecesseeesssecesseeestteessteee LO +3.2. Threats to HPC FUNCTION ZONES..........cesccesscesscesscesscesecsssesssecssscesscesscsseessessescesssesscessessssssssssssssssssees LO +3.3. Other Threats ........cccccsccsscssscsssccssscssscssscssscsssesssesssscssscssessseesseeseessesscsssssssessessesssessssssssssssssssssssseesesLO +4.1. HPC Access Control via Network SEgMeNtatiOn ..........:ccccscccsssccessecesssecesecesssecessecessecessteessecessteessee LO +4.2. Compute Node Sanitization ...........cccccessccssssccessecessecesseecssseccseecsseecsseecesseesessscssssesssescssssessssesssessses +LD +4.3. Data Integrity Protection ............cccccccccccccsssssssscececccesssssssseceeccessscssssseeeccesesssssssseeesessssssstsssesesssssssesLOD +4.4. SECUFING CONTAINELSS ........ccccssccccssssccccessseeccesseeccsssssecceesssecceessseccesssseecsessseeccsssssescsssssescssssssscssssesesesLO +4.5. Achieving Security While Maintaining HPC Performance. ..........cc:cccssccsssseesssecessecesssecessecessseesseeesee LZ +4.6. 
Challenges to HPC Security TOols...........c:ccccssccssseccsssecesseecesecessseccsseecssseecsseecseseecssesesstscssssesssessssessse LZ diff --git a/tests/test_tesseract.py b/tests/test_tesseract.py index 11c383526..4babde463 100644 --- a/tests/test_tesseract.py +++ b/tests/test_tesseract.py @@ -1,7 +1,7 @@ import os import platform import textwrap - +import pathlib import pymupdf def test_tesseract(): @@ -79,10 +79,12 @@ def test_3842(): return path = os.path.normpath(f'{__file__}/../../tests/resources/test_3842.pdf') + path_text = os.path.normpath(f'{__file__}/../../tests/resources/test_3842_partial.txt') + text_expected = pathlib.Path(path_text).read_text() with pymupdf.open(path) as document: page = document[6] try: - partial_tp = page.get_textpage_ocr(flags=0, full=False) + partial_tp = page.get_textpage_ocr(flags=0, full=False, dpi=300) except Exception as e: print(f'test_3842(): received exception: {e}', flush=1) if 'No tessdata specified and Tesseract is not installed' in str(e): @@ -96,44 +98,5 @@ def test_3842(): print() print(text) print(f'text:\n{text!r}') - - # 2024-11-29: This is the current incorrect output. We use - # underscores for lines containing entirely whitespace (which - # textwrap.dedent() unfortunately replaces with empty lines). - text_expected = textwrap.dedent(''' - NIST SP 800-223 - _ - High-Performance Computing Security - February 2024 - _ - __ - iii - Table of Contents - 1. Introduction ...................................................................................................................................1 - 2. HPC System Reference Architecture and Main Components ............................................................2 - 2.1.1. Components of the High-Performance Computing Zone ............................................................. 3 - 2.1.2. Components of the Data Storage Zone ........................................................................................ 4 - 2.1.3. 
Parallel File System ....................................................................................................................... 4 - 2.1.4. Archival and Campaign Storage .................................................................................................... 5 - 2.1.5. Burst Buffer .................................................................................................................................. 5 - 2.1.6. Components of the Access Zone .................................................................................................. 6 - 2.1.7. Components of the Management Zone ....................................................................................... 6 - 2.1.8. General Architecture and Characteristics .................................................................................... 6 - 2.1.9. Basic Services ................................................................................................................................ 7 - 2.1.10. Configuration Management ....................................................................................................... 7 - 2.1.11. HPC Scheduler and Workflow Management .............................................................................. 7 - 2.1.12. HPC Software .............................................................................................................................. 8 - 2.1.13. User Software ............................................................................................................................. 8 - 2.1.14. Site-Provided Software and Vendor Software ........................................................................... 8 - 2.1.15. Containerized Software in HPC .................................................................................................. 9 - 3. HPC Threat Analysis...................................................................................................................... 
10 - 3.2.1. Access Zone Threats ................................................................................................................... 11 - 3.2.2. Management Zone Threats ........................................................................................................ 11 - 3.2.3. High-Performance Computing Zone Threats .............................................................................. 12 - 3.2.4. Data Storage Zone Threats ......................................................................................................... 12 - 4. HPC Security Posture, Challenges, and Recommendations ............................................................. 14 - 5. Conclusions .................................................................................................................................. 19 - ''', - )[1:].replace('_', ' ') print(f'text_expected:\n{text_expected!r}') assert text == text_expected