diff --git a/examples/requirements.txt b/examples/requirements.txt
index 931b69f..0d57b0b 100644
--- a/examples/requirements.txt
+++ b/examples/requirements.txt
@@ -1,4 +1,5 @@
 aspose-pdf
 lxml
 pydicom
-pandas
\ No newline at end of file
+pandas
+pytesseract
\ No newline at end of file
diff --git a/examples/working_with_documents/example_create_pdf_document.py b/examples/working_with_documents/example_create_pdf_document.py
index a00b082..8a8316e 100644
--- a/examples/working_with_documents/example_create_pdf_document.py
+++ b/examples/working_with_documents/example_create_pdf_document.py
@@ -1,7 +1,10 @@
+import io
 import sys
 from os import path
+from pathlib import Path
 
 import aspose.pdf as ap
+import pytesseract
 
 sys.path.append(path.join(path.dirname(__file__), ".."))
 
@@ -16,6 +19,44 @@ def create_new_document(input_pdf, output_pdf):
     document.save(output_pdf)
 
 
+def create_searchable_document(infile, outfile, image_file_path, page_number=1):
+    """
+    Create a searchable PDF from one page of a PDF document using OCR.
+
+    The page is rendered to a temporary PNG image at resolution 300, Tesseract
+    (through pytesseract) turns that image into a PDF with a text layer, and
+    the result is saved to ``outfile``.
+
+    Args:
+        infile (str): Path of the input PDF file
+        outfile (str): Path of the searchable PDF file to write
+        image_file_path (str): Path of the temporary PNG file; it must not exist
+            yet and it is removed before the function returns
+        page_number (int): Number of the page to recognize (Aspose.PDF counts
+            pages from 1)
+
+    Returns:
+        None
+
+    Raises:
+        FileExistsError: If ``image_file_path`` already exists
+    """
+    # Mode 'x' refuses to overwrite an existing file, so the cleanup below can
+    # only ever delete the image created by this call.
+    image_stream = io.FileIO(image_file_path, 'x')
+    try:
+        # Close the stream right after rendering so the PNG is complete on disk
+        # before Tesseract opens it by path.
+        with image_stream:
+            document = ap.Document(infile)
+            png_device = ap.devices.PngDevice(ap.devices.Resolution(300))
+            png_device.process(document.pages[page_number], image_stream)
+        pdf = pytesseract.image_to_pdf_or_hocr(image_file_path, extension='pdf')
+        ap.Document(io.BytesIO(pdf)).save(outfile)
+    finally:
+        Path(image_file_path).unlink(missing_ok=True)
+
+
 def run_all_examples(data_dir=None, license_path=None):
     """Run PDF creation examples and report status."""
     set_license(license_path)
@@ -23,13 +64,18 @@ def run_all_examples(data_dir=None, license_path=None):
 
     examples = [
         ("Create new document", create_new_document),
+        ("Create a Searchable PDF document", create_searchable_document),
     ]
 
     for name, func in examples:
         try:
             input_file_name = path.join(input_dir, f"{func.__name__}.pdf")
             output_file_name = path.join(output_dir, f"{func.__name__}.pdf")
-            func(input_file_name, output_file_name)
+            if func is create_searchable_document:
+                image_path = path.join(output_dir, "create_searchable_document.png")
+                func(input_file_name, output_file_name, image_path)
+            else:
+                func(input_file_name, output_file_name)
             print(f"✅ Success: {name}")
         except Exception as e:
             print(f"❌ Failed: {name} - {str(e)}")
diff --git a/sample_data/working_with_documents/input/create_searchable_document.pdf b/sample_data/working_with_documents/input/create_searchable_document.pdf
new file mode 100644
index 0000000..a3daa09
Binary files /dev/null and b/sample_data/working_with_documents/input/create_searchable_document.pdf differ