diff --git a/.gitignore b/.gitignore index 6990476e..f659ceeb 100644 --- a/.gitignore +++ b/.gitignore @@ -395,5 +395,8 @@ FodyWeavers.xsd *.msm *.msp +# Local clone of https://github.com/pymupdf/pymupdf4llm for porting / diff (optional) +pymupdf4llm/ + # JetBrains Rider *.sln.iml diff --git a/Demo/Demo.csproj b/Demo/Demo.csproj index c7f96e77..1590b319 100644 --- a/Demo/Demo.csproj +++ b/Demo/Demo.csproj @@ -1,4 +1,5 @@ + Exe diff --git a/Demo/GlobalUsings.cs b/Demo/GlobalUsings.cs new file mode 100644 index 00000000..9541be44 --- /dev/null +++ b/Demo/GlobalUsings.cs @@ -0,0 +1,19 @@ +global using System; +global using System.Collections.Generic; +global using System.IO; +global using System.Linq; +global using System.Text; +global using System.Threading; +global using mupdf; +global using MuPDF.NET; +global using MuPDF.NET4LLM; +global using MuPDF.NET4LLM.Helpers; +global using MuPDF.NET4LLM.Llama; +global using SkiaSharp; +global using Box = MuPDF.NET.Box; +global using Encoding = System.Text.Encoding; +global using File = System.IO.File; +global using Font = MuPDF.NET.Font; +global using Morph = MuPDF.NET.Morph; +global using TextWriter = MuPDF.NET.TextWriter; +global using Utils = MuPDF.NET.Utils; diff --git a/Demo/Program.cs b/Demo/Program.cs index 7efcbd74..e72dce8e 100644 --- a/Demo/Program.cs +++ b/Demo/Program.cs @@ -1,1738 +1,13 @@ -using mupdf; -using MuPDF.NET; -using MuPDF.NET4LLM; -using MuPDF.NET4LLM.Helpers; -using MuPDF.NET4LLM.Llama; -using SkiaSharp; -using System; -using System.Collections.Generic; -using System.Diagnostics; -using System.Globalization; -using System.IO; -using System.Linq; -using System.Text; -using System.Threading; -using static ICSharpCode.SharpZipLib.Zip.ExtendedUnixData; -using static System.Net.Mime.MediaTypeNames; -using static System.Net.WebRequestMethods; -using Box = MuPDF.NET.Box; -using Encoding = System.Text.Encoding; -using File = System.IO.File; -using Font = MuPDF.NET.Font; -using Morph = MuPDF.NET.Morph; 
-using TextWriter = MuPDF.NET.TextWriter; -using Utils = MuPDF.NET.Utils; - namespace Demo { - public static class Units + /// + /// GitHub samples entry point. With no arguments, all samples run; see . + /// + internal partial class Program { - // Constants - public const float InchesPerMm = 1.0f / 25.4f; - public const float PointsPerInch = 72.0f; - - // --- mm <-> points (PostScript points: 1 pt = 1/72 in) --- - public static float MmToPoints(float mm) => mm * InchesPerMm * PointsPerInch; // = mm * 72 / 25.4 - public static float PointsToMm(float points) => points / PointsPerInch / InchesPerMm; // = points * 25.4 / 72 - - // --- mm <-> pixels (requires device DPI) --- - public static float MmToPixels(float mm, float dpi) => mm * InchesPerMm * dpi; - public static float PixelsToMm(float px, float dpi) => px / dpi / InchesPerMm; - } - class Program - { - static void Main(string[] args) - { - TestInsertHtmlbox(); - TestLineAnnot(); - AnnotationsFreeText1.Run(args); - AnnotationsFreeText2.Run(args); - NewAnnots.Run(args); - TestHelloWorldToNewDocument(args); - TestHelloWorldToExistingDocument(args); - TestReadBarcode(args); - TestReadDataMatrix(); - TestWriteBarcode(args); - TestExtractTextWithLayout(args); - TestWidget(args); - TestColor(args); - TestCMYKRecolor(args); - TestSVGRecolor(args); - TestReplaceImage(args); - TestInsertImage(args); - TestGetImageInfo(args); - TestGetTextPageOcr(args); - TestCreateImagePage(args); - TestJoinPdfPages(args); - TestFreeTextAnnot(args); - TestTextFont(args); - TestMemoryLeak(); - TestDrawLine(); - TestWriteBarcode1(); - TestUnicodeDocument(); - TestMorph(); - TestMetadata(); - TestMoveFile(); - TestImageFilter(); - TestImageFilterOcr(); - CreateAnnotDocument(); - TestDrawShape(); - TestIssue213(); - TestIssue1880(); - TestLLM(); - TestPyMuPdfRagToMarkdown(); - TestTable(); - TestGetText(); - TestMarkdownReader(); - TestRecompressJBIG2(); - - return; - } - - static void TestRecompressJBIG2() - { - Console.WriteLine("\n=== 
TestJBIG2 ======================="); - - string testFilePath = Path.GetFullPath("../../../TestDocuments/Jbig2.pdf"); - - Document doc = new Document(testFilePath); - - PdfImageRewriterOptions opts = new PdfImageRewriterOptions(); - - opts.bitonal_image_recompress_method = mupdf.mupdf.FZ_RECOMPRESS_FAX; - opts.recompress_when = mupdf.mupdf.FZ_RECOMPRESS_WHEN_ALWAYS; - - doc.RewriteImage(options: opts); - - doc.Save(@"e:\TestRecompressJBIG2.pdf"); - doc.Close(); - } - - static void TestMarkdownReader() - { - Console.WriteLine("\n=== TestMarkdownReader ======================="); - - var reader = new PDFMarkdownReader(); - string testFilePath = Path.GetFullPath("../../../TestDocuments/columns.pdf"); - - var docs = reader.LoadData(testFilePath); - - foreach (var doc in docs) - { - Console.WriteLine(doc.Text); - } - } - - static void TestGetText() - { - Console.WriteLine("\n=== TestGetText ======================="); - - var reader = new PDFMarkdownReader(); - string testFilePath = Path.GetFullPath("../../../TestDocuments/columns.pdf"); - - Document doc = new Document(testFilePath); - - for (int i = 0; i < doc.PageCount; i++) - { - Page page = doc[i]; - - var text = Utils.GetText(page, option: "dict"); - - Console.WriteLine(text); - - page.Dispose(); - } - - doc.Close(); - } - - static void TestTable() - { - Console.WriteLine("\n=== TestTable ======================="); - - try - { - string testFilePath = Path.GetFullPath("../../../TestDocuments/err_table.pdf"); - - if (!File.Exists(testFilePath)) - { - Console.WriteLine($"Error: Test file not found: {testFilePath}"); - return; - } - - Console.WriteLine($"Loading PDF: {testFilePath}"); - Document doc = new Document(testFilePath); - Console.WriteLine($"Document loaded: {doc.PageCount} page(s)"); - - // Test on first page - Page page = doc[0]; - Console.WriteLine($"\nPage 0 - Rect: {page.Rect}"); - - // Test 1: Get tables with default strategy - Console.WriteLine("\n--- Test 1: Get tables with 'lines_strict' strategy ---"); 
- List tables = Utils.GetTables( - page, - clip: page.Rect, - vertical_strategy: "lines_strict", - horizontal_strategy: "lines_strict"); - - Console.WriteLine($"Found {tables.Count} table(s) on page 0"); - - if (tables.Count > 0) - { - for (int i = 0; i < tables.Count; i++) - { - Table table = tables[i]; - Console.WriteLine($"\n Table {i + 1}:"); - Console.WriteLine($" Rows: {table.row_count}"); - Console.WriteLine($" Columns: {table.col_count}"); - if (table.bbox != null) - { - Console.WriteLine($" BBox: ({table.bbox.X0:F2}, {table.bbox.Y0:F2}, {table.bbox.X1:F2}, {table.bbox.Y1:F2})"); - } - - // Display header information - if (table.header != null) - { - Console.WriteLine($" Header:"); - Console.WriteLine($" External: {table.header.external}"); - if (table.header.names != null && table.header.names.Count > 0) - { - Console.WriteLine($" Column names: {string.Join(", ", table.header.names)}"); - } - } - - // Extract table data - Console.WriteLine($"\n Extracting table data..."); - List> tableData = table.Extract(); - if (tableData != null && tableData.Count > 0) - { - Console.WriteLine($" Extracted {tableData.Count} row(s) of data"); - // Show first few rows as preview - int previewRows = Math.Min(3, tableData.Count); - for (int row = 0; row < previewRows; row++) - { - var rowData = tableData[row]; - if (rowData != null) - { - Console.WriteLine($" Row {row + 1}: {string.Join(" | ", rowData.Take(5))}"); // Show first 5 columns - } - } - if (tableData.Count > previewRows) - { - Console.WriteLine($" ... 
and {tableData.Count - previewRows} more row(s)"); - } - } - - // Convert to markdown - Console.WriteLine($"\n Converting to Markdown..."); - try - { - string markdown = table.ToMarkdown(clean: false, fillEmpty: true); - if (!string.IsNullOrEmpty(markdown)) - { - Console.WriteLine($" Markdown length: {markdown.Length} characters"); - // Save markdown to file - string markdownFile = $"table_{i + 1}_page0.md"; - File.WriteAllText(markdownFile, markdown, Encoding.UTF8); - Console.WriteLine($" Markdown saved to: {markdownFile}"); - - // Show preview - int previewLength = Math.Min(200, markdown.Length); - Console.WriteLine($" Preview (first {previewLength} chars):"); - Console.WriteLine($" {markdown.Substring(0, previewLength)}..."); - } - } - catch (Exception ex) - { - Console.WriteLine($" Error converting to markdown: {ex.Message}"); - } - } - } - else - { - Console.WriteLine("No tables found. Trying with 'lines' strategy..."); - - // Test 2: Try with 'lines' strategy (less strict) - Console.WriteLine("\n--- Test 2: Get tables with 'lines' strategy ---"); - tables = Utils.GetTables( - page, - clip: page.Rect, - vertical_strategy: "lines", - horizontal_strategy: "lines"); - - Console.WriteLine($"Found {tables.Count} table(s) with 'lines' strategy"); - } - - // Test 3: Try with 'text' strategy - Console.WriteLine("\n--- Test 3: Get tables with 'text' strategy ---"); - List
textTables = Utils.GetTables( - page, - clip: page.Rect, - vertical_strategy: "text", - horizontal_strategy: "text"); - - Console.WriteLine($"Found {textTables.Count} table(s) with 'text' strategy"); - - // Test 4: Get tables from all pages - Console.WriteLine("\n--- Test 4: Get tables from all pages ---"); - int totalTables = 0; - for (int pageNum = 0; pageNum < doc.PageCount; pageNum++) - { - Page currentPage = doc[pageNum]; - List
pageTables = Utils.GetTables( - currentPage, - clip: currentPage.Rect, - vertical_strategy: "lines_strict", - horizontal_strategy: "lines_strict"); - - if (pageTables.Count > 0) - { - Console.WriteLine($" Page {pageNum}: {pageTables.Count} table(s)"); - totalTables += pageTables.Count; - } - currentPage.Dispose(); - } - Console.WriteLine($"Total tables found across all pages: {totalTables}"); - - page.Dispose(); - doc.Close(); - - Console.WriteLine("\n=== TestTable completed successfully ==="); - } - catch (Exception ex) - { - Console.WriteLine($"Error in TestTable: {ex.Message}"); - Console.WriteLine($"Stack trace: {ex.StackTrace}"); - throw; - } - } - - static void TestPyMuPdfRagToMarkdown() - { - Console.WriteLine("\n=== TestPyMuPdfRagToMarkdown ======================="); - - try - { - // Find a test PDF file - //string testFilePath = Path.GetFullPath("../../../TestDocuments/national-capitals.pdf"); - string testFilePath = Path.GetFullPath("../../../TestDocuments/Magazine.pdf"); - - Document doc = new Document(testFilePath); - Console.WriteLine($"Document loaded: {doc.PageCount} page(s)"); - Console.WriteLine($"Document name: {doc.Name}"); - - // Test 1: Basic ToMarkdown with default settings - Console.WriteLine("\n--- Test 1: Basic ToMarkdown (default settings) ---"); - try - { - List pages = new List(); - pages.Add(0); - string markdown = MuPdfRag.ToMarkdown( - doc, - pages: pages, // All pages - hdrInfo: null, // Auto-detect headers - writeImages: false, - embedImages: false, - ignoreImages: false, - ignoreGraphics: false, - detectBgColor: true, - imagePath: "", - imageFormat: "png", - imageSizeLimit: 0.05f, - filename: testFilePath, - forceText: true, - pageChunks: false, - pageSeparators: false, - margins: null, - dpi: 150, - pageWidth: 612, - pageHeight: null, - tableStrategy: "lines_strict", - graphicsLimit: null, - fontsizeLimit: 3.0f, - ignoreCode: false, - extractWords: false, - showProgress: false, - useGlyphs: false, - ignoreAlpha: false - ); - - 
string markdownFile = "TestPyMuPdfRag_Output.md"; - File.WriteAllText(markdownFile, markdown, Encoding.UTF8); - Console.WriteLine($"Markdown output saved to: {markdownFile}"); - Console.WriteLine($"Markdown length: {markdown.Length} characters"); - if (markdown.Length > 0) - { - int previewLength = Math.Min(300, markdown.Length); - Console.WriteLine($"Preview (first {previewLength} chars):\n{markdown.Substring(0, previewLength)}..."); - } - } - catch (Exception ex) - { - Console.WriteLine($"Error in basic ToMarkdown: {ex.Message}"); - } - doc.Close(); - } - catch (Exception ex) - { - Console.WriteLine($"An unexpected error occurred during PyMuPdfRag test: {ex.Message}"); - Console.WriteLine($"Stack trace: {ex.StackTrace}"); - } - - Console.WriteLine("\n=== TestPyMuPdfRagToMarkdown Completed ======================="); - } - - static void TestLLM() - { - Console.WriteLine("\n=== TestLLM ======================="); - - try - { - // Display version information - Console.WriteLine($"MuPDF.NET4LLM Version: {MuPDF4LLM.Version}"); - var versionTuple = MuPDF4LLM.VersionTuple; - Console.WriteLine($"Version Tuple: ({versionTuple.major}, {versionTuple.minor}, {versionTuple.patch})"); - - // Test with a sample PDF file - string testFilePath = Path.GetFullPath("../../../TestDocuments/national-capitals.pdf"); - //string testFilePath = Path.GetFullPath("../../../TestDocuments/Magazine.pdf"); - - // Try to find a PDF with actual content if Blank.pdf doesn't work well - if (!File.Exists(testFilePath)) - { - testFilePath = Path.GetFullPath("../../../TestDocuments/Widget.pdf"); - } - - if (!File.Exists(testFilePath)) - { - Console.WriteLine($"Test PDF file not found. 
Skipping LLM test."); - return; - } - - Console.WriteLine($"\nTesting with PDF: {testFilePath}"); - - Document doc = new Document(testFilePath); - Console.WriteLine($"Document loaded: {doc.PageCount} page(s)"); - - string markdownStr = MuPDF4LLM.ToMarkdown(doc); - - doc.Close(); - - string markdownFile = "TestLLM.md"; - File.WriteAllText(markdownFile, markdownStr, Encoding.UTF8); - Console.WriteLine("\nLLM test completed successfully."); - } - catch (Exception ex) - { - Console.WriteLine($"Error in TestLLM: {ex.Message}"); - Console.WriteLine($"Stack trace: {ex.StackTrace}"); - } - } - - static void TestIssue1880() - { - Console.WriteLine("\n=== TestIssue1880 ======================="); - - string testFilePath = Path.GetFullPath(@"../../../TestDocuments/issue_1880.pdf"); - - Document doc = new Document(testFilePath); - - for (int i = 0; i < doc.PageCount; i++) - { - Page page = doc[i]; - - List barcodes = page.ReadBarcodes(barcodeFormat: BarcodeFormat.DM, pureBarcode:true); - foreach (Barcode barcode in barcodes) - { - BarcodePoint[] points = barcode.ResultPoints; - Console.WriteLine($"Page {i++} - Type: {barcode.BarcodeFormat} - Value: {barcode.Text} - Rect: [{points[0]},{points[1]}]"); - } - - page.Dispose(); - } - - doc.Close(); - } - - static void TestIssue213() - { - Console.WriteLine("\n=== TestIssue213 ======================="); - - string origfilename = @"../../../TestDocuments/issue_213.pdf"; - string outfilename = @"../../../TestDocuments/Blank.pdf"; - float newWidth = 0.5f; - - Document inputDoc = new Document(origfilename); - Document outputDoc = new Document(outfilename); - - if (inputDoc.PageCount != outputDoc.PageCount) - { - return; - } - - for (int pagNum = 0; pagNum < inputDoc.PageCount; pagNum++) - { - Page page = inputDoc.LoadPage(pagNum); - - Pixmap pxmp = page.GetPixmap(); - pxmp.Save(@"output.png"); - pxmp.Dispose(); - - Page outPage = outputDoc.LoadPage(pagNum); - List paths = page.GetDrawings(extended: false); - int totalPaths = paths.Count; 
- - int i = 0; - foreach (PathInfo pathInfo in paths) - { - Shape shape = outPage.NewShape(); - foreach (Item item in pathInfo.Items) - { - if (item != null) - { - if (item.Type == "l") - { - shape.DrawLine(item.P1, item.LastPoint); - //writer.Write($"{i:000}\\] line: {item.Type} >>> {item.P1}, {item.LastPoint}\\n"); - } - else if (item.Type == "re") - { - shape.DrawRect(item.Rect, item.Orientation); - //writer.Write($"{i:000}\\] rect: {item.Type} >>> {item.Rect}, {item.Orientation}\\n"); - } - else if (item.Type == "qu") - { - shape.DrawQuad(item.Quad); - //writer.Write($"{i:000}\\] quad: {item.Type} >>> {item.Quad}\\n"); - } - else if (item.Type == "c") - { - shape.DrawBezier(item.P1, item.P2, item.P3, item.LastPoint); - //writer.Write($"{i:000}\\] curve: {item.Type} >>> {item.P1}, {item.P2}, {item.P3}, {item.LastPoint}\\n"); - } - else - { - throw new Exception("unhandled drawing. Aborting..."); - } - } - } - - //pathInfo.Items.get - float newLineWidth = pathInfo.Width; - if (pathInfo.Width <= newWidth) - { - newLineWidth = newWidth; - } - - int lineCap = 0; - if (pathInfo.LineCap != null && pathInfo.LineCap.Count > 0) - lineCap = (int)pathInfo.LineCap[0]; - shape.Finish( - fill: pathInfo.Fill, - color: pathInfo.Color, //this.\_m_DEFAULT_COLOR, - evenOdd: pathInfo.EvenOdd, - closePath: pathInfo.ClosePath, - lineJoin: (int)pathInfo.LineJoin, - lineCap: lineCap, - width: newLineWidth, - strokeOpacity: pathInfo.StrokeOpacity, - fillOpacity: pathInfo.FillOpacity, - dashes: pathInfo.Dashes - ); - - // file_export.write(f'Path {i:03}\] width: {lwidth}, dashes: {path\["dashes"\]}, closePath: {path\["closePath"\]}\\n') - //writer.Write($"Path {i:000}\\] with: {newLineWidth}, dashes: {pathInfo.Dashes}, closePath: {pathInfo.ClosePath}\\n"); - - i++; - shape.Commit(); - } - } - - inputDoc.Close(); - - outputDoc.Save(@"output.pdf"); - outputDoc.Close(); - - //writer.Close(); - } - - static void CreateAnnotDocument() - { - Console.WriteLine("\n=== CreateAnnotDocument 
======================="); - Rect r = Constants.r; // use the rectangle defined in Constants.cs - - Document doc = new Document(); - Page page = doc.NewPage(); - - page.SetRotation(0); // no rotation - - TextWriter pw = new TextWriter(page.TrimBox); - string txt = "Origin 100.100"; - pw.Append(new Point(100, 500), txt, new Font("tiro"), fontSize: 24); - pw.WriteText(page, new float[]{0,0.4f,1}, oc: 0); - - - - Annot annot = page.AddRectAnnot(r); // 'Square' - annot.SetBorder(width: 1f, dashes: new int[] { 1, 2 }); - annot.SetColors(stroke: Constants.blue, fill: Constants.gold); - annot.Update(opacity: 0.5f); - - doc.Save(@"CreateAnnotDocument.pdf"); - - doc.Close(); - } - - static void TestDrawShape() - { - string origfilename = @"../../../TestDocuments/NewAnnots.pdf"; - string outfilename = @"../../../TestDocuments/Blank.pdf"; - float newWidth = 0.5f; - - Document inputDoc = new Document(origfilename); - Document outputDoc = new Document(outfilename); - - //string filePath = @"D:\\Vectorlab\\Jobs\\2025\\PACE\\pdf_fix\\assets\\exported_paths_net.txt"; - //StreamWriter writer = new StreamWriter(filePath); - - if (inputDoc.PageCount != outputDoc.PageCount) - { - return; - } - - for (int pagNum = 0; pagNum < inputDoc.PageCount; pagNum++) - { - Page page = inputDoc.LoadPage(pagNum); - Page outPage = outputDoc.LoadPage(pagNum); - List paths = page.GetDrawings(extended: false); - int totalPaths = paths.Count; - - int i = 0; - foreach (PathInfo pathInfo in paths) - { - Shape shape = outPage.NewShape(); - foreach (Item item in pathInfo.Items) - { - if (item != null) - { - if (item.Type == "l") - { - shape.DrawLine(item.P1, item.LastPoint); - //writer.Write($"{i:000}\\] line: {item.Type} >>> {item.P1}, {item.LastPoint}\\n"); - } - else if (item.Type == "re") - { - shape.DrawRect(item.Rect, item.Orientation); - //writer.Write($"{i:000}\\] rect: {item.Type} >>> {item.Rect}, {item.Orientation}\\n"); - } - else if (item.Type == "qu") - { - shape.DrawQuad(item.Quad); - 
//writer.Write($"{i:000}\\] quad: {item.Type} >>> {item.Quad}\\n"); - } - else if (item.Type == "c") - { - shape.DrawBezier(item.P1, item.P2, item.P3, item.LastPoint); - //writer.Write($"{i:000}\\] curve: {item.Type} >>> {item.P1}, {item.P2}, {item.P3}, {item.LastPoint}\\n"); - } - else - { - throw new Exception("unhandled drawing. Aborting..."); - } - } - } - - //pathInfo.Items.get - float newLineWidth = pathInfo.Width; - if (pathInfo.Width <= newWidth) - { - newLineWidth = newWidth; - } - - int lineCap = 0; - if (pathInfo.LineCap != null && pathInfo.LineCap.Count > 0) - lineCap = (int)pathInfo.LineCap[0]; - shape.Finish( - fill: pathInfo.Fill, - color: pathInfo.Color, //this.\_m_DEFAULT_COLOR, - evenOdd: pathInfo.EvenOdd, - closePath: pathInfo.ClosePath, - lineJoin: (int)pathInfo.LineJoin, - lineCap: lineCap, - width: newLineWidth, - strokeOpacity: pathInfo.StrokeOpacity, - fillOpacity: pathInfo.FillOpacity, - dashes: pathInfo.Dashes - ); - - // file_export.write(f'Path {i:03}\] width: {lwidth}, dashes: {path\["dashes"\]}, closePath: {path\["closePath"\]}\\n') - //writer.Write($"Path {i:000}\\] with: {newLineWidth}, dashes: {pathInfo.Dashes}, closePath: {pathInfo.ClosePath}\\n"); - - i++; - shape.Commit(); - } - } - - inputDoc.Close(); - - outputDoc.Save(@"TestDrawShape.pdf"); - outputDoc.Close(); - - //writer.Close(); - } - - static void TestImageFilter() - { - const string inputPath = @"../../../TestDocuments/Image/table.jpg"; - const string outputPath = @"output.png"; - - // Load the image file into SKBitmap - using (var bitmap = SKBitmap.Decode(inputPath)) - { - if (bitmap == null) - { - Console.WriteLine("Failed to load image."); - return; - } - - SKBitmap inputBitmap = bitmap.Copy(); - - // build the pipeline - var pipeline = new ImageFilterPipeline(); - - // clear any defaults if you’re reusing the instance - pipeline.Clear(); - - // add filters one-by-one - pipeline.AddDeskew(minAngle: 0.5); // replaces any existing deskew step - 
pipeline.AddRemoveHorizontalLines(); // also replaces existing horizontal-removal step - pipeline.AddRemoveVerticalLines(); - pipeline.AddGrayscale(); - //pipeline.AddMedian(blockSize: 2, replaceExisting: true); - //pipeline.AddGamma(gamma: 1.2); // brighten slightly - //pipeline.AddContrast(contrast: 100); - //pipeline.AddFit(100); - //pipeline.AddDilation(); - //pipeline.AddScale(scaleFactor: 1.75, quality: SKFilterQuality.Medium); - pipeline.AddInvert(); - - // apply the pipeline (bitmap is modified in place) - pipeline.Apply(ref inputBitmap); - - using (var data = inputBitmap.Encode(SKEncodedImageFormat.Png, 100)) // 100 = quality - { - using (var stream = File.OpenWrite(outputPath)) - { - data.SaveTo(stream); - } - } - - Console.WriteLine($"Loaded image: {bitmap.Width}x{bitmap.Height} pixels"); - } - } - - static void TestImageFilterOcr() - { - const string inputPath = @"../../../TestDocuments/Image/boxedpage.jpg"; - - using (Pixmap pxmp = new Pixmap(inputPath)) - { - // build the pipeline - var pipeline = new ImageFilterPipeline(); - - // clear any defaults if you’re reusing the instance - pipeline.Clear(); - - // add filters one-by-one - //pipeline.AddDeskew(minAngle: 0.5); // replaces any existing deskew step - //pipeline.AddRemoveHorizontalLines(); // also replaces existing horizontal-removal step - //pipeline.AddRemoveVerticalLines(); - //pipeline.AddGrayscale(); - //pipeline.AddMedian(blockSize: 2, replaceExisting: true); - pipeline.AddGamma(gamma: 1.2); // brighten slightly - //pipeline.AddContrast(contrast: 100); - //pipeline.AddScaleFit(100); - //pipeline.AddDilation(); - pipeline.AddScale(scaleFactor: 1.75, quality: SKFilterQuality.High); - //pipeline.AddInvert(); - - string txt = pxmp.GetTextFromOcr(pipeline); - Console.WriteLine(txt); - } - } - - static void TestMoveFile() - { - string origfilename = @"../../../TestDocuments/Blank.pdf"; - - string filePath = @"testmove.pdf"; - - File.Copy(origfilename, filePath, true); - - Document d = new 
Document(filePath); - - Page page = d[0]; - - Point tl = new Point(100, 120); - Point br = new Point(300, 150); - - Rect rect = new Rect(tl, br); - - TextWriter pw = new TextWriter(page.TrimBox); - /* - Font font = new Font(fontName: "tiro"); - - List<(string, float)> ret = pw.FillTextbox(rect, "This is a test to overwrite the original file and move it", font, fontSize: 24); - */ - pw.WriteText(page); - - page.Dispose(); - - MemoryStream tmp = new MemoryStream(); - - d.Save(tmp, garbage: 3, deflateFonts: 1, deflate: 1); - - d.Close(); - - File.WriteAllBytes(filePath, tmp.ToArray()); - - tmp.Dispose(); - - File.Move(filePath, @"moved.pdf", true); - } - - static void TestMetadata() - { - Console.WriteLine("\n=== TestMetadata ====================="); - - string testFilePath = @"../../../TestDocuments/Annot.pdf"; - - Document doc = new Document(testFilePath); - - Dictionary metaDict = doc.MetaData; - - foreach (string key in metaDict.Keys) - { - Console.WriteLine(key + ": " + metaDict[key]); - } - - doc.Close(); - - Console.WriteLine("TestMetadata completed."); - } - - static void TestMorph() - { - Console.WriteLine("\n=== TestMorph ====================="); - - string testFilePath = @"../../../TestDocuments/Morph.pdf"; - - Document doc = new Document(testFilePath); - Page page = doc[0]; - Rect printrect = new Rect(180, 30, 650, 60); - int pagerot = page.Rotation; - TextWriter pw = new TextWriter(page.TrimBox); - string txt = "Origin 100.100"; - pw.Append(new Point(100, 100), txt, new Font("tiro"), fontSize: 24); - pw.WriteText(page); - - txt = "rotated 270 - 100.100"; - Matrix matrix = new IdentityMatrix(); - matrix.Prerotate(270); - Morph mo = new Morph(new Point(100, 100), matrix); - pw = new TextWriter(page.TrimBox); - pw.Append(new Point(100, 100), txt, new Font("tiro"), fontSize: 24); - pw.WriteText(page, morph:mo); - page.SetRotation(270); - - page.Dispose(); - doc.Save(@"morph.pdf"); - doc.Close(); - } - - static void TestUnicodeDocument() - { - 
Console.WriteLine("\n=== TestUnicodeDocument ====================="); - - string testFilePath = @"../../../TestDocuments/你好.pdf"; - - Document doc = new Document(testFilePath); - - doc.Save(@"你好_.pdf"); - doc.Close(); - - Console.WriteLine("TestUnicodeDocument completed."); - } - - static void TestWriteBarcode1() - { - Console.WriteLine("\n=== TestWriteBarcode1 ====================="); - - string testFilePath = Path.GetFullPath("../../../TestDocuments/Blank.pdf"); - Document doc = new Document(testFilePath); - - Page page = doc[0]; - - // CODE39 - Rect rect = new Rect( - X0: Units.MmToPoints(50), - X1: Units.MmToPoints(80), - Y0: Units.MmToPoints(70), - Y1: Units.MmToPoints(85)); - - page.WriteBarcode(rect, "JJBEA6500", BarcodeFormat.CODE39, forceFitToRect: true, pureBarcode: true, narrowBarWidth:1); - - rect = new Rect( - X0: Units.MmToPoints(50), - X1: Units.MmToPoints(160), - Y0: Units.MmToPoints(100), - Y1: Units.MmToPoints(105)); - - page.WriteBarcode(rect, "JJBEA6500", BarcodeFormat.CODE39, forceFitToRect: true, pureBarcode: true, narrowBarWidth: 2); - - // CODE128 - Rect rect1 = new Rect( - X0: Units.MmToPoints(50), - X1: Units.MmToPoints(100), - Y0: Units.MmToPoints(50), - Y1: Units.MmToPoints(60)); - - page.WriteBarcode(rect1, "JJBEA6500063000000177922", BarcodeFormat.CODE128, forceFitToRect: false, pureBarcode: true, narrowBarWidth: 1); - - rect1 = new Rect( - X0: Units.MmToPoints(50), - X1: Units.MmToPoints(200), - Y0: Units.MmToPoints(80), - Y1: Units.MmToPoints(120)); - - page.WriteBarcode(rect1, "JJBEA6500063000000177922", BarcodeFormat.CODE128, forceFitToRect: true, pureBarcode: true, narrowBarWidth: 1); - - Rect rect2 = new Rect( - X0: Units.MmToPoints(100), - X1: Units.MmToPoints(140), - Y0: Units.MmToPoints(40), - Y1: Units.MmToPoints(80)); - - page.WriteBarcode(rect2, "01030000110444408000", BarcodeFormat.DM, forceFitToRect: false, pureBarcode: true, narrowBarWidth: 3); - - Pixmap pxmp = Utils.GetBarcodePixmap("JJBEA6500063000000177922", 
BarcodeFormat.CODE128, width: 500, pureBarcode: true, marginLeft:0, marginTop:0, marginRight:0, marginBottom:0, narrowBarWidth: 1); - - pxmp.Save(@"PxmpBarcode3.png"); - - byte[] imageBytes = pxmp.ToBytes(); - - using var stream = new SKMemoryStream(imageBytes); - using var codec = SKCodec.Create(stream); - var info = codec.Info; - var bitmap = SKBitmap.Decode(codec); - - using var data = bitmap.Encode(SKEncodedImageFormat.Png, 100); // 100 = quality - using var stream1 = File.OpenWrite(@"output.png"); - data.SaveTo(stream1); - - doc.Save(@"TestWriteBarcode1.pdf"); - - page.Dispose(); - doc.Close(); - - Console.WriteLine("TestWriteBarcode1 completed."); - } - - static void TestReadDataMatrix() - { - int i = 0; - - Console.WriteLine("\n=== TestReadDataMatrix ======================="); - - string testFilePath = Path.GetFullPath("../../../TestDocuments/Barcodes/datamatrix.pdf"); - Document doc = new Document(testFilePath); - - Page page = doc[0]; - - List barcodes = page.ReadBarcodes(decodeEmbeddedOnly: false); - - foreach (Barcode barcode in barcodes) - { - BarcodePoint[] points = barcode.ResultPoints; - Console.WriteLine($"Page {i++} - Type: {barcode.BarcodeFormat} - Value: {barcode.Text} - Rect: [{points[0]},{points[1]}]"); - } - /* - List blocks = page.GetImageInfo(); - - foreach (Block block in blocks) - { - Rect blockRect = block.Bbox; - barcodes = page.ReadBarcodes(clip:blockRect); - foreach (Barcode barcode in barcodes) - { - BarcodePoint[] points = barcode.ResultPoints; - if (points.Length == 2) - { - Console.WriteLine($"Page {i++} - Type: {barcode.BarcodeFormat} - Value: {barcode.Text} - Rect: [{points[0]},{points[1]}]"); - } - else if (points.Length == 4) - { - Console.WriteLine($"Page {i++} - Type: {barcode.BarcodeFormat} - Value: {barcode.Text} - Rect: [{points[0]},{points[2]}]"); - } - } - } - */ - /* - List imlist = page.GetImages(); - foreach (Entry im in imlist) - { - ImageInfo img = doc.ExtractImage(im.Xref); - File.WriteAllBytes(@"copy.png", 
img.Image); - - List barcodes = Utils.ReadBarcodes(@"copy.png", new Rect(0,0,img.Width,img.Height)); - - foreach (Barcode barcode in barcodes) - { - BarcodePoint[] points = barcode.ResultPoints; - Console.WriteLine($"Page {i++} - Type: {barcode.BarcodeFormat} - Value: {barcode.Text} - Rect: [{points[0]},{points[1]}]"); - } - } - */ - - page.Dispose(); - doc.Close(); - } - - static void TestMemoryLeak() - { - Console.WriteLine("\n=== TestMemoryLeak ======================="); - string testFilePath = Path.GetFullPath("../../../TestDocuments/Blank.pdf"); - - for (int i = 0; i < 100; i++) - { - Document doc = new Document(testFilePath); - Page page = doc.NewPage(); - page.Dispose(); - doc.Close(); - } - - Console.WriteLine("Memory leak test completed. No leaks should be detected."); - } - - static void DrawLine(Page page, float startX, float startY, float endX, float endY, Color lineColor = null, float lineWidth = 1, bool dashed = false) - { - Console.WriteLine("\n=== DrawLine ======================="); - - if (lineColor == null) - { - lineColor = new Color(); // Default to black - lineColor.Stroke = new float[] { 0, 0, 0 }; // RGB black - } - Shape img = page.NewShape(); - Point startPoint = new Point(startX, startY); - Point endPoint = new Point(endX, endY); - - String dashString = ""; - if (dashed == true) - { - dashString = "[2] 0"; // Example dash pattern - } - - img.DrawLine(startPoint, endPoint); - img.Finish(width: lineWidth, color: lineColor.Stroke, dashes: dashString); - img.Commit(); - - Console.WriteLine($"Line drawn from ({startX}, {startY}) to ({endX}, {endY}) with color {lineColor.Stroke} and width {lineWidth}."); - } - - static void TestDrawLine() - { - Console.WriteLine("\n=== TestDrawLine ======================="); - - Document doc = new Document(); - - Page page = doc.NewPage(); - - string fontDir = Environment.GetFolderPath(Environment.SpecialFolder.Fonts); - - page.DrawLine(new Point(45, 50), new Point(80, 50), width: 0.5f, dashes: "[5] 0"); - 
page.DrawLine(new Point(90, 50), new Point(150, 50), width: 0.5f, dashes: "[5] 0"); - page.DrawLine(new Point(45, 80), new Point(180, 80), width: 0.5f, dashes: "[5] 0"); - page.DrawLine(new Point(45, 100), new Point(180, 100), width: 0.5f, dashes: "[5] 0"); - - //DrawLine(page, 45, 50, 80, 50, lineWidth: 0.5f, dashed: true); - //DrawLine(page, 90, 60, 150, 60, lineWidth: 0.5f, dashed: true); - //DrawLine(page, 45, 80, 180, 80, lineWidth: 0.5f, dashed: true); - //DrawLine(page, 45, 100, 180, 100, lineWidth: 0.5f, dashed: true); - - doc.Save(@"TestDrawLine.pdf"); - - page.Dispose(); - doc.Close(); - - Console.WriteLine("Write to TestDrawLine.pdf"); - } - - static void TestTextFont(string[] args) - { - Console.WriteLine("\n=== TestTextFont ======================="); - //for (int i = 0; i < 100; i++) - { - Document doc = new Document(); - - Page page0 = doc.NewPage(); - Page page1 = doc.NewPage(pno: -1, width: 595, height: 842); - - string fontDir = Environment.GetFolderPath(Environment.SpecialFolder.Fonts); - - float[] blue = new float[] { 0.0f, 0.0f, 1.0f }; - float[] red = new float[] { 1.0f, 0.0f, 0.0f }; - - Rect rect1 = new Rect(100, 100, 510, 200); - Rect rect2 = new Rect(100, 250, 300, 400); - - MuPDF.NET.Font font1 = new MuPDF.NET.Font("asdfasdf"); - //MuPDF.NET.Font font1 = new MuPDF.NET.Font("arial", fontDir+"\\arial_0.ttf"); - MuPDF.NET.Font font2 = new MuPDF.NET.Font("times", fontDir + "\\times.ttf"); - - string text1 = "This is a test of the FillTextbox method with Arial font."; - string text2 = "This is another test with Times New Roman font."; - - MuPDF.NET.TextWriter tw1 = new MuPDF.NET.TextWriter(page0.Rect); - tw1.FillTextbox(rect: rect1, text: text1, font: font1, fontSize:20); - font1.Dispose(); - tw1.WriteText(page0); - - MuPDF.NET.TextWriter tw2 = new MuPDF.NET.TextWriter(page0.Rect, color: red); - tw2.FillTextbox(rect: rect2, text: text2, font: font2, fontSize: 10, align: (int)TextAlign.TEXT_ALIGN_LEFT); - font2.Dispose(); - tw2.WriteText(page0); 
- - doc.Save(@"TestTextFont.pdf"); - - page0.Dispose(); - doc.Close(); - - Console.WriteLine("Write to TestTextFont.pdf"); - } - - } - - static void TestInsertHtmlbox() - { - Console.WriteLine("\n=== TestInsertHtmlbox ======================="); - - Rect rect = new Rect(100, 100, 550, 2250); - Document doc = new Document(); - Page page = doc.NewPage(); - - string htmlString = "

生产准备:

1. 每日生产进行维护保养,请参照并填写Philips 自动螺丝起点检表《WI-Screw assembly-Makita DF010&Kilews

SKD-B512L-F01》

2 .扭力计UNIT选择‘lbf.in’,‘P-P’模式,每四小时检查一次,每次检查5组数据,只有合格才可以生产;并填写

力扭矩记录表,表单号 : F-EN-34 。

1.电动起子力矩: 5±1 in-lbs,电动螺丝起编号:5.0。

2.电动起子力矩:10±1 in-lbs,电动螺丝起编号:10.0。

3.电动起子力矩:12±1 in-lbs,电动螺丝起编号:12.0。

Colten - line break

生产准备:

1. 每日生产进行维护保养,请参照并填写Philips 自动螺丝起点检表《WI-Screw assembly-Makita DF010&Kilews

SKD-B512L-F01》

2 .扭力计UNIT选择‘lbf.in’,‘P-P’模式,每四小时检查一次,每次检查5组数据,只有合格才可以生产;并填写

力扭矩记录表,表单号 : F-EN-34 。

1.电动起子力矩: 5±1 in-lbs,电动螺丝起编号:5.0。

2.电动起子力矩:10±1 in-lbs,电动螺丝起编号:10.0。

3.电动起子力矩:12±1 in-lbs,电动螺丝起编号:12.0。

Colten - line break

生产准备:

1. 每日生产进行维护保养,请参照并填写Philips 自动螺丝起点检表《WI-Screw assembly-Makita DF010&Kilews

SKD-B512L-F01》

2 .扭力计UNIT选择‘lbf.in’,‘P-P’模式,每四小时检查一次,每次检查5组数据,只有合格才可以生产;并填写

力扭矩记录表,表单号 : F-EN-34 。

1.电动起子力矩: 5±1 in-lbs,电动螺丝起编号:5.0。

2.电动起子力矩:10±1 in-lbs,电动螺丝起编号:10.0。

3.电动起子力矩:12±1 in-lbs,电动螺丝起编号:12.0。

Colten - line break

生产准备:

1. 每日生产进行维护保养,请参照并填写Philips 自动螺丝起点检表《WI-Screw assembly-Makita DF010&Kilews

SKD-B512L-F01》

2 .扭力计UNIT选择‘lbf.in’,‘P-P’模式,每四小时检查一次,每次检查5组数据,只有合格才可以生产;并填写

力扭矩记录表,表单号 : F-EN-34 。

1.电动起子力矩: 5±1 in-lbs,电动螺丝起编号:5.0。

2.电动起子力矩:10±1 in-lbs,电动螺丝起编号:10.0。

3.电动起子力矩:12±1 in-lbs,电动螺丝起编号:12.0。

"; - (float s, float scale) = page.InsertHtmlBox(rect, htmlString, scaleLow: 0f); - doc.Save(@"TestInsertHtmlbox.pdf"); - - page.Dispose(); - doc.Close(); - - Console.WriteLine($"Inserted HTML box with scale: {scale} and size: {s}"); - } - - static void TestLineAnnot() - { - Console.WriteLine("\n=== TestLineAnnot ======================="); - Document newDoc = new Document(); - Page newPage = newDoc.NewPage(); - - newPage.AddLineAnnot(new Point(100, 100), new Point(300, 300)); - - newDoc.Save(@"TestLineAnnot1.pdf"); - newDoc.Close(); - - Document doc = new Document(@"TestLineAnnot1.pdf"); // open a document - List annotationsToUpdate = new List(); - Page page = doc[0]; - // Fix: Correctly handle the IEnumerable returned by GetAnnots() - IEnumerable annots = page.GetAnnots(); - foreach (Annot annot in annots) - { - Console.WriteLine("Annotation on page width before modified: " + annot.Border.Width); - annot.SetBorder(width: 8); - annot.Update(); - Console.WriteLine("Annotation on page width after modified: " + annot.Border.Width); - } - annotationsToUpdate.Clear(); - doc.Save(@"TestLineAnnot2.pdf"); // Save the modified document - doc.Close(); // Close the document - } - - static void TestHelloWorldToNewDocument(string[] args) - { - Console.WriteLine("\n=== TestHelloWorldToNewDocument ======================="); - Document doc = new Document(); - Page page = doc.NewPage(); - - //{ "helv", "Helvetica" }, - //{ "heit", "Helvetica-Oblique" }, - //{ "hebo", "Helvetica-Bold" }, - //{ "hebi", "Helvetica-BoldOblique" }, - //{ "cour", "Courier" }, - //{ "cobo", "Courier-Bold" }, - //{ "cobi", "Courier-BoldOblique" }, - //{ "tiro", "Times-Roman" }, - //{ "tibo", "Times-Bold" }, - //{ "tiit", "Times-Italic" }, - //{ "tibi", "Times-BoldItalic" }, - //{ "symb", "Symbol" }, - //{ "zadb", "ZapfDingbats" } - MuPDF.NET.TextWriter writer = new MuPDF.NET.TextWriter(page.Rect); - var ret = writer.FillTextbox(page.Rect, "Hello World!", new MuPDF.NET.Font(fontName: "helv"), rtl: true); - 
writer.WriteText(page); - doc.Save("text.pdf", pretty: 1); - doc.Close(); - - Console.WriteLine($"Text written to 'text.pdf' in: {page.Rect}"); - } - - static void TestHelloWorldToExistingDocument(string[] args) - { - Console.WriteLine("\n=== TestHelloWorldToExistingDocument ======================="); - string testFilePath = Path.GetFullPath("../../../TestDocuments/Blank.pdf"); - Document doc = new Document(testFilePath); - - Page page = doc[0]; - - Rect rect = new Rect(100, 100, 510, 210); - page.DrawRect(rect); - - MuPDF.NET.TextWriter writer = new MuPDF.NET.TextWriter(page.Rect); - //Font font = new Font("kenpixel", "../../../kenpixel.ttf", isBold: 1); - Font font = new Font("cobo", isBold: 0); - var ret = writer.FillTextbox(page.Rect, "123456789012345678901234567890Peter Test- this is a string that is too long to fit into the TextBox", font, rtl: false); - writer.WriteText(page); - - doc.Save("text1.pdf", pretty: 1); - - doc.Close(); - - Console.WriteLine($"Text written to 'text1.pdf' in: {page.Rect}"); - } - - static void TestReadBarcode(string[] args) - { - int i = 0; - - Console.WriteLine("\n=== TestReadBarcode ======================="); - - Console.WriteLine("--- Read from image file ----------"); - string testFilePath1 = Path.GetFullPath("../../../TestDocuments/Barcodes/rendered.bmp"); - - Rect rect1 = new Rect(1260, 390, 1720, 580); - List barcodes2 = Utils.ReadBarcodes(testFilePath1, clip:rect1); - - i = 0; - foreach (Barcode barcode in barcodes2) - { - BarcodePoint[] points = barcode.ResultPoints; - Console.WriteLine($"Page {i++} - Type: {barcode.BarcodeFormat} - Value: {barcode.Text} - Rect: [{points[0]},{points[1]}]"); - } - - Console.WriteLine("--- Read from pdf file ----------"); - - string testFilePath = Path.GetFullPath("../../../TestDocuments/Barcodes/Samples.pdf"); - Document doc = new Document(testFilePath); - - Page page = doc[0]; - //Rect rect = new Rect(290, 590, 420, 660); - List barcodes = page.ReadBarcodes(); - - foreach (Barcode barcode 
in barcodes) - { - BarcodePoint[] points = barcode.ResultPoints; - Console.WriteLine($"Page {i++} - Type: {barcode.BarcodeFormat} - Value: {barcode.Text} - Rect: [{points[0]},{points[1]}]"); - } - doc.Close(); - } - - static void TestReadQrCode(string[] args) - { - Console.WriteLine("\n=== TestReadQrCode ======================="); - int i = 0; - /* - Console.WriteLine("=== Read from image file ====================="); - string testFilePath1 = Path.GetFullPath("../../../TestDocuments/Barcodes/2.png"); - - List barcodes2 = Utils.ReadBarcodes(testFilePath1, autoRotate:true); - - i = 0; - foreach (Barcode barcode in barcodes2) - { - BarcodePoint[] points = barcode.ResultPoints; - Console.WriteLine($"Page {i++} - Type: {barcode.BarcodeFormat} - Value: {barcode.Text} - Rect: [{points[0]},{points[1]}]"); - } - */ - ///* - Console.WriteLine("--- Read from pdf file ----------"); - - string testImagePath = @"test.png"; - string testFilePath = Path.GetFullPath("../../../TestDocuments/Barcodes/input.pdf"); - Document doc = new Document(testFilePath); - - Page page = doc[0]; - page.RemoveRotation(); // remove rotation to read barcodes correctly - - // Apply 2x scale (both X and Y) - var matrix = new Matrix(3.0f, 3.0f); - - // Render the page using the scaled matrix - var pixmap = page.GetPixmap(matrix); - - pixmap.GammaWith(3.2f); // apply gamma correction to improve barcode detection - - pixmap.Save(testImagePath); - - /* - Rect rect = new Rect(400, 700, page.Rect.X1, page.Rect.Y1); - List barcodes = page.ReadBarcodes(rect); - - foreach (Barcode barcode in barcodes) - { - BarcodePoint[] points = barcode.ResultPoints; - Console.WriteLine($"Page {i++} - Type: {barcode.BarcodeFormat} - Value: {barcode.Text} - Rect: [{points[0]},{points[1]}]"); - } - */ - - pixmap.Dispose(); - doc.Close(); - - List barcodes2 = Utils.ReadBarcodes(testImagePath); - - i = 0; - foreach (Barcode barcode in barcodes2) - { - BarcodePoint[] points = barcode.ResultPoints; - Console.WriteLine($"Page {i++} - 
Type: {barcode.BarcodeFormat} - Value: {barcode.Text} - Rect: [{points[0]},{points[1]}]"); - } - //*/ - } - - static void TestWriteBarcode(string[] args) - { - Console.WriteLine("\n=== TestWriteBarcode ======================="); - Console.WriteLine("--- Write to pdf file ----------"); - string testFilePath = Path.GetFullPath("../../../TestDocuments/Blank.pdf"); - Document doc = new Document(testFilePath); - Page page = doc[0]; - - MuPDF.NET.TextWriter writer = new MuPDF.NET.TextWriter(page.Rect); - Font font = new Font("cour", isBold: 1); - writer.FillTextbox(page.Rect, "QR_CODE", font, pos: new Point(0, 10)); - writer.FillTextbox(page.Rect, "EAN_8", font, pos: new Point(0, 110)); - writer.FillTextbox(page.Rect, "EAN_13", font, pos: new Point(0, 165)); - writer.FillTextbox(page.Rect, "UPC_A", font, pos: new Point(0, 220)); - writer.FillTextbox(page.Rect, "CODE_39", font, pos: new Point(0, 275)); - writer.FillTextbox(page.Rect, "CODE_128", font, pos: new Point(0, 330)); - writer.FillTextbox(page.Rect, "ITF", font, pos: new Point(0, 385)); - writer.FillTextbox(page.Rect, "PDF_417", font, pos: new Point(0, 440)); - writer.FillTextbox(page.Rect, "CODABAR", font, pos: new Point(0, 520)); - writer.FillTextbox(page.Rect, "DATA_MATRIX", font, pos: new Point(0, 620)); - writer.WriteText(page); - - // QR_CODE - Rect rect = new Rect(100, 20, 300, 80); - page.WriteBarcode(rect, "Hello World!", BarcodeFormat.QR, forceFitToRect:false, pureBarcode:false, marginLeft:0); - - // EAN_8 - rect = new Rect(100, 100, 300, 120); - page.WriteBarcode(rect, "1234567", BarcodeFormat.EAN8, forceFitToRect: false, pureBarcode: false, marginBottom: 20); - - // EAN_13 - rect = new Rect(100, 155, 300, 200); - page.WriteBarcode(rect, "123456789012", BarcodeFormat.EAN13, forceFitToRect: false, pureBarcode: true, marginBottom: 0); - - // UPC_A - rect = new Rect(100, 210, 300, 255); - page.WriteBarcode(rect, "123456789012", BarcodeFormat.UPC_A, forceFitToRect: false, pureBarcode: true, marginBottom: 
0); - - // CODE_39 - rect = new Rect(100, 265, 600, 285); - page.WriteBarcode(rect, "Hello World!", BarcodeFormat.CODE39, forceFitToRect: false, pureBarcode: false, marginBottom: 0); - - // CODE_128 - rect = new Rect(100, 320, 400, 355); - page.WriteBarcode(rect, "Hello World!", BarcodeFormat.CODE128, forceFitToRect: true, pureBarcode: true, marginBottom: 0); - - // ITF - rect = new Rect(100, 385, 300, 420); - page.WriteBarcode(rect, "12345678901234567890", BarcodeFormat.I2OF5, forceFitToRect: false, pureBarcode: false, marginBottom: 0); - - // PDF_417 - rect = new Rect(100, 430, 400, 435); - page.WriteBarcode(rect, "Hello World!", BarcodeFormat.PDF417, forceFitToRect: false, pureBarcode: true, marginBottom: 0); - - // CODABAR - rect = new Rect(100, 540, 400, 580); - page.WriteBarcode(rect, "12345678901234567890", BarcodeFormat.CODABAR, forceFitToRect: false, pureBarcode: true, marginBottom: 0); - - // DATA_MATRIX - rect = new Rect(100, 620, 140, 660); - page.WriteBarcode(rect, "01100000110419257000", BarcodeFormat.DM, forceFitToRect: false, pureBarcode: false, marginBottom: 0); - - doc.Save("barcode.pdf"); - - Console.WriteLine($"Barcodes written to 'barcode.pdf' in: {page.Rect}"); - doc.Close(); - - Console.WriteLine("--- Write to image file ----------"); - - // QR_CODE - Utils.WriteBarcode("QR_CODE.png", "Hello World!", BarcodeFormat.QR, width: 600, height: 600, forceFitToRect: true, pureBarcode: false, marginBottom: 0); - - // EAN_8 - Utils.WriteBarcode("EAN_8.png", "1234567", BarcodeFormat.EAN8, width: 300, height: 20, forceFitToRect: false, pureBarcode: false, marginBottom: 4); - - // EAN_13 - Utils.WriteBarcode("EAN_13.png", "123456789012", BarcodeFormat.EAN13, width: 300, height: 0, forceFitToRect: false, pureBarcode: false, marginBottom: 10); - - // UPC_A - Utils.WriteBarcode("UPC_A.png", "123456789012", BarcodeFormat.UPC_A, width: 300, height: 20, forceFitToRect: false, pureBarcode: false, marginBottom: 10); - - // CODE_39 - 
Utils.WriteBarcode("CODE_39.png", "Hello World!", BarcodeFormat.CODE39, width: 300, height: 70, forceFitToRect: false, pureBarcode: false, marginBottom: 20); - - // CODE_128 - Utils.WriteBarcode("CODE_128.png", "Hello World!", BarcodeFormat.CODE128, width: 300, height: 150, forceFitToRect: false, pureBarcode: false, marginBottom: 20); - - // ITF - Utils.WriteBarcode("ITF.png", "12345678901234567890", BarcodeFormat.I2OF5, width: 300, height: 120, forceFitToRect: false, pureBarcode: false, marginBottom: 20); - - // PDF_417 - Utils.WriteBarcode("PDF_417.png", "Hello World!", BarcodeFormat.PDF417, width: 300, height: 10, forceFitToRect: false, pureBarcode: false, marginBottom: 0); - - // CODABAR - Utils.WriteBarcode("CODABAR.png", "12345678901234567890", BarcodeFormat.CODABAR, width: 300, height: 150, forceFitToRect: false, pureBarcode: false, marginBottom: 20); - - // DATA_MATRIX - Utils.WriteBarcode("DATA_MATRIX.png", "01100000110419257000", BarcodeFormat.DM, width: 300, height: 300, forceFitToRect: false, pureBarcode: true, marginBottom: 1); - - Console.WriteLine("Barcodes written to image files in the current directory."); - } - - static void TestExtractTextWithLayout(string[] args) - { - Console.WriteLine("\n=== TestExtractTextWithLayout ====================="); - string testFilePath = Path.GetFullPath("../../../TestDocuments/columns.pdf"); - Document doc = new Document(testFilePath); - - FileStream wstream = File.Create("columns.txt"); - - for (int i = 0; i < 1/*doc.PageCount*/; i++) - { - Page page = doc[i]; - string textWithLayout = page.GetTextWithLayout(tolerance: 3); - if (!string.IsNullOrEmpty(textWithLayout)) - { - byte[] bytes = Encoding.UTF8.GetBytes(textWithLayout); - wstream.Write(bytes, 0, bytes.Length); - } - } - - wstream.Close(); - - doc.Close(); - - Console.WriteLine("Created columns.txt file"); - } - - static void TestWidget(string[] args) - { - Console.WriteLine("\n=== TestWidget ====================="); - - string testFilePath = 
Path.GetFullPath("../../../TestDocuments/Widget.pdf"); - Document doc = new Document(testFilePath); - for (int i = 0; i < 1; i++) - { - var page = doc[i]; - - List entries = page.GetXObjects(); - - Widget fWidget = page.FirstWidget; - while (fWidget != null) - { - Console.WriteLine($"Widget: {fWidget}"); - Console.WriteLine($"FieldName: {fWidget.FieldName}"); - Console.WriteLine($"FieldType: {fWidget.FieldType}"); - Console.WriteLine($"FieldValue: {fWidget.FieldValue}"); - Console.WriteLine($"FieldFlags: {fWidget.FieldFlags}"); - Console.WriteLine($"FieldLabel: {fWidget.FieldLabel}"); - Console.WriteLine($"TextFont: {fWidget.TextFont}"); - Console.WriteLine($"TextFontSize: {fWidget.TextFontSize}"); - Console.WriteLine($"TextColor: {string.Join(",", fWidget.TextColor)}"); - fWidget = (Widget)fWidget.Next; - } - - foreach (var widget in page.GetWidgets()) - { - Console.WriteLine($"Widget: {widget}"); - Console.WriteLine($"FieldName: {widget.FieldName}"); - Console.WriteLine($"FieldType: {widget.FieldType}"); - Console.WriteLine($"FieldValue: {widget.FieldValue}"); - Console.WriteLine($"FieldFlags: {widget.FieldFlags}"); - Console.WriteLine($"FieldLabel: {widget.FieldLabel}"); - Console.WriteLine($"TextFont: {widget.TextFont}"); - Console.WriteLine($"TextFontSize: {widget.TextFontSize}"); - Console.WriteLine($"TextColor: {string.Join(",", widget.TextColor)}"); - - } - } - - doc.Close(); - Console.WriteLine("Widget test completed."); - } - - static void TestColor(string[] args) - { - Console.WriteLine("\n=== TestColor ====================="); - - string testFilePath = Path.GetFullPath("../../../TestDocuments/Color.pdf"); - Document doc = new Document(testFilePath); - List images = doc.GetPageImages(0); - Console.WriteLine($"CaName: {images[0].CsName}"); - doc.Recolor(0, 4); - images = doc.GetPageImages(0); - Console.WriteLine($"CaName: {images[0].AltCsName}"); - doc.Save("ReColor.pdf"); - doc.Close(); - - Console.WriteLine("Color test completed."); - } - - static void 
TestCMYKRecolor(string[] args) - { - Console.WriteLine("\n=== TestCMYKRecolor ====================="); - - string testFilePath = Path.GetFullPath("../../../TestDocuments/CMYK_Recolor.pdf"); - Document doc = new Document(testFilePath); - //List images = doc.GetPageImages(0); - //Console.WriteLine($"CaName: {images[0].CsName}"); - doc.Recolor(0, "CMYK"); - //images = doc.GetPageImages(0); - //Console.WriteLine($"CaName: {images[0].AltCsName}"); - doc.Save("CMYKRecolor.pdf"); - doc.Close(); - - Console.WriteLine("CMYK Recolor test completed."); - } - - static void TestSVGRecolor(string[] args) - { - Console.WriteLine("\n=== TestSVGRecolor ====================="); - - string testFilePath = Path.GetFullPath("../../../TestDocuments/SvgTest.pdf"); - Document doc = new Document(testFilePath); - doc.Recolor(0, "RGB"); - doc.Save("SVGRecolor.pdf"); - doc.Close(); - - Console.WriteLine("SVG Recolor test completed."); - } - - static void TestReplaceImage(string[] args) - { - Console.WriteLine("\n=== TestReplaceImage ====================="); - - string testFilePath = Path.GetFullPath("../../../TestDocuments/Color.pdf"); - Document doc = new Document(testFilePath); - Page page = doc[0]; - List images = page.GetImages(true); - List imgs = page.GetImageRects(images[0].Xref); - - List infos = page.GetImageInfo(xrefs: true); - - page.ReplaceImage(images[0].Xref, "../../../TestDocuments/Image/_apple.png"); - page.ReplaceImage(images[0].Xref, "../../../TestDocuments/Image/_bb-logo.png"); - - infos = page.GetImageInfo(xrefs: true); - //page.DeleteImage(images[0].Xref); - - //int newXref = page.InsertImage(imgs[0].Rect, "../../../TestDocuments/Sample.png"); - - //images = page.GetImages(true); - //imgs = page.GetImageRects(images[0].Xref); - - //page.ReplaceImage(infos[0].Xref, "../../../TestDocuments/Sample.png"); - //page.DeleteImage(images[0].Xref); - - //page.InsertImage(imgs[0].Rect, "../../../TestDocuments/Sample.jpg"); - - doc.Save("ReplaceImage.pdf"); - doc.Close(); - - 
Console.WriteLine("Image replacement test completed."); - } - - static void TestInsertImage(string[] args) - { - Console.WriteLine("\n=== TestInsertImage ====================="); - - string testFilePath = Path.GetFullPath("../../../TestDocuments/Image/test.pdf"); - Document doc = new Document(testFilePath); - Page page = doc[0]; - - var pixmap1 = new Pixmap("../../../TestDocuments/Image/_apple.png"); - //var pixmap1 = new Pixmap("../../../TestDocuments/Image/30mb.jpg"); - var pixmap2 = new Pixmap("../../../TestDocuments/Image/_bb-logo.png"); - var imageRect1 = new Rect(0, 0, 100, 100); - var imageRect2 = new Rect(100, 100, 200, 200); - var imageRect3 = new Rect(100, 200, 200, 300); - var imageRect4 = new Rect(100, 300, 200, 400); - var imageRect5 = new Rect(100, 400, 200, 500); - var imageRect6 = new Rect(100, 500, 200, 600); - - var img_xref = page.InsertImage(imageRect1, pixmap: pixmap1); - Console.WriteLine(img_xref); - - //img_xref = page.InsertImage(imageRect2, "../../../TestDocuments/Image/_apple.png"); - img_xref = page.InsertImage(imageRect2, pixmap: pixmap1); - Console.WriteLine(img_xref); - img_xref = page.InsertImage(imageRect3, pixmap: pixmap2); - Console.WriteLine(img_xref); - img_xref = page.InsertImage(imageRect4, "../../../TestDocuments/Image/_bb-logo.png"); - Console.WriteLine(img_xref); - page.InsertImage(imageRect5, xref: img_xref); - Console.WriteLine(img_xref); - page.InsertImage(imageRect6, xref: img_xref); - - doc.Save("TestInsertImage.pdf"); - doc.Close(); - - Console.WriteLine("Image insertion test completed."); - } - - static void TestGetImageInfo(string[] args) - { - Console.WriteLine("\n=== TestGetImageInfo ====================="); - - string testFilePath = Path.GetFullPath("../../../TestDocuments/Image/TestInsertImage.pdf"); - Document doc = new Document(testFilePath); - Page page = doc[0]; - - List infos = page.GetImageInfo(xrefs: true); - - doc.Close(); - - Console.WriteLine("Image info test completed."); - } - - static void 
TestGetTextPageOcr(string[] args) - { - Console.WriteLine("\n=== TestGetTextPageOcr ====================="); - - string testFilePath = Path.GetFullPath(@"../../../TestDocuments/Ocr.pdf"); - Document doc = new Document(testFilePath); - Page page = doc[0]; - - page.RemoveRotation(); - Pixmap pixmap = page.GetPixmap(); - - List blocks = page.GetText("dict", flags: (int)TextFlags.TEXT_PRESERVE_IMAGES)?.Blocks; - foreach (Block block in blocks) - { - Console.WriteLine(block.Image.Length); - } - - // build the pipeline - var pipeline = new ImageFilterPipeline(); - pipeline.Clear(); - //pipeline.AddDeskew(minAngle: 0.5); // replaces any existing deskew step - //pipeline.AddRemoveHorizontalLines(); // also replaces existing horizontal-removal step - //pipeline.AddRemoveVerticalLines(); - //pipeline.AddGrayscale(); - //pipeline.AddMedian(blockSize: 2, replaceExisting: true); - pipeline.AddGamma(gamma: 1.2); // brighten slightly - //pipeline.AddScaleFit(100); - pipeline.AddScale(scaleFactor: 3f, quality: SKFilterQuality.High); - //pipeline.AddContrast(contrast: 100); - //pipeline.AddDilation(); - //pipeline.AddInvert(); - - TextPage tp = page.GetTextPageOcr((int)TextFlags.TEXT_PRESERVE_SPANS, full: true, imageFilters: pipeline); - string txt = tp.ExtractText(); - Console.WriteLine(txt); - - doc.Close(); - - Console.WriteLine("OCR text extraction test completed."); - } - - static void TestCreateImagePage(string[] args) - { - Console.WriteLine("\n=== TestCreateImagePage ====================="); - - Pixmap pxmp = new Pixmap("../../../TestDocuments/Image/_bb-logo.png"); - - Document doc = new Document(); - Page page = doc.NewPage(width:pxmp.W, height:pxmp.H); - - page.InsertImage(page.Rect, pixmap: pxmp); - - pxmp.Dispose(); - - doc.Save("_bb-logo.pdf", pretty: 1); - doc.Close(); - - Console.WriteLine("Image page creation test completed."); - } - - static void TestJoinPdfPages(string[] args) - { - Console.WriteLine("\n=== TestJoinPdfPages ====================="); - - string 
testFilePath1 = Path.GetFullPath(@"../../../TestDocuments/Widget.pdf"); - Document doc1 = new Document(testFilePath1); - string testFilePath2 = Path.GetFullPath(@"../../../TestDocuments/Color.pdf"); - Document doc2 = new Document(testFilePath2); - - doc1.InsertPdf(doc2, 0, 0, 2); - - doc1.Save("Joined.pdf", pretty: 1); - - doc2.Close(); - doc1.Close(); - - Console.WriteLine("PDF pages joined successfully into 'Joined.pdf'."); - } - - static void TestFreeTextAnnot(string[] args) + private static void Main(string[] args) { - Console.WriteLine("\n=== TestFreeTextAnnot ====================="); - - Rect r = new Rect(72, 72, 220, 100); - string t1 = "têxt üsès Lätiñ charß,\nEUR: €, mu: µ, super scripts: ²³!"; - Rect rect = new Rect(100,100,200,200); - float[] red = new float[] { 1, 0, 0 }; - float[] blue = new float[] { 0, 0, 1 }; - float[] gold = new float[] { 1, 1, 0 }; - float[] green = new float[] { 0, 1, 0 }; - float[] white = new float[] { 1, 1, 1 }; - - Document doc = new Document(); - Page page = doc.NewPage(); - - Annot annot = page.AddFreeTextAnnot( - rect, - t1, - fontSize: 10, - rotate: 90, - textColor: red, - fillColor: gold, - align: (int)TextAlign.TEXT_ALIGN_CENTER, - dashes: new int[] { 2 } - ); - - annot.SetBorder(border: null, width: 0.3f, dashes: new int[] { 2 }); - annot.Update(textColor: blue); - //annot.Update(textColor: red, fillColor: blue); - - doc.Save("FreeTextAnnot.pdf"); - - doc.Close(); - - Console.WriteLine("Free text annotation created and saved to 'FreeTextAnnot.pdf'."); + SampleMenu.Run(args); } } } diff --git a/Demo/SampleMenu.cs b/Demo/SampleMenu.cs new file mode 100644 index 00000000..d6def26f --- /dev/null +++ b/Demo/SampleMenu.cs @@ -0,0 +1,171 @@ +namespace Demo +{ + /// + /// Demo samples grouped by MuPDF.NET / MuPDF.NET4LLM feature areas. With no arguments, runs every sample. + /// Use dotnet run -- help for the list, or dotnet run -- <name> for one sample. 
+ /// + public static class SampleMenu + { + /// Library-facing group (matches folders under Samples/ and major API surfaces). + private sealed record Sample(string Category, string Name, string Description, Action Run); + + /// Order matches Samples/ layout; MuPDF.NET4LLM extras live in Samples/Llm/Program.Llm.*.Fixtures.cs. + private static readonly Sample[] Samples = + { + // —— Document & I/O (MuPDF.NET Document, open/save, streams) —— Samples/Document + new("Document & I/O", "hello-new-pdf", "Hello World on a new PDF", a => Program.TestHelloWorldToNewDocument(a)), + new("Document & I/O", "hello-existing-pdf", "Hello World on existing Blank.pdf", a => Program.TestHelloWorldToExistingDocument(a)), + new("Document & I/O", "join-pdf", "Insert pages from another PDF", a => Program.TestJoinPdfPages(a)), + new("Document & I/O", "metadata", "Print document metadata", _ => Program.TestMetadata()), + new("Document & I/O", "move-file", "Save through MemoryStream and move output", _ => Program.TestMoveFile()), + new("Document & I/O", "unicode-doc", "Save PDF with unicode filename", _ => Program.TestUnicodeDocument()), + new("Document & I/O", "memory-leak", "Open/close documents in a loop", _ => Program.TestMemoryLeak()), + + // —— Text, story & vector drawing (Page, Story, TextWriter, Shape) —— Samples/TextDrawing + new("Text, story & drawing", "insert-htmlbox", "Insert HTML story box into a new page", _ => Program.TestInsertHtmlbox()), + new("Text, story & drawing", "text-font", "FillTextbox with fonts", a => Program.TestTextFont(a)), + new("Text, story & drawing", "morph", "TextWriter with morph / rotation", _ => Program.TestMorph()), + new("Text, story & drawing", "gettext", "GetText dict dump per page", _ => Program.TestGetText()), + new("Text, story & drawing", "extract-text-layout", "Extract text with reading order (columns.pdf)", a => Program.TestExtractTextWithLayout(a)), + new("Text, story & drawing", "draw-line", "Draw dashed lines on a page", _ => 
Program.TestDrawLine()), + new("Text, story & drawing", "draw-shape", "Copy vector paths between PDFs", _ => Program.TestDrawShape()), + + // —— Annotations —— Samples/Annotations + new("Annotations", "line-annot", "Create and modify line annotations", _ => Program.TestLineAnnot()), + new("Annotations", "annot-freetext1", "Free-text annotation sample (1)", a => Program.TestAnnotationsFreeText1(a)), + new("Annotations", "annot-freetext2", "Free-text annotation sample (2)", a => Program.TestAnnotationsFreeText2(a)), + new("Annotations", "new-annots", "Caret, markers, shapes, stamp, redaction, etc.", a => NewAnnots.Run(a)), + new("Annotations", "annot-doc", "Rectangle annotation + text", _ => Program.CreateAnnotDocument()), + new("Annotations", "freetext-annot", "Add free-text annotation (unicode)", a => Program.TestFreeTextAnnot(a)), + + // —— Pages, widgets, images & color —— Samples/PageContent + new("Pages, widgets, images & color", "widget", "Inspect form widgets", a => Program.TestWidget(a)), + new("Pages, widgets, images & color", "color", "Recolor page images", a => Program.TestColor(a)), + new("Pages, widgets, images & color", "cmyk-recolor", "CMYK recolor", a => Program.TestCMYKRecolor(a)), + new("Pages, widgets, images & color", "svg-recolor", "SVG / RGB recolor", a => Program.TestSVGRecolor(a)), + new("Pages, widgets, images & color", "replace-image", "Replace embedded images", a => Program.TestReplaceImage(a)), + new("Pages, widgets, images & color", "insert-image", "Insert images from pixmaps and files", a => Program.TestInsertImage(a)), + new("Pages, widgets, images & color", "get-image-info", "Dump image xref info", a => Program.TestGetImageInfo(a)), + new("Pages, widgets, images & color", "page-ocr", "OCR text page with image filter pipeline", a => Program.TestGetTextPageOcr(a)), + new("Pages, widgets, images & color", "create-image-page", "New PDF page from PNG pixmap", a => Program.TestCreateImagePage(a)), + + // —— Image filters (Skia) —— 
Samples/ImageFilters + new("Image filters (Skia)", "image-filter", "Skia pipeline on table.jpg → output.png", _ => Program.TestImageFilter()), + new("Image filters (Skia)", "image-filter-ocr", "Pixmap OCR with filter pipeline", _ => Program.TestImageFilterOcr()), + + // —— Barcodes —— Samples/Barcodes + new("Barcodes", "read-barcode", "Read barcodes from image and PDF", a => Program.TestReadBarcode(a)), + new("Barcodes", "read-datamatrix", "Read Data Matrix from PDF", _ => Program.TestReadDataMatrix()), + new("Barcodes", "read-qrcode", "Render PDF page and read QR from PNG", a => Program.TestReadQrCode(a)), + new("Barcodes", "write-barcode", "Write many barcode types to PDF and PNG", a => Program.TestWriteBarcode(a)), + new("Barcodes", "write-barcode1", "Write CODE39/CODE128/DM with Units rects", _ => Program.TestWriteBarcode1()), + + // —— MuPDF.NET4LLM —— Samples/Llm + new("MuPDF.NET4LLM", "llm", "MuPDF4LLM.ToMarkdown quick test", _ => Program.TestLLM()), + new("MuPDF.NET4LLM", "rag-markdown", "MuPdfRag.ToMarkdown (Magazine.pdf)", _ => Program.TestPyMuPdfRagToMarkdown()), + new("MuPDF.NET4LLM", "table", "Detect tables and export markdown", _ => Program.TestTable()), + new("MuPDF.NET4LLM", "markdown-reader", "LlamaIndex PDFMarkdownReader", _ => Program.TestMarkdownReader()), + new("MuPDF.NET4LLM", "llm-to-markdown-fixture-370", "ToMarkdown vs tests/test_370_expected.md (needs tests/test_370.pdf)", a => Program.Test4LlmToMarkdownCompareExpected370(a)), + new("MuPDF.NET4LLM", "llm-to-markdown-ocr-1", "ToMarkdown + U+FFFD fixture (tests/test_ocr_loremipsum_FFFD.pdf)", a => Program.Test4LlmToMarkdownOcrFixture1(a)), + new("MuPDF.NET4LLM", "llm-to-markdown-ocr-2", "ToMarkdown useOcr=false on FFFD fixture", a => Program.Test4LlmToMarkdownOcrFixture2(a)), + new("MuPDF.NET4LLM", "llm-to-markdown-ocr-3", "ToMarkdown OCR on/off on SVG fixture", a => Program.Test4LlmToMarkdownOcrFixture3(a)), + new("MuPDF.NET4LLM", "llm-pdf-reader-empty", "PDFMarkdownReader: new PDF, one 
blank page", a => Program.Test4LlmPdfMarkdownReaderEmptyPage(a)), + new("MuPDF.NET4LLM", "llm-pdf-reader-missing-file", "PDFMarkdownReader: missing path → FileNotFoundException", a => Program.Test4LlmPdfMarkdownReaderMissingFile(a)), + + // —— Regression & diagnostics —— Samples/Regression + new("Regression & diagnostics", "issue-213", "Repro: drawing paths / line width", _ => Program.TestIssue213()), + new("Regression & diagnostics", "issue-1880", "Repro: read Data Matrix barcodes", _ => Program.TestIssue1880()), + new("Regression & diagnostics", "issue-234", "Repro: pixmap scale + insert image", _ => Program.TestIssue234()), + new("Regression & diagnostics", "jbig2", "Rewrite images with FAX recompression", _ => Program.TestRecompressJBIG2()), + }; + + private static readonly Dictionary ByName = BuildIndex(); + + private static Dictionary BuildIndex() + { + var d = new Dictionary(StringComparer.OrdinalIgnoreCase); + foreach (var s in Samples) + { + d[s.Name] = s; + } + return d; + } + + public static void Run(string[] args) + { + if (args.Length > 0 && IsHelp(args[0])) + { + PrintUsage(); + return; + } + + if (args.Length == 0 || IsRunAllSwitch(args[0])) + { + RunAll(); + return; + } + + if (!ByName.TryGetValue(args[0], out var sample)) + { + Console.Error.WriteLine($"Unknown sample: {args[0]}"); + PrintUsage(); + Environment.ExitCode = 1; + return; + } + + Console.WriteLine($"--- Sample: {sample.Name} ({sample.Category}) ---"); + sample.Run(args); + } + + private static bool IsHelp(string a) => + a is "-h" or "-?" or "/?" 
or "help" or "--help"; + + private static bool IsRunAllSwitch(string a) => + string.Equals(a, "all", StringComparison.OrdinalIgnoreCase) + || string.Equals(a, "-all", StringComparison.OrdinalIgnoreCase) + || string.Equals(a, "--all", StringComparison.OrdinalIgnoreCase); + + private static void RunAll() + { + var sampleArgs = Array.Empty(); + foreach (var s in Samples) + { + Console.WriteLine(); + Console.WriteLine($"========== {s.Category} / {s.Name} =========="); + try + { + s.Run(sampleArgs); + } + catch (Exception ex) + { + Console.Error.WriteLine($"FAILED {s.Name}: {ex.Message}"); + } + } + } + + private static void PrintUsage() + { + Console.WriteLine("MuPDF.NET Demo — samples mirror library areas under Demo/Samples/. Default: run all."); + Console.WriteLine(); + Console.WriteLine(" dotnet run (or: dotnet run -- -all)"); + Console.WriteLine(" dotnet run -- "); + Console.WriteLine(" dotnet run -- help"); + Console.WriteLine(); + Console.WriteLine("Samples by category:"); + var lastCat = ""; + foreach (var s in Samples) + { + if (s.Category != lastCat) + { + Console.WriteLine(); + Console.WriteLine($" [{s.Category}]"); + lastCat = s.Category; + } + + Console.WriteLine($" {s.Name,-22} {s.Description}"); + } + + Console.WriteLine(); + } + } +} diff --git a/Demo/new-annots.cs b/Demo/Samples/Annotations/NewAnnots.cs similarity index 100% rename from Demo/new-annots.cs rename to Demo/Samples/Annotations/NewAnnots.cs diff --git a/Demo/Samples/Annotations/Program.Annotations.FreeText.cs b/Demo/Samples/Annotations/Program.Annotations.FreeText.cs new file mode 100644 index 00000000..4f398e5f --- /dev/null +++ b/Demo/Samples/Annotations/Program.Annotations.FreeText.cs @@ -0,0 +1,75 @@ +namespace Demo +{ + internal partial class Program + { + /// Three stacked FreeText annotations (plain text, fonts, rotation). 
+ internal static void TestAnnotationsFreeText1(string[] args) + { + _ = args; + Console.WriteLine("\n=== TestAnnotationsFreeText1 ======================="); + + Document doc = new Document(); + Page page = doc.NewPage(); + + Rect r1 = new Rect(100, 100, 200, 150); + Rect r2 = r1 + new Rect(0, 75, 0, 75); + Rect r3 = r2 + new Rect(0, 75, 0, 75); + + string t = "¡Un pequeño texto para practicar!"; + + Annot a1 = page.AddFreeTextAnnot(r1, t, textColor: Constants.red); + Annot a2 = page.AddFreeTextAnnot(r2, t, fontName: "Ti", textColor: Constants.blue); + Annot a3 = page.AddFreeTextAnnot(r3, t, fontName: "Co", textColor: Constants.blue, rotate: 90); + a3.SetBorder(width: 0); + a3.Update(fontSize: 8, fillColor: Constants.gold); + + doc.Save("a-freetext.pdf"); + doc.Close(); + + Console.WriteLine("Saved to a-freetext.pdf"); + } + + /// FreeText with rich text, styling, and callout line. + internal static void TestAnnotationsFreeText2(string[] args) + { + _ = args; + Console.WriteLine("\n=== TestAnnotationsFreeText2 ======================="); + + string ds = "font-size: 11pt; font-family: sans-serif;"; + string bullet = "\u2610\u2611\u2612"; + + string text = $@"

+MuPDF.NET འདི་ ཡིག་ཆ་བཀྲམ་སྤེལ་གྱི་དོན་ལུ་ པའི་ཐོན་ཐུམ་སྒྲིལ་དྲག་ཤོས་དང་མགྱོགས་ཤོས་ཅིག་ཨིན། +Here is some bold and italic text, followed by bold-italic. Text-based check boxes: {bullet}. +

"; + + Document doc = new Document(); + Page page = doc.NewPage(); + + Rect rect = new Rect(100, 100, 350, 200); + Point p2 = rect.TopRight + new Point(50, 30); + Point p3 = p2 + new Point(0, 30); + + Annot annot = page.AddFreeTextAnnot( + rect, + text, + fillColor: Constants.gold, + opacity: 1, + rotate: 0, + borderWidth: 1, + dashes: null, + richtext: true, + style: ds, + callout: new Point[] { p3, p2, rect.TopRight }, + lineEnd: PdfLineEnding.PDF_ANNOT_LE_OPEN_ARROW, + borderColor: Constants.green + ); + + const string outName = "AnnotationsFreeText2.pdf"; + doc.Save(outName, pretty: 1); + doc.Close(); + + Console.WriteLine("Saved to " + outName); + } + } +} diff --git a/Demo/Samples/Barcodes/Program.Barcodes.cs b/Demo/Samples/Barcodes/Program.Barcodes.cs new file mode 100644 index 00000000..792c161a --- /dev/null +++ b/Demo/Samples/Barcodes/Program.Barcodes.cs @@ -0,0 +1,341 @@ +namespace Demo +{ + internal partial class Program + { + internal static void TestWriteBarcode1() + { + Console.WriteLine("\n=== TestWriteBarcode1 ====================="); + + string testFilePath = Path.GetFullPath("../../../TestDocuments/Blank.pdf"); + Document doc = new Document(testFilePath); + + Page page = doc[0]; + + // CODE39 + Rect rect = new Rect( + X0: Units.MmToPoints(50), + X1: Units.MmToPoints(80), + Y0: Units.MmToPoints(70), + Y1: Units.MmToPoints(85)); + + page.WriteBarcode(rect, "JJBEA6500", BarcodeFormat.CODE39, forceFitToRect: true, pureBarcode: true, narrowBarWidth:1); + + rect = new Rect( + X0: Units.MmToPoints(50), + X1: Units.MmToPoints(160), + Y0: Units.MmToPoints(100), + Y1: Units.MmToPoints(105)); + + page.WriteBarcode(rect, "JJBEA6500", BarcodeFormat.CODE39, forceFitToRect: true, pureBarcode: true, narrowBarWidth: 2); + + // CODE128 + Rect rect1 = new Rect( + X0: Units.MmToPoints(50), + X1: Units.MmToPoints(100), + Y0: Units.MmToPoints(50), + Y1: Units.MmToPoints(60)); + + page.WriteBarcode(rect1, "JJBEA6500063000000177922", BarcodeFormat.CODE128, 
forceFitToRect: false, pureBarcode: true, narrowBarWidth: 1); + + rect1 = new Rect( + X0: Units.MmToPoints(50), + X1: Units.MmToPoints(200), + Y0: Units.MmToPoints(80), + Y1: Units.MmToPoints(120)); + + page.WriteBarcode(rect1, "JJBEA6500063000000177922", BarcodeFormat.CODE128, forceFitToRect: true, pureBarcode: true, narrowBarWidth: 1); + + Rect rect2 = new Rect( + X0: Units.MmToPoints(100), + X1: Units.MmToPoints(140), + Y0: Units.MmToPoints(40), + Y1: Units.MmToPoints(80)); + + page.WriteBarcode(rect2, "01030000110444408000", BarcodeFormat.DM, forceFitToRect: false, pureBarcode: true, narrowBarWidth: 3); + + Pixmap pxmp = Utils.GetBarcodePixmap("JJBEA6500063000000177922", BarcodeFormat.CODE128, width: 500, pureBarcode: true, marginLeft:0, marginTop:0, marginRight:0, marginBottom:0, narrowBarWidth: 1); + + pxmp.Save(@"PxmpBarcode3.png"); + + byte[] imageBytes = pxmp.ToBytes(); + + using var stream = new SKMemoryStream(imageBytes); + using var codec = SKCodec.Create(stream); + var info = codec.Info; + var bitmap = SKBitmap.Decode(codec); + + using var data = bitmap.Encode(SKEncodedImageFormat.Png, 100); // 100 = quality + using var stream1 = File.OpenWrite(@"output.png"); + data.SaveTo(stream1); + + doc.Save(@"TestWriteBarcode1.pdf"); + + page.Dispose(); + doc.Close(); + + Console.WriteLine("TestWriteBarcode1 completed."); + } + + internal static void TestReadDataMatrix() + { + int i = 0; + + Console.WriteLine("\n=== TestReadDataMatrix ======================="); + + string testFilePath = Path.GetFullPath("../../../TestDocuments/Barcodes/datamatrix.pdf"); + Document doc = new Document(testFilePath); + + Page page = doc[0]; + + List barcodes = page.ReadBarcodes(decodeEmbeddedOnly: false); + + foreach (Barcode barcode in barcodes) + { + BarcodePoint[] points = barcode.ResultPoints; + Console.WriteLine($"Page {i++} - Type: {barcode.BarcodeFormat} - Value: {barcode.Text} - Rect: [{points[0]},{points[1]}]"); + } + /* + List blocks = page.GetImageInfo(); + + foreach 
(Block block in blocks) + { + Rect blockRect = block.Bbox; + barcodes = page.ReadBarcodes(clip:blockRect); + foreach (Barcode barcode in barcodes) + { + BarcodePoint[] points = barcode.ResultPoints; + if (points.Length == 2) + { + Console.WriteLine($"Page {i++} - Type: {barcode.BarcodeFormat} - Value: {barcode.Text} - Rect: [{points[0]},{points[1]}]"); + } + else if (points.Length == 4) + { + Console.WriteLine($"Page {i++} - Type: {barcode.BarcodeFormat} - Value: {barcode.Text} - Rect: [{points[0]},{points[2]}]"); + } + } + } + */ + /* + List imlist = page.GetImages(); + foreach (Entry im in imlist) + { + ImageInfo img = doc.ExtractImage(im.Xref); + File.WriteAllBytes(@"copy.png", img.Image); + + List barcodes = Utils.ReadBarcodes(@"copy.png", new Rect(0,0,img.Width,img.Height)); + + foreach (Barcode barcode in barcodes) + { + BarcodePoint[] points = barcode.ResultPoints; + Console.WriteLine($"Page {i++} - Type: {barcode.BarcodeFormat} - Value: {barcode.Text} - Rect: [{points[0]},{points[1]}]"); + } + } + */ + + page.Dispose(); + doc.Close(); + } + + + internal static void TestReadBarcode(string[] args) + { + int i = 0; + + Console.WriteLine("\n=== TestReadBarcode ======================="); + + Console.WriteLine("--- Read from image file ----------"); + string testFilePath1 = Path.GetFullPath("../../../TestDocuments/Barcodes/rendered.bmp"); + + Rect rect1 = new Rect(1260, 390, 1720, 580); + List barcodes2 = Utils.ReadBarcodes(testFilePath1, clip:rect1); + + i = 0; + foreach (Barcode barcode in barcodes2) + { + BarcodePoint[] points = barcode.ResultPoints; + Console.WriteLine($"Page {i++} - Type: {barcode.BarcodeFormat} - Value: {barcode.Text} - Rect: [{points[0]},{points[1]}]"); + } + + Console.WriteLine("--- Read from pdf file ----------"); + + string testFilePath = Path.GetFullPath("../../../TestDocuments/Barcodes/Samples.pdf"); + Document doc = new Document(testFilePath); + + Page page = doc[0]; + //Rect rect = new Rect(290, 590, 420, 660); + List barcodes = 
page.ReadBarcodes(); + + foreach (Barcode barcode in barcodes) + { + BarcodePoint[] points = barcode.ResultPoints; + Console.WriteLine($"Page {i++} - Type: {barcode.BarcodeFormat} - Value: {barcode.Text} - Rect: [{points[0]},{points[1]}]"); + } + doc.Close(); + } + + internal static void TestReadQrCode(string[] args) + { + Console.WriteLine("\n=== TestReadQrCode ======================="); + int i = 0; + /* + Console.WriteLine("=== Read from image file ====================="); + string testFilePath1 = Path.GetFullPath("../../../TestDocuments/Barcodes/2.png"); + + List barcodes2 = Utils.ReadBarcodes(testFilePath1, autoRotate:true); + + i = 0; + foreach (Barcode barcode in barcodes2) + { + BarcodePoint[] points = barcode.ResultPoints; + Console.WriteLine($"Page {i++} - Type: {barcode.BarcodeFormat} - Value: {barcode.Text} - Rect: [{points[0]},{points[1]}]"); + } + */ + ///* + Console.WriteLine("--- Read from pdf file ----------"); + + string testImagePath = @"test.png"; + string testFilePath = Path.GetFullPath("../../../TestDocuments/Barcodes/input.pdf"); + Document doc = new Document(testFilePath); + + Page page = doc[0]; + page.RemoveRotation(); // remove rotation to read barcodes correctly + + // Apply 2x scale (both X and Y) + var matrix = new Matrix(3.0f, 3.0f); + + // Render the page using the scaled matrix + var pixmap = page.GetPixmap(matrix); + + pixmap.GammaWith(3.2f); // apply gamma correction to improve barcode detection + + pixmap.Save(testImagePath); + + /* + Rect rect = new Rect(400, 700, page.Rect.X1, page.Rect.Y1); + List barcodes = page.ReadBarcodes(rect); + + foreach (Barcode barcode in barcodes) + { + BarcodePoint[] points = barcode.ResultPoints; + Console.WriteLine($"Page {i++} - Type: {barcode.BarcodeFormat} - Value: {barcode.Text} - Rect: [{points[0]},{points[1]}]"); + } + */ + + pixmap.Dispose(); + doc.Close(); + + List barcodes2 = Utils.ReadBarcodes(testImagePath); + + i = 0; + foreach (Barcode barcode in barcodes2) + { + BarcodePoint[] points 
= barcode.ResultPoints; + Console.WriteLine($"Page {i++} - Type: {barcode.BarcodeFormat} - Value: {barcode.Text} - Rect: [{points[0]},{points[1]}]"); + } + //*/ + } + + internal static void TestWriteBarcode(string[] args) + { + Console.WriteLine("\n=== TestWriteBarcode ======================="); + Console.WriteLine("--- Write to pdf file ----------"); + string testFilePath = Path.GetFullPath("../../../TestDocuments/Blank.pdf"); + Document doc = new Document(testFilePath); + Page page = doc[0]; + + MuPDF.NET.TextWriter writer = new MuPDF.NET.TextWriter(page.Rect); + Font font = new Font("cour", isBold: 1); + writer.FillTextbox(page.Rect, "QR_CODE", font, pos: new Point(0, 10)); + writer.FillTextbox(page.Rect, "EAN_8", font, pos: new Point(0, 110)); + writer.FillTextbox(page.Rect, "EAN_13", font, pos: new Point(0, 165)); + writer.FillTextbox(page.Rect, "UPC_A", font, pos: new Point(0, 220)); + writer.FillTextbox(page.Rect, "CODE_39", font, pos: new Point(0, 275)); + writer.FillTextbox(page.Rect, "CODE_128", font, pos: new Point(0, 330)); + writer.FillTextbox(page.Rect, "ITF", font, pos: new Point(0, 385)); + writer.FillTextbox(page.Rect, "PDF_417", font, pos: new Point(0, 440)); + writer.FillTextbox(page.Rect, "CODABAR", font, pos: new Point(0, 520)); + writer.FillTextbox(page.Rect, "DATA_MATRIX", font, pos: new Point(0, 620)); + writer.WriteText(page); + + // QR_CODE + Rect rect = new Rect(100, 20, 300, 80); + page.WriteBarcode(rect, "Hello World!", BarcodeFormat.QR, forceFitToRect:false, pureBarcode:false, marginLeft:0); + + // EAN_8 + rect = new Rect(100, 100, 300, 120); + page.WriteBarcode(rect, "1234567", BarcodeFormat.EAN8, forceFitToRect: false, pureBarcode: false, marginBottom: 20); + + // EAN_13 + rect = new Rect(100, 155, 300, 200); + page.WriteBarcode(rect, "123456789012", BarcodeFormat.EAN13, forceFitToRect: false, pureBarcode: true, marginBottom: 0); + + // UPC_A + rect = new Rect(100, 210, 300, 255); + page.WriteBarcode(rect, "123456789012", 
BarcodeFormat.UPC_A, forceFitToRect: false, pureBarcode: true, marginBottom: 0); + + // CODE_39 + rect = new Rect(100, 265, 600, 285); + page.WriteBarcode(rect, "Hello World!", BarcodeFormat.CODE39, forceFitToRect: false, pureBarcode: false, marginBottom: 0); + + // CODE_128 + rect = new Rect(100, 320, 400, 355); + page.WriteBarcode(rect, "Hello World!", BarcodeFormat.CODE128, forceFitToRect: true, pureBarcode: true, marginBottom: 0); + + // ITF + rect = new Rect(100, 385, 300, 420); + page.WriteBarcode(rect, "12345678901234567890", BarcodeFormat.I2OF5, forceFitToRect: false, pureBarcode: false, marginBottom: 0); + + // PDF_417 + rect = new Rect(100, 430, 400, 435); + page.WriteBarcode(rect, "Hello World!", BarcodeFormat.PDF417, forceFitToRect: false, pureBarcode: true, marginBottom: 0); + + // CODABAR + rect = new Rect(100, 540, 400, 580); + page.WriteBarcode(rect, "12345678901234567890", BarcodeFormat.CODABAR, forceFitToRect: false, pureBarcode: true, marginBottom: 0); + + // DATA_MATRIX + rect = new Rect(100, 620, 140, 660); + page.WriteBarcode(rect, "01100000110419257000", BarcodeFormat.DM, forceFitToRect: false, pureBarcode: false, marginBottom: 0); + + doc.Save("barcode.pdf"); + + Console.WriteLine($"Barcodes written to 'barcode.pdf' in: {page.Rect}"); + doc.Close(); + + Console.WriteLine("--- Write to image file ----------"); + + // QR_CODE + Utils.WriteBarcode("QR_CODE.png", "Hello World!", BarcodeFormat.QR, width: 600, height: 600, forceFitToRect: true, pureBarcode: false, marginBottom: 0); + + // EAN_8 + Utils.WriteBarcode("EAN_8.png", "1234567", BarcodeFormat.EAN8, width: 300, height: 20, forceFitToRect: false, pureBarcode: false, marginBottom: 4); + + // EAN_13 + Utils.WriteBarcode("EAN_13.png", "123456789012", BarcodeFormat.EAN13, width: 300, height: 0, forceFitToRect: false, pureBarcode: false, marginBottom: 10); + + // UPC_A + Utils.WriteBarcode("UPC_A.png", "123456789012", BarcodeFormat.UPC_A, width: 300, height: 20, forceFitToRect: false, 
pureBarcode: false, marginBottom: 10); + + // CODE_39 + Utils.WriteBarcode("CODE_39.png", "Hello World!", BarcodeFormat.CODE39, width: 300, height: 70, forceFitToRect: false, pureBarcode: false, marginBottom: 20); + + // CODE_128 + Utils.WriteBarcode("CODE_128.png", "Hello World!", BarcodeFormat.CODE128, width: 300, height: 150, forceFitToRect: false, pureBarcode: false, marginBottom: 20); + + // ITF + Utils.WriteBarcode("ITF.png", "12345678901234567890", BarcodeFormat.I2OF5, width: 300, height: 120, forceFitToRect: false, pureBarcode: false, marginBottom: 20); + + // PDF_417 + Utils.WriteBarcode("PDF_417.png", "Hello World!", BarcodeFormat.PDF417, width: 300, height: 10, forceFitToRect: false, pureBarcode: false, marginBottom: 0); + + // CODABAR + Utils.WriteBarcode("CODABAR.png", "12345678901234567890", BarcodeFormat.CODABAR, width: 300, height: 150, forceFitToRect: false, pureBarcode: false, marginBottom: 20); + + // DATA_MATRIX + Utils.WriteBarcode("DATA_MATRIX.png", "01100000110419257000", BarcodeFormat.DM, width: 300, height: 300, forceFitToRect: false, pureBarcode: true, marginBottom: 1); + + Console.WriteLine("Barcodes written to image files in the current directory."); + } + + } +} diff --git a/Demo/Samples/Document/Program.Document.cs b/Demo/Samples/Document/Program.Document.cs new file mode 100644 index 00000000..0e23500a --- /dev/null +++ b/Demo/Samples/Document/Program.Document.cs @@ -0,0 +1,125 @@ +namespace Demo +{ + internal partial class Program + { + internal static void TestMoveFile() + { + string origfilename = @"../../../TestDocuments/Blank.pdf"; + + string filePath = @"testmove.pdf"; + + File.Copy(origfilename, filePath, true); + + Document d = new Document(filePath); + + Page page = d[0]; + + Point tl = new Point(100, 120); + Point br = new Point(300, 150); + + Rect rect = new Rect(tl, br); + + TextWriter pw = new TextWriter(page.TrimBox); + /* + Font font = new Font(fontName: "tiro"); + + List<(string, float)> ret = pw.FillTextbox(rect, 
"This is a test to overwrite the original file and move it", font, fontSize: 24); + */ + pw.WriteText(page); + + page.Dispose(); + + MemoryStream tmp = new MemoryStream(); + + d.Save(tmp, garbage: 3, deflateFonts: 1, deflate: 1); + + d.Close(); + + File.WriteAllBytes(filePath, tmp.ToArray()); + + tmp.Dispose(); + + File.Move(filePath, @"moved.pdf", true); + } + + internal static void TestMetadata() + { + Console.WriteLine("\n=== TestMetadata ====================="); + + string testFilePath = @"../../../TestDocuments/Annot.pdf"; + + Document doc = new Document(testFilePath); + + Dictionary metaDict = doc.MetaData; + + foreach (string key in metaDict.Keys) + { + Console.WriteLine(key + ": " + metaDict[key]); + } + + doc.Close(); + + Console.WriteLine("TestMetadata completed."); + } + + internal static void TestMorph() + { + Console.WriteLine("\n=== TestMorph ====================="); + + string testFilePath = @"../../../TestDocuments/Morph.pdf"; + + Document doc = new Document(testFilePath); + Page page = doc[0]; + Rect printrect = new Rect(180, 30, 650, 60); + int pagerot = page.Rotation; + TextWriter pw = new TextWriter(page.TrimBox); + string txt = "Origin 100.100"; + pw.Append(new Point(100, 100), txt, new Font("tiro"), fontSize: 24); + pw.WriteText(page); + + txt = "rotated 270 - 100.100"; + Matrix matrix = new IdentityMatrix(); + matrix.Prerotate(270); + Morph mo = new Morph(new Point(100, 100), matrix); + pw = new TextWriter(page.TrimBox); + pw.Append(new Point(100, 100), txt, new Font("tiro"), fontSize: 24); + pw.WriteText(page, morph:mo); + page.SetRotation(270); + + page.Dispose(); + doc.Save(@"morph.pdf"); + doc.Close(); + } + + internal static void TestUnicodeDocument() + { + Console.WriteLine("\n=== TestUnicodeDocument ====================="); + + string testFilePath = @"../../../TestDocuments/Σ╜áσÑ╜.pdf"; + + Document doc = new Document(testFilePath); + + doc.Save(@"Σ╜áσÑ╜_.pdf"); + doc.Close(); + + Console.WriteLine("TestUnicodeDocument completed."); + 
} + + internal static void TestMemoryLeak() + { + Console.WriteLine("\n=== TestMemoryLeak ======================="); + string testFilePath = Path.GetFullPath("../../../TestDocuments/Blank.pdf"); + + for (int i = 0; i < 100; i++) + { + Document doc = new Document(testFilePath); + Page page = doc.NewPage(); + page.Dispose(); + doc.Close(); + } + + Console.WriteLine("Memory leak test completed. No leaks should be detected."); + } + + } +} diff --git a/Demo/Samples/ImageFilters/Program.ImageFilters.cs b/Demo/Samples/ImageFilters/Program.ImageFilters.cs new file mode 100644 index 00000000..f51e3d7a --- /dev/null +++ b/Demo/Samples/ImageFilters/Program.ImageFilters.cs @@ -0,0 +1,86 @@ +namespace Demo +{ + internal partial class Program + { + internal static void TestImageFilter() + { + const string inputPath = @"../../../TestDocuments/Image/table.jpg"; + const string outputPath = @"output.png"; + + // Load the image file into SKBitmap + using (var bitmap = SKBitmap.Decode(inputPath)) + { + if (bitmap == null) + { + Console.WriteLine("Failed to load image."); + return; + } + + SKBitmap inputBitmap = bitmap.Copy(); + + // build the pipeline + var pipeline = new ImageFilterPipeline(); + + // clear any defaults if youΓÇÖre reusing the instance + pipeline.Clear(); + + // add filters one-by-one + pipeline.AddDeskew(minAngle: 0.5); // replaces any existing deskew step + pipeline.AddRemoveHorizontalLines(); // also replaces existing horizontal-removal step + pipeline.AddRemoveVerticalLines(); + pipeline.AddGrayscale(); + //pipeline.AddMedian(blockSize: 2, replaceExisting: true); + //pipeline.AddGamma(gamma: 1.2); // brighten slightly + //pipeline.AddContrast(contrast: 100); + //pipeline.AddFit(100); + //pipeline.AddDilation(); + //pipeline.AddScale(scaleFactor: 1.75, quality: SKFilterQuality.Medium); + pipeline.AddInvert(); + + // apply the pipeline (bitmap is modified in place) + pipeline.Apply(ref inputBitmap); + + using (var data = inputBitmap.Encode(SKEncodedImageFormat.Png, 
100)) // 100 = quality + { + using (var stream = File.OpenWrite(outputPath)) + { + data.SaveTo(stream); + } + } + + Console.WriteLine($"Loaded image: {bitmap.Width}x{bitmap.Height} pixels"); + } + } + + internal static void TestImageFilterOcr() + { + const string inputPath = @"../../../TestDocuments/Image/boxedpage.jpg"; + + using (Pixmap pxmp = new Pixmap(inputPath)) + { + // build the pipeline + var pipeline = new ImageFilterPipeline(); + + // clear any defaults if youΓÇÖre reusing the instance + pipeline.Clear(); + + // add filters one-by-one + //pipeline.AddDeskew(minAngle: 0.5); // replaces any existing deskew step + //pipeline.AddRemoveHorizontalLines(); // also replaces existing horizontal-removal step + //pipeline.AddRemoveVerticalLines(); + //pipeline.AddGrayscale(); + //pipeline.AddMedian(blockSize: 2, replaceExisting: true); + pipeline.AddGamma(gamma: 1.2); // brighten slightly + //pipeline.AddContrast(contrast: 100); + //pipeline.AddScaleFit(100); + //pipeline.AddDilation(); + pipeline.AddScale(scaleFactor: 1.75, quality: SKFilterQuality.High); + //pipeline.AddInvert(); + + string txt = pxmp.GetTextFromOcr(pipeline); + Console.WriteLine(txt); + } + } + + } +} diff --git a/Demo/Samples/Llm/Program.Llm.PdfMarkdownReader.Fixtures.cs b/Demo/Samples/Llm/Program.Llm.PdfMarkdownReader.Fixtures.cs new file mode 100644 index 00000000..2100de11 --- /dev/null +++ b/Demo/Samples/Llm/Program.Llm.PdfMarkdownReader.Fixtures.cs @@ -0,0 +1,49 @@ +namespace Demo +{ + /// + /// demos aligned with MuPDF.NET4LLM / repository reader tests. + /// + internal partial class Program + { + /// Empty in-memory PDF, save, then (one page → one Llama document). 
+ internal static void Test4LlmPdfMarkdownReaderEmptyPage(string[] args) + { + _ = args; + Console.WriteLine("\n=== Test4LlmPdfMarkdownReaderEmptyPage ======================="); + + string path = Path.Combine(AppContext.BaseDirectory, "llm_reader_empty_page.pdf"); + Document document = new Document(); + try + { + document.NewPage(); + document.Save(path); + } + finally + { + document.Close(); + } + + var reader = new PDFMarkdownReader(); + var documents = reader.LoadData(path); + Console.WriteLine($"Loaded {documents.Count} document(s)."); + } + + /// with a non-existent path → . + internal static void Test4LlmPdfMarkdownReaderMissingFile(string[] args) + { + _ = args; + Console.WriteLine("\n=== Test4LlmPdfMarkdownReaderMissingFile ======================="); + + var reader = new PDFMarkdownReader(); + try + { + reader.LoadData(Path.Combine(LlmRepositoryTestsDirectory(), "fake", "path", "nope.pdf")); + Console.WriteLine("Unexpected: LoadData should throw for missing file."); + } + catch (FileNotFoundException ex) + { + Console.WriteLine($"OK: FileNotFoundException — {ex.Message}"); + } + } + } +} diff --git a/Demo/Samples/Llm/Program.Llm.ToMarkdown.Fixtures.cs b/Demo/Samples/Llm/Program.Llm.ToMarkdown.Fixtures.cs new file mode 100644 index 00000000..c1a4dc2d --- /dev/null +++ b/Demo/Samples/Llm/Program.Llm.ToMarkdown.Fixtures.cs @@ -0,0 +1,211 @@ +namespace Demo +{ + /// + /// MuPDF.NET4LLM demos aligned with repository tests/ fixtures (golden markdown, OCR behavior). + /// PDFs live under repo tests/; samples skip if files are missing. 
+ /// + internal partial class Program + { + private static string LlmRepositoryRootFromAppBase() => + Path.GetFullPath(Path.Combine(AppContext.BaseDirectory, "..", "..", "..", "..")); + + private static string LlmRepositoryTestsDirectory() => + Path.Combine(LlmRepositoryRootFromAppBase(), "tests"); + + private static bool LlmOcrEnvironmentLikelyAvailable() => + !string.IsNullOrEmpty(Utils.TESSDATA_PREFIX); + + /// ToMarkdown with fixed flags vs tests/test_370_expected.md (fixture: tests/test_370.pdf). + internal static void Test4LlmToMarkdownCompareExpected370(string[] args) + { + _ = args; + Console.WriteLine("\n=== Test4LlmToMarkdownCompareExpected370 (MuPDF.NET4LLM) ======================="); + + string testsDir = LlmRepositoryTestsDirectory(); + string pdfPath = Path.Combine(testsDir, "test_370.pdf"); + string expectedPath = Path.Combine(testsDir, "test_370_expected.md"); + if (!File.Exists(pdfPath) || !File.Exists(expectedPath)) + { + Console.WriteLine($"Skip: need test_370.pdf and test_370_expected.md in: {testsDir}"); + return; + } + + string expected = File.ReadAllText(expectedPath, Encoding.UTF8); + Document document = new Document(pdfPath); + try + { + string actual = MuPDF4LLM.ToMarkdown( + document, + header: false, + footer: false, + writeImages: false, + embedImages: false, + imageFormat: "jpg", + showProgress: true, + forceText: true, + pageSeparators: true); + + string actualPath = Path.Combine(AppContext.BaseDirectory, "llm_fixture_370_actual.md"); + File.WriteAllText(actualPath, actual, Encoding.UTF8); + Console.WriteLine($"Wrote actual markdown: {actualPath}"); + + if (!string.Equals(actual, expected, StringComparison.Ordinal)) + { + Console.WriteLine("Mismatch vs tests/test_370_expected.md (first differences):"); + LlmPrintLineDiff(expected, actual, maxLines: 40); + } + else + { + Console.WriteLine("OK: actual matches test_370_expected.md"); + } + } + finally + { + document.Close(); + } + } + + /// Default ToMarkdown on FFFD fixture; U+FFFD vs 
TESSDATA_PREFIX (fixture: tests/test_ocr_loremipsum_FFFD.pdf). + internal static void Test4LlmToMarkdownOcrFixture1(string[] args) + { + _ = args; + Console.WriteLine("\n=== Test4LlmToMarkdownOcrFixture1 ======================="); + + string pdfPath = Path.Combine(LlmRepositoryTestsDirectory(), "test_ocr_loremipsum_FFFD.pdf"); + if (!File.Exists(pdfPath)) + { + Console.WriteLine($"Skip: missing {pdfPath}"); + return; + } + + Document doc = new Document(pdfPath); + string md; + try + { + md = MuPDF4LLM.ToMarkdown(doc); + } + finally + { + doc.Close(); + } + + File.WriteAllText(Path.Combine(AppContext.BaseDirectory, "llm_ocr_fixture_1.md"), md, Encoding.UTF8); + bool ocr = LlmOcrEnvironmentLikelyAvailable(); + Console.WriteLine($"TESSDATA_PREFIX set: {ocr}"); + bool hasReplacement = md.Contains(MuPDF.NET4LLM.Ocr.TesseractApi.ReplacementUnicode); + if (ocr && hasReplacement) + Console.WriteLine("Note: U+FFFD still present—check tessdata / language / PDF."); + else if (ocr && !hasReplacement) + Console.WriteLine("OK: no U+FFFD when tessdata is configured."); + else if (!ocr && hasReplacement) + Console.WriteLine("OK: U+FFFD present without tessdata."); + else + Console.WriteLine("Note: no U+FFFD without tessdata—compare llm_ocr_fixture_1.md."); + } + + /// ToMarkdown(..., useOcr: false) on FFFD fixture. 
+ internal static void Test4LlmToMarkdownOcrFixture2(string[] args) + { + _ = args; + Console.WriteLine("\n=== Test4LlmToMarkdownOcrFixture2 ======================="); + + string pdfPath = Path.Combine(LlmRepositoryTestsDirectory(), "test_ocr_loremipsum_FFFD.pdf"); + if (!File.Exists(pdfPath)) + { + Console.WriteLine($"Skip: missing {pdfPath}"); + return; + } + + Document doc = new Document(pdfPath); + string md; + try + { + md = MuPDF4LLM.ToMarkdown(doc, useOcr: false); + } + finally + { + doc.Close(); + } + + File.WriteAllText(Path.Combine(AppContext.BaseDirectory, "llm_ocr_fixture_2.md"), md, Encoding.UTF8); + bool hasReplacement = md.Contains(MuPDF.NET4LLM.Ocr.TesseractApi.ReplacementUnicode); + Console.WriteLine(hasReplacement + ? "OK: U+FFFD present with useOcr=false." + : "Note: no U+FFFD with OCR off—fixture-dependent."); + } + + /// SVG text fixture: compare default vs useOcr: false output size (fixture: tests/test_ocr_loremipsum_svg.pdf). + internal static void Test4LlmToMarkdownOcrFixture3(string[] args) + { + _ = args; + Console.WriteLine("\n=== Test4LlmToMarkdownOcrFixture3 ======================="); + + string pdfPath = Path.Combine(LlmRepositoryTestsDirectory(), "test_ocr_loremipsum_svg.pdf"); + if (!File.Exists(pdfPath)) + { + Console.WriteLine($"Skip: missing {pdfPath}"); + return; + } + + Document doc = new Document(pdfPath); + string md; + string mdNoOcr; + try + { + md = MuPDF4LLM.ToMarkdown(doc); + mdNoOcr = MuPDF4LLM.ToMarkdown(doc, useOcr: false); + } + finally + { + doc.Close(); + } + + string baseDir = AppContext.BaseDirectory; + File.WriteAllText(Path.Combine(baseDir, "llm_ocr_fixture_3.md"), md, Encoding.UTF8); + File.WriteAllText(Path.Combine(baseDir, "llm_ocr_fixture_3_no_ocr.md"), mdNoOcr, Encoding.UTF8); + + bool ocr = LlmOcrEnvironmentLikelyAvailable(); + if (ocr) + { + if (mdNoOcr.Length < md.Length) + Console.WriteLine($"OK: with tessdata, no-OCR shorter ({mdNoOcr.Length} < {md.Length})."); + else + Console.WriteLine($"Note: 
lengths OCR={md.Length}, no-OCR={mdNoOcr.Length} (environment-dependent)."); + } + else + { + Console.WriteLine(string.Equals(md, mdNoOcr, StringComparison.Ordinal) + ? "OK: without tessdata, OCR on/off often match." + : "Note: outputs differ; compare llm_ocr_fixture_3*.md."); + } + } + + private static void LlmPrintLineDiff(string expected, string actual, int maxLines) + { + string[] a = expected.Replace("\r\n", "\n").Split('\n'); + string[] b = actual.Replace("\r\n", "\n").Split('\n'); + int n = Math.Max(a.Length, b.Length); + int printed = 0; + for (int i = 0; i < n && printed < maxLines; i++) + { + string lineA = i < a.Length ? a[i] : ""; + string lineB = i < b.Length ? b[i] : ""; + if (lineA == lineB) + continue; + Console.WriteLine($" line {i + 1}:"); + Console.WriteLine($" expected: {LlmTruncateForConsole(lineA)}"); + Console.WriteLine($" actual: {LlmTruncateForConsole(lineB)}"); + printed++; + } + if (printed >= maxLines) + Console.WriteLine(" ... (truncated)"); + } + + private static string LlmTruncateForConsole(string s, int max = 200) + { + if (string.IsNullOrEmpty(s)) + return s; + return s.Length <= max ? 
s : s.Substring(0, max) + "…"; + } + } +} diff --git a/Demo/Samples/Llm/Program.Llm.cs b/Demo/Samples/Llm/Program.Llm.cs new file mode 100644 index 00000000..0353ba2f --- /dev/null +++ b/Demo/Samples/Llm/Program.Llm.cs @@ -0,0 +1,437 @@ +namespace Demo +{ + internal partial class Program + { + internal static void TestMarkdownReader() + { + Console.WriteLine("\n=== TestMarkdownReader ======================="); + + var reader = new PDFMarkdownReader(); + string testFilePath = Path.GetFullPath("../../../TestDocuments/columns.pdf"); + + var docs = reader.LoadData(testFilePath); + + foreach (var doc in docs) + { + Console.WriteLine(doc.Text); + } + } + + internal static void TestGetText() + { + Console.WriteLine("\n=== TestGetText ======================="); + + var reader = new PDFMarkdownReader(); + string testFilePath = Path.GetFullPath("../../../TestDocuments/columns.pdf"); + + Document doc = new Document(testFilePath); + + for (int i = 0; i < doc.PageCount; i++) + { + Page page = doc[i]; + + var text = Utils.GetText(page, option: "dict"); + + Console.WriteLine(text); + + page.Dispose(); + } + + doc.Close(); + } + + internal static void TestTable() + { + Console.WriteLine("\n=== TestTable ======================="); + + try + { + string testFilePath = Path.GetFullPath("../../../TestDocuments/err_table.pdf"); + + if (!File.Exists(testFilePath)) + { + Console.WriteLine($"Error: Test file not found: {testFilePath}"); + return; + } + + Console.WriteLine($"Loading PDF: {testFilePath}"); + Document doc = new Document(testFilePath); + Console.WriteLine($"Document loaded: {doc.PageCount} page(s)"); + + // Test on first page + Page page = doc[0]; + Console.WriteLine($"\nPage 0 - Rect: {page.Rect}"); + + // Test 1: Get tables with default strategy + Console.WriteLine("\n--- Test 1: Get tables with 'lines_strict' strategy ---"); + List
tables = Utils.GetTables( + page, + clip: page.Rect, + vertical_strategy: "lines_strict", + horizontal_strategy: "lines_strict"); + + Console.WriteLine($"Found {tables.Count} table(s) on page 0"); + + if (tables.Count > 0) + { + for (int i = 0; i < tables.Count; i++) + { + Table table = tables[i]; + Console.WriteLine($"\n Table {i + 1}:"); + Console.WriteLine($" Rows: {table.row_count}"); + Console.WriteLine($" Columns: {table.col_count}"); + if (table.bbox != null) + { + Console.WriteLine($" BBox: ({table.bbox.X0:F2}, {table.bbox.Y0:F2}, {table.bbox.X1:F2}, {table.bbox.Y1:F2})"); + } + + // Display header information + if (table.header != null) + { + Console.WriteLine($" Header:"); + Console.WriteLine($" External: {table.header.external}"); + if (table.header.names != null && table.header.names.Count > 0) + { + Console.WriteLine($" Column names: {string.Join(", ", table.header.names)}"); + } + } + + // Extract table data + Console.WriteLine($"\n Extracting table data..."); + List> tableData = table.Extract(); + if (tableData != null && tableData.Count > 0) + { + Console.WriteLine($" Extracted {tableData.Count} row(s) of data"); + // Show first few rows as preview + int previewRows = Math.Min(3, tableData.Count); + for (int row = 0; row < previewRows; row++) + { + var rowData = tableData[row]; + if (rowData != null) + { + Console.WriteLine($" Row {row + 1}: {string.Join(" | ", rowData.Take(5))}"); // Show first 5 columns + } + } + if (tableData.Count > previewRows) + { + Console.WriteLine($" ... 
and {tableData.Count - previewRows} more row(s)"); + } + } + + // Convert to markdown + Console.WriteLine($"\n Converting to Markdown..."); + try + { + string markdown = table.ToMarkdown(clean: false, fillEmpty: true); + if (!string.IsNullOrEmpty(markdown)) + { + Console.WriteLine($" Markdown length: {markdown.Length} characters"); + // Save markdown to file + string markdownFile = $"table_{i + 1}_page0.md"; + File.WriteAllText(markdownFile, markdown, Encoding.UTF8); + Console.WriteLine($" Markdown saved to: {markdownFile}"); + + // Show preview + int previewLength = Math.Min(200, markdown.Length); + Console.WriteLine($" Preview (first {previewLength} chars):"); + Console.WriteLine($" {markdown.Substring(0, previewLength)}..."); + } + } + catch (Exception ex) + { + Console.WriteLine($" Error converting to markdown: {ex.Message}"); + } + } + } + else + { + Console.WriteLine("No tables found. Trying with 'lines' strategy..."); + + // Test 2: Try with 'lines' strategy (less strict) + Console.WriteLine("\n--- Test 2: Get tables with 'lines' strategy ---"); + tables = Utils.GetTables( + page, + clip: page.Rect, + vertical_strategy: "lines", + horizontal_strategy: "lines"); + + Console.WriteLine($"Found {tables.Count} table(s) with 'lines' strategy"); + } + + // Test 3: Try with 'text' strategy + Console.WriteLine("\n--- Test 3: Get tables with 'text' strategy ---"); + List
textTables = Utils.GetTables( + page, + clip: page.Rect, + vertical_strategy: "text", + horizontal_strategy: "text"); + + Console.WriteLine($"Found {textTables.Count} table(s) with 'text' strategy"); + + // Test 4: Get tables from all pages + Console.WriteLine("\n--- Test 4: Get tables from all pages ---"); + int totalTables = 0; + for (int pageNum = 0; pageNum < doc.PageCount; pageNum++) + { + Page currentPage = doc[pageNum]; + List
pageTables = Utils.GetTables( + currentPage, + clip: currentPage.Rect, + vertical_strategy: "lines_strict", + horizontal_strategy: "lines_strict"); + + if (pageTables.Count > 0) + { + Console.WriteLine($" Page {pageNum}: {pageTables.Count} table(s)"); + totalTables += pageTables.Count; + } + currentPage.Dispose(); + } + Console.WriteLine($"Total tables found across all pages: {totalTables}"); + + page.Dispose(); + doc.Close(); + + Console.WriteLine("\n=== TestTable completed successfully ==="); + } + catch (Exception ex) + { + Console.WriteLine($"Error in TestTable: {ex.Message}"); + Console.WriteLine($"Stack trace: {ex.StackTrace}"); + throw; + } + } + + internal static void TestPyMuPdfRagToMarkdown() + { + Console.WriteLine("\n=== TestPyMuPdfRagToMarkdown ======================="); + + try + { + // Find a test PDF file + //string testFilePath = Path.GetFullPath("../../../TestDocuments/national-capitals.pdf"); + string testFilePath = Path.GetFullPath("../../../TestDocuments/Magazine.pdf"); + + Document doc = new Document(testFilePath); + Console.WriteLine($"Document loaded: {doc.PageCount} page(s)"); + Console.WriteLine($"Document name: {doc.Name}"); + + // Test 1: Basic ToMarkdown with default settings + Console.WriteLine("\n--- Test 1: Basic ToMarkdown (default settings) ---"); + try + { + List pages = new List(); + pages.Add(0); + string markdown = MuPdfRag.ToMarkdown( + doc, + pages: pages, // All pages + hdrInfo: null, // Auto-detect headers + writeImages: false, + embedImages: false, + ignoreImages: false, + ignoreGraphics: false, + detectBgColor: true, + imagePath: "", + imageFormat: "png", + imageSizeLimit: 0.05f, + filename: testFilePath, + forceText: true, + pageChunks: false, + pageSeparators: false, + margins: null, + dpi: 150, + pageWidth: 612, + pageHeight: null, + tableStrategy: "lines_strict", + graphicsLimit: null, + fontsizeLimit: 3.0f, + ignoreCode: false, + extractWords: false, + showProgress: false, + useGlyphs: false, + ignoreAlpha: false + 
); + + string markdownFile = "TestPyMuPdfRag_Output.md"; + File.WriteAllText(markdownFile, markdown, Encoding.UTF8); + Console.WriteLine($"Markdown output saved to: {markdownFile}"); + Console.WriteLine($"Markdown length: {markdown.Length} characters"); + if (markdown.Length > 0) + { + int previewLength = Math.Min(300, markdown.Length); + Console.WriteLine($"Preview (first {previewLength} chars):\n{markdown.Substring(0, previewLength)}..."); + } + } + catch (Exception ex) + { + Console.WriteLine($"Error in basic ToMarkdown: {ex.Message}"); + } + /* + // Test 2: ToMarkdown with IdentifyHeaders + Console.WriteLine("\n--- Test 2: ToMarkdown with IdentifyHeaders ---"); + try + { + var identifyHeaders = new IdentifyHeaders(doc, pages: null, bodyLimit: 12.0f, maxLevels: 6); + string markdown = MuPdfRag.ToMarkdown( + doc, + pages: new List { 0 }, // First page only + hdrInfo: identifyHeaders, + writeImages: false, + embedImages: false, + ignoreImages: false, + filename: testFilePath, + forceText: true, + showProgress: false + ); + + string markdownFile = "TestPyMuPdfRag_WithHeaders.md"; + File.WriteAllText(markdownFile, markdown, Encoding.UTF8); + Console.WriteLine($"Markdown with headers saved to: {markdownFile}"); + Console.WriteLine($"Markdown length: {markdown.Length} characters"); + } + catch (Exception ex) + { + Console.WriteLine($"Error in ToMarkdown with IdentifyHeaders: {ex.Message}"); + } + + // Test 3: ToMarkdown with TocHeaders + Console.WriteLine("\n--- Test 3: ToMarkdown with TocHeaders ---"); + try + { + var tocHeaders = new TocHeaders(doc); + string markdown = MuPdfRag.ToMarkdown( + doc, + pages: new List { 0 }, // First page only + hdrInfo: tocHeaders, + writeImages: false, + embedImages: false, + ignoreImages: false, + filename: testFilePath, + forceText: true, + showProgress: false + ); + + string markdownFile = "TestPyMuPdfRag_WithToc.md"; + File.WriteAllText(markdownFile, markdown, Encoding.UTF8); + Console.WriteLine($"Markdown with TOC headers saved 
to: {markdownFile}"); + Console.WriteLine($"Markdown length: {markdown.Length} characters"); + } + catch (Exception ex) + { + Console.WriteLine($"Error in ToMarkdown with TocHeaders: {ex.Message}"); + } + + // Test 4: ToMarkdown with page separators + Console.WriteLine("\n--- Test 4: ToMarkdown with page separators ---"); + try + { + string markdown = MuPdfRag.ToMarkdown( + doc, + pages: null, // All pages + hdrInfo: null, + writeImages: false, + embedImages: false, + ignoreImages: false, + filename: testFilePath, + forceText: true, + pageSeparators: true, // Add page separators + showProgress: false + ); + + string markdownFile = "TestPyMuPdfRag_WithSeparators.md"; + File.WriteAllText(markdownFile, markdown, Encoding.UTF8); + Console.WriteLine($"Markdown with page separators saved to: {markdownFile}"); + Console.WriteLine($"Markdown length: {markdown.Length} characters"); + } + catch (Exception ex) + { + Console.WriteLine($"Error in ToMarkdown with page separators: {ex.Message}"); + } + + // Test 5: ToMarkdown with progress bar + Console.WriteLine("\n--- Test 5: ToMarkdown with progress bar ---"); + try + { + string markdown = MuPdfRag.ToMarkdown( + doc, + pages: null, // All pages + hdrInfo: null, + writeImages: false, + embedImages: false, + ignoreImages: false, + filename: testFilePath, + forceText: true, + showProgress: true, // Show progress bar + pageSeparators: false + ); + + string markdownFile = "TestPyMuPdfRag_WithProgress.md"; + File.WriteAllText(markdownFile, markdown, Encoding.UTF8); + Console.WriteLine($"\nMarkdown with progress saved to: {markdownFile}"); + Console.WriteLine($"Markdown length: {markdown.Length} characters"); + } + catch (Exception ex) + { + Console.WriteLine($"Error in ToMarkdown with progress: {ex.Message}"); + } + */ + doc.Close(); + } + catch (Exception ex) + { + Console.WriteLine($"An unexpected error occurred during PyMuPdfRag test: {ex.Message}"); + Console.WriteLine($"Stack trace: {ex.StackTrace}"); + } + + 
Console.WriteLine("\n=== TestPyMuPdfRagToMarkdown Completed ======================="); + } + + internal static void TestLLM() + { + Console.WriteLine("\n=== TestLLM ======================="); + + try + { + // Display version information + Console.WriteLine($"MuPDF.NET4LLM Version: {MuPDF4LLM.Version}"); + var versionTuple = MuPDF4LLM.VersionTuple; + Console.WriteLine($"Version Tuple: ({versionTuple.major}, {versionTuple.minor}, {versionTuple.patch})"); + + // Test with a sample PDF file + string testFilePath = Path.GetFullPath("../../../TestDocuments/national-capitals.pdf"); + //string testFilePath = Path.GetFullPath("../../../TestDocuments/Magazine.pdf"); + + // Try to find a PDF with actual content if Blank.pdf doesn't work well + if (!File.Exists(testFilePath)) + { + testFilePath = Path.GetFullPath("../../../TestDocuments/Widget.pdf"); + } + + if (!File.Exists(testFilePath)) + { + Console.WriteLine($"Test PDF file not found. Skipping LLM test."); + return; + } + + Console.WriteLine($"\nTesting with PDF: {testFilePath}"); + + Document doc = new Document(testFilePath); + Console.WriteLine($"Document loaded: {doc.PageCount} page(s)"); + + string markdownStr = MuPDF4LLM.ToMarkdown(doc); + + doc.Close(); + + string markdownFile = "TestLLM.md"; + File.WriteAllText(markdownFile, markdownStr, Encoding.UTF8); + Console.WriteLine("\nLLM test completed successfully."); + } + catch (Exception ex) + { + Console.WriteLine($"Error in TestLLM: {ex.Message}"); + Console.WriteLine($"Stack trace: {ex.StackTrace}"); + } + } + + } +} diff --git a/Demo/Samples/PageContent/Program.PageContent.cs b/Demo/Samples/PageContent/Program.PageContent.cs new file mode 100644 index 00000000..89f60423 --- /dev/null +++ b/Demo/Samples/PageContent/Program.PageContent.cs @@ -0,0 +1,291 @@ +namespace Demo +{ + internal partial class Program + { + internal static void TestExtractTextWithLayout(string[] args) + { + Console.WriteLine("\n=== TestExtractTextWithLayout ====================="); + string 
testFilePath = Path.GetFullPath("../../../TestDocuments/columns.pdf"); + Document doc = new Document(testFilePath); + + FileStream wstream = File.Create("columns.txt"); + + for (int i = 0; i < 1/*doc.PageCount*/; i++) + { + Page page = doc[i]; + string textWithLayout = page.GetTextWithLayout(tolerance: 3); + if (!string.IsNullOrEmpty(textWithLayout)) + { + byte[] bytes = Encoding.UTF8.GetBytes(textWithLayout); + wstream.Write(bytes, 0, bytes.Length); + } + } + + wstream.Close(); + + doc.Close(); + + Console.WriteLine("Created columns.txt file"); + } + + internal static void TestWidget(string[] args) + { + Console.WriteLine("\n=== TestWidget ====================="); + + string testFilePath = Path.GetFullPath("../../../TestDocuments/Widget.pdf"); + Document doc = new Document(testFilePath); + for (int i = 0; i < 1; i++) + { + var page = doc[i]; + + List entries = page.GetXObjects(); + + Widget fWidget = page.FirstWidget; + while (fWidget != null) + { + Console.WriteLine($"Widget: {fWidget}"); + Console.WriteLine($"FieldName: {fWidget.FieldName}"); + Console.WriteLine($"FieldType: {fWidget.FieldType}"); + Console.WriteLine($"FieldValue: {fWidget.FieldValue}"); + Console.WriteLine($"FieldFlags: {fWidget.FieldFlags}"); + Console.WriteLine($"FieldLabel: {fWidget.FieldLabel}"); + Console.WriteLine($"TextFont: {fWidget.TextFont}"); + Console.WriteLine($"TextFontSize: {fWidget.TextFontSize}"); + Console.WriteLine($"TextColor: {string.Join(",", fWidget.TextColor)}"); + fWidget = (Widget)fWidget.Next; + } + + foreach (var widget in page.GetWidgets()) + { + Console.WriteLine($"Widget: {widget}"); + Console.WriteLine($"FieldName: {widget.FieldName}"); + Console.WriteLine($"FieldType: {widget.FieldType}"); + Console.WriteLine($"FieldValue: {widget.FieldValue}"); + Console.WriteLine($"FieldFlags: {widget.FieldFlags}"); + Console.WriteLine($"FieldLabel: {widget.FieldLabel}"); + Console.WriteLine($"TextFont: {widget.TextFont}"); + Console.WriteLine($"TextFontSize: 
{widget.TextFontSize}"); + Console.WriteLine($"TextColor: {string.Join(",", widget.TextColor)}"); + + } + } + + doc.Close(); + Console.WriteLine("Widget test completed."); + } + + internal static void TestColor(string[] args) + { + Console.WriteLine("\n=== TestColor ====================="); + + string testFilePath = Path.GetFullPath("../../../TestDocuments/Color.pdf"); + Document doc = new Document(testFilePath); + List images = doc.GetPageImages(0); + Console.WriteLine($"CaName: {images[0].CsName}"); + doc.Recolor(0, 4); + images = doc.GetPageImages(0); + Console.WriteLine($"CaName: {images[0].AltCsName}"); + doc.Save("ReColor.pdf"); + doc.Close(); + + Console.WriteLine("Color test completed."); + } + + internal static void TestCMYKRecolor(string[] args) + { + Console.WriteLine("\n=== TestCMYKRecolor ====================="); + + string testFilePath = Path.GetFullPath("../../../TestDocuments/CMYK_Recolor.pdf"); + Document doc = new Document(testFilePath); + //List images = doc.GetPageImages(0); + //Console.WriteLine($"CaName: {images[0].CsName}"); + doc.Recolor(0, "CMYK"); + //images = doc.GetPageImages(0); + //Console.WriteLine($"CaName: {images[0].AltCsName}"); + doc.Save("CMYKRecolor.pdf"); + doc.Close(); + + Console.WriteLine("CMYK Recolor test completed."); + } + + internal static void TestSVGRecolor(string[] args) + { + Console.WriteLine("\n=== TestSVGRecolor ====================="); + + string testFilePath = Path.GetFullPath("../../../TestDocuments/SvgTest.pdf"); + Document doc = new Document(testFilePath); + doc.Recolor(0, "RGB"); + doc.Save("SVGRecolor.pdf"); + doc.Close(); + + Console.WriteLine("SVG Recolor test completed."); + } + + internal static void TestReplaceImage(string[] args) + { + Console.WriteLine("\n=== TestReplaceImage ====================="); + + string testFilePath = Path.GetFullPath("../../../TestDocuments/Color.pdf"); + Document doc = new Document(testFilePath); + Page page = doc[0]; + List images = page.GetImages(true); + List imgs = 
page.GetImageRects(images[0].Xref); + + List infos = page.GetImageInfo(xrefs: true); + + page.ReplaceImage(images[0].Xref, "../../../TestDocuments/Image/_apple.png"); + page.ReplaceImage(images[0].Xref, "../../../TestDocuments/Image/_bb-logo.png"); + + infos = page.GetImageInfo(xrefs: true); + //page.DeleteImage(images[0].Xref); + + //int newXref = page.InsertImage(imgs[0].Rect, "../../../TestDocuments/Sample.png"); + + //images = page.GetImages(true); + //imgs = page.GetImageRects(images[0].Xref); + + //page.ReplaceImage(infos[0].Xref, "../../../TestDocuments/Sample.png"); + //page.DeleteImage(images[0].Xref); + + //page.InsertImage(imgs[0].Rect, "../../../TestDocuments/Sample.jpg"); + + doc.Save("ReplaceImage.pdf"); + doc.Close(); + + Console.WriteLine("Image replacement test completed."); + } + + internal static void TestInsertImage(string[] args) + { + Console.WriteLine("\n=== TestInsertImage ====================="); + + string testFilePath = Path.GetFullPath("../../../TestDocuments/Image/test.pdf"); + Document doc = new Document(testFilePath); + Page page = doc[0]; + + var pixmap1 = new Pixmap("../../../TestDocuments/Image/_apple.png"); + //var pixmap1 = new Pixmap("../../../TestDocuments/Image/30mb.jpg"); + var pixmap2 = new Pixmap("../../../TestDocuments/Image/_bb-logo.png"); + var imageRect1 = new Rect(0, 0, 100, 100); + var imageRect2 = new Rect(100, 100, 200, 200); + var imageRect3 = new Rect(100, 200, 200, 300); + var imageRect4 = new Rect(100, 300, 200, 400); + var imageRect5 = new Rect(100, 400, 200, 500); + var imageRect6 = new Rect(100, 500, 200, 600); + + var img_xref = page.InsertImage(imageRect1, pixmap: pixmap1); + Console.WriteLine(img_xref); + + //img_xref = page.InsertImage(imageRect2, "../../../TestDocuments/Image/_apple.png"); + img_xref = page.InsertImage(imageRect2, pixmap: pixmap1); + Console.WriteLine(img_xref); + img_xref = page.InsertImage(imageRect3, pixmap: pixmap2); + Console.WriteLine(img_xref); + img_xref = 
page.InsertImage(imageRect4, "../../../TestDocuments/Image/_bb-logo.png"); + Console.WriteLine(img_xref); + page.InsertImage(imageRect5, xref: img_xref); + Console.WriteLine(img_xref); + page.InsertImage(imageRect6, xref: img_xref); + + doc.Save("TestInsertImage.pdf"); + doc.Close(); + + Console.WriteLine("Image insertion test completed."); + } + + internal static void TestGetImageInfo(string[] args) + { + Console.WriteLine("\n=== TestGetImageInfo ====================="); + + string testFilePath = Path.GetFullPath("../../../TestDocuments/Image/TestInsertImage.pdf"); + Document doc = new Document(testFilePath); + Page page = doc[0]; + + List infos = page.GetImageInfo(xrefs: true); + + doc.Close(); + + Console.WriteLine("Image info test completed."); + } + + internal static void TestGetTextPageOcr(string[] args) + { + Console.WriteLine("\n=== TestGetTextPageOcr ====================="); + + string testFilePath = Path.GetFullPath(@"../../../TestDocuments/Ocr.pdf"); + Document doc = new Document(testFilePath); + Page page = doc[0]; + + page.RemoveRotation(); + Pixmap pixmap = page.GetPixmap(); + + List blocks = page.GetText("dict", flags: (int)TextFlags.TEXT_PRESERVE_IMAGES)?.Blocks; + foreach (Block block in blocks) + { + Console.WriteLine(block.Image.Length); + } + + // build the pipeline + var pipeline = new ImageFilterPipeline(); + pipeline.Clear(); + //pipeline.AddDeskew(minAngle: 0.5); // replaces any existing deskew step + //pipeline.AddRemoveHorizontalLines(); // also replaces existing horizontal-removal step + //pipeline.AddRemoveVerticalLines(); + //pipeline.AddGrayscale(); + //pipeline.AddMedian(blockSize: 2, replaceExisting: true); + pipeline.AddGamma(gamma: 1.2); // brighten slightly + //pipeline.AddScaleFit(100); + pipeline.AddScale(scaleFactor: 3f, quality: SKFilterQuality.High); + //pipeline.AddContrast(contrast: 100); + //pipeline.AddDilation(); + //pipeline.AddInvert(); + + TextPage tp = page.GetTextPageOcr((int)TextFlags.TEXT_PRESERVE_SPANS, full: 
true, imageFilters: pipeline); + string txt = tp.ExtractText(); + Console.WriteLine(txt); + + doc.Close(); + + Console.WriteLine("OCR text extraction test completed."); + } + + internal static void TestCreateImagePage(string[] args) + { + Console.WriteLine("\n=== TestCreateImagePage ====================="); + + Pixmap pxmp = new Pixmap("../../../TestDocuments/Image/_bb-logo.png"); + + Document doc = new Document(); + Page page = doc.NewPage(width:pxmp.W, height:pxmp.H); + + page.InsertImage(page.Rect, pixmap: pxmp); + + pxmp.Dispose(); + + doc.Save("_bb-logo.pdf", pretty: 1); + doc.Close(); + + Console.WriteLine("Image page creation test completed."); + } + + internal static void TestJoinPdfPages(string[] args) + { + Console.WriteLine("\n=== TestJoinPdfPages ====================="); + + string testFilePath1 = Path.GetFullPath(@"../../../TestDocuments/Widget.pdf"); + Document doc1 = new Document(testFilePath1); + string testFilePath2 = Path.GetFullPath(@"../../../TestDocuments/Color.pdf"); + Document doc2 = new Document(testFilePath2); + + doc1.InsertPdf(doc2, 0, 0, 2); + + doc1.Save("Joined.pdf", pretty: 1); + + doc2.Close(); + doc1.Close(); + + Console.WriteLine("PDF pages joined successfully into 'Joined.pdf'."); + } + + } +} diff --git a/Demo/Samples/Regression/Program.Regression.cs b/Demo/Samples/Regression/Program.Regression.cs new file mode 100644 index 00000000..e9b10b8b --- /dev/null +++ b/Demo/Samples/Regression/Program.Regression.cs @@ -0,0 +1,168 @@ +namespace Demo +{ + internal partial class Program + { + internal static void TestIssue234() + { + Console.WriteLine("\n=== TestIssue234 ======================="); + + var pix = new Pixmap("../../../TestDocuments/Image/boxedpage.jpg"); // 629x1000 image + var scaled = new Pixmap(pix, 943, 1500, null); // scale up + byte[] jpeg = scaled.ToBytes("jpg", 65); + + using var doc = new Document(); + Page page = doc.NewPage(0, 943, 1500); + page.InsertImage(page.Rect, stream: jpeg); + page.Dispose(); + 
doc.Save("issue_234.pdf"); + doc.Close(); + } + + internal static void TestRecompressJBIG2() + { + Console.WriteLine("\n=== TestJBIG2 ======================="); + + string testFilePath = Path.GetFullPath("../../../TestDocuments/Jbig2.pdf"); + + Document doc = new Document(testFilePath); + + PdfImageRewriterOptions opts = new PdfImageRewriterOptions(); + + opts.bitonal_image_recompress_method = mupdf.mupdf.FZ_RECOMPRESS_FAX; + opts.recompress_when = mupdf.mupdf.FZ_RECOMPRESS_WHEN_ALWAYS; + + doc.RewriteImage(options: opts); + + doc.Save(@"e:\TestRecompressJBIG2.pdf"); + doc.Close(); + } + + internal static void TestIssue1880() + { + Console.WriteLine("\n=== TestIssue1880 ======================="); + + string testFilePath = Path.GetFullPath(@"../../../TestDocuments/issue_1880.pdf"); + + Document doc = new Document(testFilePath); + + for (int i = 0; i < doc.PageCount; i++) + { + Page page = doc[i]; + + List barcodes = page.ReadBarcodes(barcodeFormat: BarcodeFormat.DM, pureBarcode:true); + foreach (Barcode barcode in barcodes) + { + BarcodePoint[] points = barcode.ResultPoints; + Console.WriteLine($"Page {i++} - Type: {barcode.BarcodeFormat} - Value: {barcode.Text} - Rect: [{points[0]},{points[1]}]"); + } + + page.Dispose(); + } + + doc.Close(); + } + + internal static void TestIssue213() + { + Console.WriteLine("\n=== TestIssue213 ======================="); + + string origfilename = @"../../../TestDocuments/issue_213.pdf"; + string outfilename = @"../../../TestDocuments/Blank.pdf"; + float newWidth = 0.5f; + + Document inputDoc = new Document(origfilename); + Document outputDoc = new Document(outfilename); + + if (inputDoc.PageCount != outputDoc.PageCount) + { + return; + } + + for (int pagNum = 0; pagNum < inputDoc.PageCount; pagNum++) + { + Page page = inputDoc.LoadPage(pagNum); + + Pixmap pxmp = page.GetPixmap(); + pxmp.Save(@"output.png"); + pxmp.Dispose(); + + Page outPage = outputDoc.LoadPage(pagNum); + List paths = page.GetDrawings(extended: false); + int 
totalPaths = paths.Count; + + int i = 0; + foreach (PathInfo pathInfo in paths) + { + Shape shape = outPage.NewShape(); + foreach (Item item in pathInfo.Items) + { + if (item != null) + { + if (item.Type == "l") + { + shape.DrawLine(item.P1, item.LastPoint); + //writer.Write($"{i:000}\\] line: {item.Type} >>> {item.P1}, {item.LastPoint}\\n"); + } + else if (item.Type == "re") + { + shape.DrawRect(item.Rect, item.Orientation); + //writer.Write($"{i:000}\\] rect: {item.Type} >>> {item.Rect}, {item.Orientation}\\n"); + } + else if (item.Type == "qu") + { + shape.DrawQuad(item.Quad); + //writer.Write($"{i:000}\\] quad: {item.Type} >>> {item.Quad}\\n"); + } + else if (item.Type == "c") + { + shape.DrawBezier(item.P1, item.P2, item.P3, item.LastPoint); + //writer.Write($"{i:000}\\] curve: {item.Type} >>> {item.P1}, {item.P2}, {item.P3}, {item.LastPoint}\\n"); + } + else + { + throw new Exception("unhandled drawing. Aborting..."); + } + } + } + + //pathInfo.Items.get + float newLineWidth = pathInfo.Width; + if (pathInfo.Width <= newWidth) + { + newLineWidth = newWidth; + } + + int lineCap = 0; + if (pathInfo.LineCap != null && pathInfo.LineCap.Count > 0) + lineCap = (int)pathInfo.LineCap[0]; + shape.Finish( + fill: pathInfo.Fill, + color: pathInfo.Color, //this.\_m_DEFAULT_COLOR, + evenOdd: pathInfo.EvenOdd, + closePath: pathInfo.ClosePath, + lineJoin: (int)pathInfo.LineJoin, + lineCap: lineCap, + width: newLineWidth, + strokeOpacity: pathInfo.StrokeOpacity, + fillOpacity: pathInfo.FillOpacity, + dashes: pathInfo.Dashes + ); + + // file_export.write(f'Path {i:03}\] width: {lwidth}, dashes: {path\["dashes"\]}, closePath: {path\["closePath"\]}\\n') + //writer.Write($"Path {i:000}\\] with: {newLineWidth}, dashes: {pathInfo.Dashes}, closePath: {pathInfo.ClosePath}\\n"); + + i++; + shape.Commit(); + } + } + + inputDoc.Close(); + + outputDoc.Save(@"output.pdf"); + outputDoc.Close(); + + //writer.Close(); + } + + } +} diff --git a/Demo/Samples/TextDrawing/Program.TextDrawing.cs 
b/Demo/Samples/TextDrawing/Program.TextDrawing.cs new file mode 100644 index 00000000..d927a168 --- /dev/null +++ b/Demo/Samples/TextDrawing/Program.TextDrawing.cs @@ -0,0 +1,366 @@ +namespace Demo +{ + internal partial class Program + { + internal static void CreateAnnotDocument() + { + Console.WriteLine("\n=== CreateAnnotDocument ======================="); + Rect r = Constants.r; // use the rectangle defined in Constants.cs + + Document doc = new Document(); + Page page = doc.NewPage(); + + page.SetRotation(0); // no rotation + + TextWriter pw = new TextWriter(page.TrimBox); + string txt = "Origin 100.100"; + pw.Append(new Point(100, 500), txt, new Font("tiro"), fontSize: 24); + pw.WriteText(page, new float[]{0,0.4f,1}, oc: 0); + + + + Annot annot = page.AddRectAnnot(r); // 'Square' + annot.SetBorder(width: 1f, dashes: new int[] { 1, 2 }); + annot.SetColors(stroke: Constants.blue, fill: Constants.gold); + annot.Update(opacity: 0.5f); + + doc.Save(@"CreateAnnotDocument.pdf"); + + doc.Close(); + } + + internal static void TestDrawShape() + { + string origfilename = @"../../../TestDocuments/NewAnnots.pdf"; + string outfilename = @"../../../TestDocuments/Blank.pdf"; + float newWidth = 0.5f; + + Document inputDoc = new Document(origfilename); + Document outputDoc = new Document(outfilename); + + //string filePath = @"D:\\Vectorlab\\Jobs\\2025\\PACE\\pdf_fix\\assets\\exported_paths_net.txt"; + //StreamWriter writer = new StreamWriter(filePath); + + if (inputDoc.PageCount != outputDoc.PageCount) + { + return; + } + + for (int pagNum = 0; pagNum < inputDoc.PageCount; pagNum++) + { + Page page = inputDoc.LoadPage(pagNum); + Page outPage = outputDoc.LoadPage(pagNum); + List paths = page.GetDrawings(extended: false); + int totalPaths = paths.Count; + + int i = 0; + foreach (PathInfo pathInfo in paths) + { + Shape shape = outPage.NewShape(); + foreach (Item item in pathInfo.Items) + { + if (item != null) + { + if (item.Type == "l") + { + shape.DrawLine(item.P1, 
item.LastPoint); + //writer.Write($"{i:000}\\] line: {item.Type} >>> {item.P1}, {item.LastPoint}\\n"); + } + else if (item.Type == "re") + { + shape.DrawRect(item.Rect, item.Orientation); + //writer.Write($"{i:000}\\] rect: {item.Type} >>> {item.Rect}, {item.Orientation}\\n"); + } + else if (item.Type == "qu") + { + shape.DrawQuad(item.Quad); + //writer.Write($"{i:000}\\] quad: {item.Type} >>> {item.Quad}\\n"); + } + else if (item.Type == "c") + { + shape.DrawBezier(item.P1, item.P2, item.P3, item.LastPoint); + //writer.Write($"{i:000}\\] curve: {item.Type} >>> {item.P1}, {item.P2}, {item.P3}, {item.LastPoint}\\n"); + } + else + { + throw new Exception("unhandled drawing. Aborting..."); + } + } + } + + //pathInfo.Items.get + float newLineWidth = pathInfo.Width; + if (pathInfo.Width <= newWidth) + { + newLineWidth = newWidth; + } + + int lineCap = 0; + if (pathInfo.LineCap != null && pathInfo.LineCap.Count > 0) + lineCap = (int)pathInfo.LineCap[0]; + shape.Finish( + fill: pathInfo.Fill, + color: pathInfo.Color, //this.\_m_DEFAULT_COLOR, + evenOdd: pathInfo.EvenOdd, + closePath: pathInfo.ClosePath, + lineJoin: (int)pathInfo.LineJoin, + lineCap: lineCap, + width: newLineWidth, + strokeOpacity: pathInfo.StrokeOpacity, + fillOpacity: pathInfo.FillOpacity, + dashes: pathInfo.Dashes + ); + + // file_export.write(f'Path {i:03}\] width: {lwidth}, dashes: {path\["dashes"\]}, closePath: {path\["closePath"\]}\\n') + //writer.Write($"Path {i:000}\\] with: {newLineWidth}, dashes: {pathInfo.Dashes}, closePath: {pathInfo.ClosePath}\\n"); + + i++; + shape.Commit(); + } + } + + inputDoc.Close(); + + outputDoc.Save(@"TestDrawShape.pdf"); + outputDoc.Close(); + + //writer.Close(); + } + + internal static void DrawLine(Page page, float startX, float startY, float endX, float endY, Color lineColor = null, float lineWidth = 1, bool dashed = false) + { + Console.WriteLine("\n=== DrawLine ======================="); + + if (lineColor == null) + { + lineColor = new Color(); // Default to 
black + lineColor.Stroke = new float[] { 0, 0, 0 }; // RGB black + } + Shape img = page.NewShape(); + Point startPoint = new Point(startX, startY); + Point endPoint = new Point(endX, endY); + + String dashString = ""; + if (dashed == true) + { + dashString = "[2] 0"; // Example dash pattern + } + + img.DrawLine(startPoint, endPoint); + img.Finish(width: lineWidth, color: lineColor.Stroke, dashes: dashString); + img.Commit(); + + Console.WriteLine($"Line drawn from ({startX}, {startY}) to ({endX}, {endY}) with color {lineColor.Stroke} and width {lineWidth}."); + } + + internal static void TestDrawLine() + { + Console.WriteLine("\n=== TestDrawLine ======================="); + + Document doc = new Document(); + + Page page = doc.NewPage(); + + string fontDir = Environment.GetFolderPath(Environment.SpecialFolder.Fonts); + + page.DrawLine(new Point(45, 50), new Point(80, 50), width: 0.5f, dashes: "[5] 0"); + page.DrawLine(new Point(90, 50), new Point(150, 50), width: 0.5f, dashes: "[5] 0"); + page.DrawLine(new Point(45, 80), new Point(180, 80), width: 0.5f, dashes: "[5] 0"); + page.DrawLine(new Point(45, 100), new Point(180, 100), width: 0.5f, dashes: "[5] 0"); + + //DrawLine(page, 45, 50, 80, 50, lineWidth: 0.5f, dashed: true); + //DrawLine(page, 90, 60, 150, 60, lineWidth: 0.5f, dashed: true); + //DrawLine(page, 45, 80, 180, 80, lineWidth: 0.5f, dashed: true); + //DrawLine(page, 45, 100, 180, 100, lineWidth: 0.5f, dashed: true); + + doc.Save(@"TestDrawLine.pdf"); + + page.Dispose(); + doc.Close(); + + Console.WriteLine("Write to TestDrawLine.pdf"); + } + + internal static void TestTextFont(string[] args) + { + Console.WriteLine("\n=== TestTextFont ======================="); + //for (int i = 0; i < 100; i++) + { + Document doc = new Document(); + + Page page0 = doc.NewPage(); + Page page1 = doc.NewPage(pno: -1, width: 595, height: 842); + + string fontDir = Environment.GetFolderPath(Environment.SpecialFolder.Fonts); + + float[] blue = new float[] { 0.0f, 0.0f, 1.0f }; 
+ float[] red = new float[] { 1.0f, 0.0f, 0.0f }; + + Rect rect1 = new Rect(100, 100, 510, 200); + Rect rect2 = new Rect(100, 250, 300, 400); + + MuPDF.NET.Font font1 = new MuPDF.NET.Font("asdfasdf"); + //MuPDF.NET.Font font1 = new MuPDF.NET.Font("arial", fontDir+"\\arial_0.ttf"); + MuPDF.NET.Font font2 = new MuPDF.NET.Font("times", fontDir + "\\times.ttf"); + + string text1 = "This is a test of the FillTextbox method with Arial font."; + string text2 = "This is another test with Times New Roman font."; + + MuPDF.NET.TextWriter tw1 = new MuPDF.NET.TextWriter(page0.Rect); + tw1.FillTextbox(rect: rect1, text: text1, font: font1, fontSize:20); + font1.Dispose(); + tw1.WriteText(page0); + + MuPDF.NET.TextWriter tw2 = new MuPDF.NET.TextWriter(page0.Rect, color: red); + tw2.FillTextbox(rect: rect2, text: text2, font: font2, fontSize: 10, align: (int)TextAlign.TEXT_ALIGN_LEFT); + font2.Dispose(); + tw2.WriteText(page0); + + doc.Save(@"TestTextFont.pdf"); + + page0.Dispose(); + doc.Close(); + + Console.WriteLine("Write to TestTextFont.pdf"); + } + + } + + internal static void TestInsertHtmlbox() + { + Console.WriteLine("\n=== TestInsertHtmlbox ======================="); + + Rect rect = new Rect(100, 100, 550, 2250); + Document doc = new Document(); + Page page = doc.NewPage(); + + string htmlString = "

生产准备:

1. 每日生产进行维护保养,请参照并填写Philips 自动螺丝起点检表《WI-Screw assembly-Makita DF010&Kilews

SKD-B512L-F01》

2 .扭力计UNIT选择‘lbf.in’,‘P-P’模式,每四小时检查一次,每次检查5组数据,只有合格才可以生产;并填写

力扭矩记录表,表单号 : F-EN-34 。

1.电动起子力矩: 5±1 in-lbs,电动螺丝起编号:5.0。

2.电动起子力矩:10±1 in-lbs,电动螺丝起编号:10.0。

3.电动起子力矩:12±1 in-lbs,电动螺丝起编号:12.0。

Colten - line break

生产准备:

1. 每日生产进行维护保养,请参照并填写Philips 自动螺丝起点检表《WI-Screw assembly-Makita DF010&Kilews

SKD-B512L-F01》

2 .扭力计UNIT选择‘lbf.in’,‘P-P’模式,每四小时检查一次,每次检查5组数据,只有合格才可以生产;并填写

力扭矩记录表,表单号 : F-EN-34 。

1.电动起子力矩: 5±1 in-lbs,电动螺丝起编号:5.0。

2.电动起子力矩:10±1 in-lbs,电动螺丝起编号:10.0。

3.电动起子力矩:12±1 in-lbs,电动螺丝起编号:12.0。

Colten - line break

生产准备:

1. 每日生产进行维护保养,请参照并填写Philips 自动螺丝起点检表《WI-Screw assembly-Makita DF010&Kilews

SKD-B512L-F01》

2 .扭力计UNIT选择‘lbf.in’,‘P-P’模式,每四小时检查一次,每次检查5组数据,只有合格才可以生产;并填写

力扭矩记录表,表单号 : F-EN-34 。

1.电动起子力矩: 5±1 in-lbs,电动螺丝起编号:5.0。

2.电动起子力矩:10±1 in-lbs,电动螺丝起编号:10.0。

3.电动起子力矩:12±1 in-lbs,电动螺丝起编号:12.0。

Colten - line break

生产准备:

1. 每日生产进行维护保养,请参照并填写Philips 自动螺丝起点检表《WI-Screw assembly-Makita DF010&Kilews

SKD-B512L-F01》

2 .扭力计UNIT选择‘lbf.in’,‘P-P’模式,每四小时检查一次,每次检查5组数据,只有合格才可以生产;并填写

力扭矩记录表,表单号 : F-EN-34 。

1.电动起子力矩: 5±1 in-lbs,电动螺丝起编号:5.0。

2.电动起子力矩:10±1 in-lbs,电动螺丝起编号:10.0。

3.电动起子力矩:12±1 in-lbs,电动螺丝起编号:12.0。

"; + (float s, float scale) = page.InsertHtmlBox(rect, htmlString, scaleLow: 0f); + doc.Save(@"TestInsertHtmlbox.pdf"); + + page.Dispose(); + doc.Close(); + + Console.WriteLine($"Inserted HTML box with scale: {scale} and size: {s}"); + } + + internal static void TestLineAnnot() + { + Console.WriteLine("\n=== TestLineAnnot ======================="); + Document newDoc = new Document(); + Page newPage = newDoc.NewPage(); + + newPage.AddLineAnnot(new Point(100, 100), new Point(300, 300)); + + newDoc.Save(@"TestLineAnnot1.pdf"); + newDoc.Close(); + + Document doc = new Document(@"TestLineAnnot1.pdf"); // open a document + List annotationsToUpdate = new List(); + Page page = doc[0]; + // Fix: Correctly handle the IEnumerable returned by GetAnnots() + IEnumerable annots = page.GetAnnots(); + foreach (Annot annot in annots) + { + Console.WriteLine("Annotation on page width before modified: " + annot.Border.Width); + annot.SetBorder(width: 8); + annot.Update(); + Console.WriteLine("Annotation on page width after modified: " + annot.Border.Width); + } + annotationsToUpdate.Clear(); + doc.Save(@"TestLineAnnot2.pdf"); // Save the modified document + doc.Close(); // Close the document + } + + internal static void TestHelloWorldToNewDocument(string[] args) + { + Console.WriteLine("\n=== TestHelloWorldToNewDocument ======================="); + Document doc = new Document(); + Page page = doc.NewPage(); + + //{ "helv", "Helvetica" }, + //{ "heit", "Helvetica-Oblique" }, + //{ "hebo", "Helvetica-Bold" }, + //{ "hebi", "Helvetica-BoldOblique" }, + //{ "cour", "Courier" }, + //{ "cobo", "Courier-Bold" }, + //{ "cobi", "Courier-BoldOblique" }, + //{ "tiro", "Times-Roman" }, + //{ "tibo", "Times-Bold" }, + //{ "tiit", "Times-Italic" }, + //{ "tibi", "Times-BoldItalic" }, + //{ "symb", "Symbol" }, + //{ "zadb", "ZapfDingbats" } + MuPDF.NET.TextWriter writer = new MuPDF.NET.TextWriter(page.Rect); + var ret = writer.FillTextbox(page.Rect, "Hello World!", new MuPDF.NET.Font(fontName: 
"helv"), rtl: true); + writer.WriteText(page); + doc.Save("text.pdf", pretty: 1); + doc.Close(); + + Console.WriteLine($"Text written to 'text.pdf' in: {page.Rect}"); + } + + internal static void TestHelloWorldToExistingDocument(string[] args) + { + Console.WriteLine("\n=== TestHelloWorldToExistingDocument ======================="); + string testFilePath = Path.GetFullPath("../../../TestDocuments/Blank.pdf"); + Document doc = new Document(testFilePath); + + Page page = doc[0]; + + Rect rect = new Rect(100, 100, 510, 210); + page.DrawRect(rect); + + MuPDF.NET.TextWriter writer = new MuPDF.NET.TextWriter(page.Rect); + //Font font = new Font("kenpixel", "../../../kenpixel.ttf", isBold: 1); + Font font = new Font("cobo", isBold: 0); + var ret = writer.FillTextbox(page.Rect, "123456789012345678901234567890Peter Test- this is a string that is too long to fit into the TextBox", font, rtl: false); + writer.WriteText(page); + + doc.Save("text1.pdf", pretty: 1); + + doc.Close(); + + Console.WriteLine($"Text written to 'text1.pdf' in: {page.Rect}"); + } + + internal static void TestFreeTextAnnot(string[] args) + { + Console.WriteLine("\n=== TestFreeTextAnnot ====================="); + + Rect r = new Rect(72, 72, 220, 100); + string t1 = "t├¬xt ├╝s├¿s L├ñti├▒ char├ƒ,\nEUR: Γé¼, mu: ┬╡, super scripts: ┬▓┬│!"; + Rect rect = new Rect(100,100,200,200); + float[] red = new float[] { 1, 0, 0 }; + float[] blue = new float[] { 0, 0, 1 }; + float[] gold = new float[] { 1, 1, 0 }; + float[] green = new float[] { 0, 1, 0 }; + float[] white = new float[] { 1, 1, 1 }; + + Document doc = new Document(); + Page page = doc.NewPage(); + + Annot annot = page.AddFreeTextAnnot( + rect, + t1, + fontSize: 10, + rotate: 90, + textColor: red, + fillColor: gold, + align: (int)TextAlign.TEXT_ALIGN_CENTER, + dashes: new int[] { 2 } + ); + + annot.SetBorder(border: null, width: 0.3f, dashes: new int[] { 2 }); + annot.Update(textColor: blue); + //annot.Update(textColor: red, fillColor: blue); + + 
doc.Save("FreeTextAnnot.pdf"); + + doc.Close(); + + Console.WriteLine("Free text annotation created and saved to 'FreeTextAnnot.pdf'."); + } + + } +} diff --git a/Demo/_Constants.cs b/Demo/Support/Constants.cs similarity index 100% rename from Demo/_Constants.cs rename to Demo/Support/Constants.cs diff --git a/Demo/Support/Units.cs b/Demo/Support/Units.cs new file mode 100644 index 00000000..730f7d3e --- /dev/null +++ b/Demo/Support/Units.cs @@ -0,0 +1,14 @@ +namespace Demo +{ + public static class Units + { + public const float InchesPerMm = 1.0f / 25.4f; + public const float PointsPerInch = 72.0f; + + public static float MmToPoints(float mm) => mm * InchesPerMm * PointsPerInch; + public static float PointsToMm(float points) => points / PointsPerInch / InchesPerMm; + + public static float MmToPixels(float mm, float dpi) => mm * InchesPerMm * dpi; + public static float PixelsToMm(float px, float dpi) => px / dpi / InchesPerMm; + } +} diff --git a/Demo/annotations-freetext1.cs b/Demo/annotations-freetext1.cs deleted file mode 100644 index f9ea7091..00000000 --- a/Demo/annotations-freetext1.cs +++ /dev/null @@ -1,42 +0,0 @@ -using MuPDF.NET; -using SkiaSharp; -using System; -using System.Collections.Generic; -using System.Globalization; -using System.IO; -using System.Linq; -using System.Text; - -namespace Demo -{ - public static class AnnotationsFreeText1 - { - public static void Run(string[] args) - { - Console.WriteLine("\n=== AnnotationsFreeText1 ======================="); - Document doc = new Document(); - Page page = doc.NewPage(); - - // 3 rectangles, same size, above each other - Rect r1 = new Rect(100, 100, 200, 150); - Rect r2 = r1 + new Rect(0, 75, 0, 75); - Rect r3 = r2 + new Rect(0, 75, 0, 75); - - // the text, Latin alphabet - string t = "¡Un pequeño texto para practicar!"; - - // add 3 annots, modify the last one somewhat - Annot a1 = page.AddFreeTextAnnot(r1, t, textColor: Constants.red); - Annot a2 = page.AddFreeTextAnnot(r2, t, fontName: "Ti", 
textColor: Constants.blue); - Annot a3 = page.AddFreeTextAnnot(r3, t, fontName: "Co", textColor: Constants.blue, rotate: 90); - a3.SetBorder(width: 0); - a3.Update(fontSize: 8, fillColor: Constants.gold); - - doc.Save("a-freetext.pdf"); - - doc.Close(); - - Console.WriteLine("Saved to a-freetext.pdf"); - } - } -} diff --git a/Demo/annotations-freetext2.cs b/Demo/annotations-freetext2.cs deleted file mode 100644 index fb3a0e31..00000000 --- a/Demo/annotations-freetext2.cs +++ /dev/null @@ -1,65 +0,0 @@ -using MuPDF.NET; -using SkiaSharp; -using System; -using System.Collections.Generic; -using System.Globalization; -using System.IO; -using System.IO.Ports; -using System.Linq; -using System.Security.Policy; -using System.Text; -using static System.Net.Mime.MediaTypeNames; - -namespace Demo -{ - public static class AnnotationsFreeText2 - { - // Use rich text for FreeText annotations - public static void Run(string[] args) - { - Console.WriteLine("\n=== AnnotationsFreeText2 ======================="); - // define an overall styling - string ds = "font-size: 11pt; font-family: sans-serif;"; - // some special characters - string bullet = "\u2610\u2611\u2612"; // Output: ☐☑☒ - - // the annotation text with HTML and styling syntax - string text = $@"

-MuPDF.NET འདི་ ཡིག་ཆ་བཀྲམ་སྤེལ་གྱི་དོན་ལུ་ པའི་ཐོན་ཐུམ་སྒྲིལ་དྲག་ཤོས་དང་མགྱོགས་ཤོས་ཅིག་ཨིན། -Here is some bold and italic text, followed by bold-italic. Text-based check boxes: {bullet}. -

"; - - Document doc = new Document(); - Page page = doc.NewPage(); - - // 3 rectangles, same size, above each other - Rect rect = new Rect(100, 100, 350, 200); - - // define some points for callout lines - Point p2 = rect.TopRight + new Point(50, 30); - Point p3 = p2 + new Point(0, 30); - - // define the annotation - Annot annot = page.AddFreeTextAnnot( - rect, - text, - fillColor: Constants.gold, // fill color - opacity: 1, // non-transparent - rotate: 0, // no rotation - borderWidth: 1, // border and callout line width - dashes: null, // no dashing - richtext: true, // this is rich text - style: ds, // my styling default - callout: new Point[]{ p3, p2, rect.TopRight }, // define end, knee, start points - lineEnd: PdfLineEnding.PDF_ANNOT_LE_OPEN_ARROW, // symbol shown at p3 - borderColor: Constants.green - ); - - doc.Save(typeof(AnnotationsFreeText2).Name + ".pdf", pretty:1); - - doc.Close(); - - Console.WriteLine("Saved to " + typeof(AnnotationsFreeText2).Name + ".pdf"); - } - } -} diff --git a/MuPDF.NET.Test/UtilsTest.cs b/MuPDF.NET.Test/UtilsTest.cs new file mode 100644 index 00000000..2aeb0e7f --- /dev/null +++ b/MuPDF.NET.Test/UtilsTest.cs @@ -0,0 +1,50 @@ +using NUnit.Framework; +using MuPDF.NET; + +namespace MuPDF.NET.Test +{ + public class UtilsTest + { + [Test] + public void FloatToString_NoScientificNotation() + { + Assert.That(Utils.FloatToString(1.5f), Is.EqualTo("1.5")); + Assert.That(Utils.FloatToString(0f), Is.EqualTo("0")); + Assert.That(Utils.FloatToString(-123.456f), Is.EqualTo("-123.456")); + Assert.That(Utils.FloatToString(1000000f), Is.EqualTo("1000000")); + + // Values that would use scientific notation with default ToString + string small = Utils.FloatToString(0.0000123f); + Assert.That(small.Contains("E") || small.Contains("e"), Is.False, "Should not use scientific notation"); + Assert.That(small.Contains("0.00001") || small.Contains("0.000012"), Is.True); + } + + [Test] + public void FloatToString_InvariantCulture() + { + string result = 
Utils.FloatToString(1.5f); + Assert.That(result.Contains("."), Is.True); + Assert.That(result.Contains(","), Is.False); + } + + [Test] + public void DoubleToString_NoScientificNotation() + { + Assert.That(Utils.DoubleToString(1.5), Is.EqualTo("1.5")); + Assert.That(Utils.DoubleToString(0), Is.EqualTo("0")); + Assert.That(Utils.DoubleToString(-123.456), Is.EqualTo("-123.456")); + Assert.That(Utils.DoubleToString(1000000), Is.EqualTo("1000000")); + + string small = Utils.DoubleToString(1.23e-10); + Assert.That(small.Contains("E") || small.Contains("e"), Is.False, "Should not use scientific notation"); + } + + [Test] + public void DoubleToString_InvariantCulture() + { + string result = Utils.DoubleToString(1.5); + Assert.That(result.Contains("."), Is.True); + Assert.That(result.Contains(","), Is.False); + } + } +} diff --git a/MuPDF.NET/MuPDF.NET.csproj b/MuPDF.NET/MuPDF.NET.csproj index 81336611..20d805c8 100644 --- a/MuPDF.NET/MuPDF.NET.csproj +++ b/MuPDF.NET/MuPDF.NET.csproj @@ -61,4 +61,13 @@ + + + + + + diff --git a/MuPDF.NET/MuPDF.NET.nuspec b/MuPDF.NET/MuPDF.NET.nuspec index 7e38d1f6..dad82569 100644 --- a/MuPDF.NET/MuPDF.NET.nuspec +++ b/MuPDF.NET/MuPDF.NET.nuspec @@ -29,14 +29,24 @@ - - - - - - - - + + + + + + + + + + + + + + + + + + diff --git a/MuPDF.NET/Page.cs b/MuPDF.NET/Page.cs index 2d4cb923..be43551e 100644 --- a/MuPDF.NET/Page.cs +++ b/MuPDF.NET/Page.cs @@ -2242,7 +2242,15 @@ public int InsertImage( xobject.pdf_dict_puts(imgName, ref_); FzBuffer nres = mupdf.mupdf.fz_new_buffer(50); nres.fz_append_string( - string.Format(System.Globalization.CultureInfo.InvariantCulture, template, mat.a, mat.b, mat.c, mat.d, mat.e, mat.f, imgName) + string.Format(System.Globalization.CultureInfo.InvariantCulture, + template, + Utils.FloatToString(mat.a), + Utils.FloatToString(mat.b), + Utils.FloatToString(mat.c), + Utils.FloatToString(mat.d), + Utils.FloatToString(mat.e), + Utils.FloatToString(mat.f), + imgName) ); Utils.InsertContents(pageDoc, page.obj(), nres, 
overlay); } diff --git a/MuPDF.NET/Utils.cs b/MuPDF.NET/Utils.cs index 9a2780f9..ef5039ee 100644 --- a/MuPDF.NET/Utils.cs +++ b/MuPDF.NET/Utils.cs @@ -7923,19 +7923,19 @@ internal static void SetDotCultureForNumber() } /// - /// Converts a float to string with dot as decimal separator, regardless of culture + /// Converts a float to string with dot as decimal separator, without scientific notation. /// - internal static string FloatToString(float value) + public static string FloatToString(float value) { - return value.ToString(CultureInfo.InvariantCulture); + return value.ToString("0.#######", CultureInfo.InvariantCulture); } /// - /// Converts a double to string with dot as decimal separator, regardless of culture + /// Converts a double to string with dot as decimal separator, without scientific notation. /// - internal static string DoubleToString(double value) + public static string DoubleToString(double value) { - return value.ToString(CultureInfo.InvariantCulture); + return value.ToString("0.#################", CultureInfo.InvariantCulture); } /// diff --git a/MuPDF.NET4LLM/MuPDF.NET4LLM.csproj b/MuPDF.NET4LLM/MuPDF.NET4LLM.csproj index 2392e5b3..abc7ee5d 100644 --- a/MuPDF.NET4LLM/MuPDF.NET4LLM.csproj +++ b/MuPDF.NET4LLM/MuPDF.NET4LLM.csproj @@ -6,22 +6,9 @@ True $(Platform) . - MuPDF.NET4LLM - 0.2.9-rc.1 - Artifex Software Inc. - Artifex Software Inc. - MuPDF.NET4LLM - LLM/RAG helpers for MuPDF.NET: PDF-to-Markdown conversion, layout parsing, document structure analysis. Designed for use with RAG pipelines and integration with LLMs. - MuPDF;PDF;LLM;RAG;Markdown;document-processing;PDF-to-text - https://github.com/ArtifexSoftware/MuPDF.NET - https://github.com/ArtifexSoftware/MuPDF.NET - git - LICENSE.md - LLM/RAG helpers for PDF processing: Markdown conversion, layout analysis, multi-column detection. 
- false - $(ProjectDir)MuPDF.NET4LLM.nuspec - version=$(Version) - false + 0.3.4-rc.1 + $(MSBuildProjectDirectory)\MuPDF.NET4LLM.nuspec + Configuration=$(Configuration);version=$(Version);PlatformFolder=$(PlatformFolder) diff --git a/MuPDF.NET4LLM/MuPDF.NET4LLM.nuspec b/MuPDF.NET4LLM/MuPDF.NET4LLM.nuspec index cda1f63e..3fe57c67 100644 --- a/MuPDF.NET4LLM/MuPDF.NET4LLM.nuspec +++ b/MuPDF.NET4LLM/MuPDF.NET4LLM.nuspec @@ -41,16 +41,18 @@ - - - - - - - - - - - + + + + + + + + + + + + + diff --git a/MuPDF.NET4LLM/VersionInfo.cs b/MuPDF.NET4LLM/VersionInfo.cs index 18f93809..3fc6be15 100644 --- a/MuPDF.NET4LLM/VersionInfo.cs +++ b/MuPDF.NET4LLM/VersionInfo.cs @@ -7,6 +7,6 @@ namespace MuPDF.NET4LLM public static class VersionInfo { public static readonly (int Major, int Minor, int Patch) MinimumMuPDFVersion = (1, 27, 0); - public const string Version = "0.2.9-rc.1"; + public const string Version = "0.3.4-rc.1"; } } diff --git a/MuPDF.NET4LLM/helpers/DocumentLayout.cs b/MuPDF.NET4LLM/helpers/DocumentLayout.cs index f31d8ba1..d9a2099d 100644 --- a/MuPDF.NET4LLM/helpers/DocumentLayout.cs +++ b/MuPDF.NET4LLM/helpers/DocumentLayout.cs @@ -3,6 +3,7 @@ using System.Linq; using System.Text; using MuPDF.NET; +using MuPDF.NET4LLM.Ocr; using Newtonsoft.Json; namespace MuPDF.NET4LLM.Helpers diff --git a/MuPDF.NET4LLM/ocr/__init__.cs b/MuPDF.NET4LLM/ocr/__init__.cs new file mode 100644 index 00000000..4ffbe8d2 --- /dev/null +++ b/MuPDF.NET4LLM/ocr/__init__.cs @@ -0,0 +1,14 @@ +namespace MuPDF.NET4LLM.Ocr +{ + /// + /// Package exports aligned with mupdf4llm.ocr.__init__ (OCRMode). 
+ /// + public enum OcrMode + { + Never = 0, + SelectRemovingOld = 1, + SelectPreservingOld = 2, + AlwaysRemovingOld = 3, + AlwaysPreservingOld = 4, + } +} diff --git a/MuPDF.NET4LLM/ocr/paddleocr_api.cs b/MuPDF.NET4LLM/ocr/paddleocr_api.cs new file mode 100644 index 00000000..669537ce --- /dev/null +++ b/MuPDF.NET4LLM/ocr/paddleocr_api.cs @@ -0,0 +1,23 @@ +using System.Collections.Generic; +using MuPDF.NET; + +namespace MuPDF.NET4LLM.Ocr +{ + /// + /// Same module contract as mupdf4llm.ocr.paddleocr_api (duplicate of rapidocr_api in upstream). + /// + public static class PaddleOcrApi + { + public const char ReplacementUnicode = '\uFFFD'; + + public static readonly Dictionary Kwargs = new Dictionary(); + + public static bool OcrText(Span span) => TesseractApi.OcrText(span); + + public static void ExecOcr(Page page, int dpi = 300, Pixmap pixmap = null, string language = "eng", bool keepOcrText = false) + { + throw new System.NotImplementedException( + "PaddleOcrApi.ExecOcr is not implemented for MuPDF.NET; see mupdf4llm.ocr.paddleocr_api."); + } + } +} diff --git a/MuPDF.NET4LLM/ocr/paddletess_api.cs b/MuPDF.NET4LLM/ocr/paddletess_api.cs new file mode 100644 index 00000000..14c6f5b1 --- /dev/null +++ b/MuPDF.NET4LLM/ocr/paddletess_api.cs @@ -0,0 +1,26 @@ +using MuPDF.NET; + +namespace MuPDF.NET4LLM.Ocr +{ + /// + /// Same module contract as mupdf4llm.ocr.paddletess_api (duplicate of rapidtess_api in upstream). 
+ /// + public static class PaddleTessApi + { + public const char ReplacementUnicode = '\uFFFD'; + + public static bool OcrText(Span span) => TesseractApi.OcrText(span); + + public static string GetText(Pixmap pixmap, IRect irect, string language = "eng") + { + throw new System.NotImplementedException( + "PaddleTessApi.GetText requires RapidOCR + Tesseract integration; see mupdf4llm.ocr.paddletess_api."); + } + + public static void ExecOcr(Page page, int dpi = 300, Pixmap pixmap = null, string language = "eng", bool keepOcrText = false) + { + throw new System.NotImplementedException( + "PaddleTessApi.ExecOcr is not implemented for MuPDF.NET; see mupdf4llm.ocr.paddletess_api."); + } + } +} diff --git a/MuPDF.NET4LLM/ocr/rapidocr_api.cs b/MuPDF.NET4LLM/ocr/rapidocr_api.cs new file mode 100644 index 00000000..40a84e53 --- /dev/null +++ b/MuPDF.NET4LLM/ocr/rapidocr_api.cs @@ -0,0 +1,29 @@ +using System.Collections.Generic; +using MuPDF.NET; + +namespace MuPDF.NET4LLM.Ocr +{ + /// + /// RapidOCR-only pipeline, aligned with mupdf4llm.mupdf4llm.ocr.rapidocr_api. + /// + public static class RapidOcrApi + { + public const char ReplacementUnicode = '\uFFFD'; + + /// + /// Keyword arguments passed to RapidOCR in Python (KWARGS). + /// + public static readonly Dictionary Kwargs = new Dictionary(); + + public static bool OcrText(Span span) => TesseractApi.OcrText(span); + + /// + /// mupdf4llm.ocr.rapidocr_api.exec_ocr — not ported. 
+ /// + public static void ExecOcr(Page page, int dpi = 300, Pixmap pixmap = null, string language = "eng", bool keepOcrText = false) + { + throw new System.NotImplementedException( + "RapidOcrApi.ExecOcr is not implemented for MuPDF.NET; see mupdf4llm.ocr.rapidocr_api."); + } + } +} diff --git a/MuPDF.NET4LLM/ocr/rapidtess_api.cs b/MuPDF.NET4LLM/ocr/rapidtess_api.cs new file mode 100644 index 00000000..8ff1ea86 --- /dev/null +++ b/MuPDF.NET4LLM/ocr/rapidtess_api.cs @@ -0,0 +1,33 @@ +using MuPDF.NET; + +namespace MuPDF.NET4LLM.Ocr +{ + /// + /// RapidOCR + Tesseract pipeline, aligned with mupdf4llm.ocr.rapidtess_api. + /// Requires rapidocr_onnxruntime in Python; not implemented for .NET. + /// + public static class RapidTessApi + { + public const char ReplacementUnicode = '\uFFFD'; + + public static bool OcrText(Span span) => TesseractApi.OcrText(span); + + /// + /// mupdf4llm.ocr.rapidtess_api.get_text — not ported (Tesseract region OCR + options). + /// + public static string GetText(Pixmap pixmap, IRect irect, string language = "eng") + { + throw new System.NotImplementedException( + "RapidTessApi.GetText requires RapidOCR + Tesseract integration; see mupdf4llm.ocr.rapidtess_api."); + } + + /// + /// mupdf4llm.ocr.rapidtess_api.exec_ocr — not ported. 
+ /// + public static void ExecOcr(Page page, int dpi = 300, Pixmap pixmap = null, string language = "eng", bool keepOcrText = false) + { + throw new System.NotImplementedException( + "RapidTessApi.ExecOcr is not implemented for MuPDF.NET; see mupdf4llm.ocr.rapidtess_api."); + } + } +} diff --git a/MuPDF.NET4LLM/helpers/CheckOcr.cs b/MuPDF.NET4LLM/ocr/tesseract_api.cs similarity index 69% rename from MuPDF.NET4LLM/helpers/CheckOcr.cs rename to MuPDF.NET4LLM/ocr/tesseract_api.cs index 39f8aec2..326bacf6 100644 --- a/MuPDF.NET4LLM/helpers/CheckOcr.cs +++ b/MuPDF.NET4LLM/ocr/tesseract_api.cs @@ -5,11 +5,39 @@ using mupdf; using Char = MuPDF.NET.Char; -namespace MuPDF.NET4LLM.Helpers +namespace MuPDF.NET4LLM.Ocr { /// - /// OCR decision and repair utilities. - /// Ported and adapted from LLM helpers. + /// Tesseract-oriented OCR API and page/span helpers, aligned with mupdf4llm.ocr.tesseract_api. + /// + public static class TesseractApi + { + public const char ReplacementUnicode = '\uFFFD'; + + /// + /// Mirrors mupdf4llm.ocr.tesseract_api.ocr_text(span). + /// + public static bool OcrText(Span span) + { + int flags = span?.Chars != null && span.Chars.Count > 0 + ? (int)(span.Flags) + : (int)(span?.Flags ?? 0); + return (flags & 32) == 0 && (flags & 16) == 0; + } + + /// + /// Full-page OCR callback from mupdf4llm (redaction + pdfocr_tobytes pipeline). + /// Not ported; use for span repair and OCR decisions. + /// + public static void ExecOcr(Page page, int dpi = 300, Pixmap pixmap = null, string language = "eng", bool keepOcrText = false) + { + throw new NotImplementedException( + "TesseractApi.ExecOcr (mupdf4llm.ocr.tesseract_api.exec_ocr) is not implemented for MuPDF.NET; use CheckOcr for span-level repair."); + } + } + + /// + /// OCR decision and repair utilities used by the layout pipeline (MuPDF.NET / Tesseract). 
/// public static class CheckOcr { @@ -18,46 +46,32 @@ public static class CheckOcr mupdf.mupdf.FZ_STEXT_COLLECT_VECTORS | (int)TextFlags.TEXT_PRESERVE_IMAGES | (int)TextFlags.TEXT_ACCURATE_BBOXES - // | mupdf.mupdf.FZ_STEXT_MEDIABOX_CLIP ); - /// - /// Return OCR'd span text using Tesseract. - /// - /// MuPDF Page - /// MuPDF Rect or its sequence - /// Resolution for OCR image - /// The OCR-ed text of the bbox. public static string GetSpanOcr(Page page, Rect bbox, int dpi = 300) { - // Step 1: Make a high-resolution image of the bbox. Pixmap pix = page.GetPixmap(dpi: dpi, clip: bbox); byte[] ocrPdfBytes = pix.PdfOCR2Bytes(true); - + Document ocrPdf = new Document("pdf", ocrPdfBytes); Page ocrPage = ocrPdf.LoadPage(0); string text = ocrPage.GetText(); - text = text.Replace("\n", " ").Trim(); // Get rid of line breaks - + text = text.Replace("\n", " ").Trim(); + ocrPage.Dispose(); ocrPdf.Close(); pix.Dispose(); - + return text; } - /// - /// Repair text blocks with missing glyphs using OCR. - /// - /// TODO: Support non-linear block structure. - /// public static List RepairBlocks(List inputBlocks, Page page, int dpi = 300) { List repairedBlocks = new List(); - + foreach (var block in inputBlocks) { - if (block.Type != 0) // Accept non-text blocks as is + if (block.Type != 0) { repairedBlocks.Add(block); continue; @@ -81,7 +95,7 @@ public static List RepairBlocks(List inputBlocks, Page page, int d spanText = span.Text ?? 
""; } - if (!spanText.Contains(Utils.REPLACEMENT_CHARACTER)) + if (!spanText.Contains(MuPDF.NET4LLM.Helpers.Utils.REPLACEMENT_CHARACTER)) continue; int spanTextLen = spanText.Length; @@ -91,7 +105,6 @@ public static List RepairBlocks(List inputBlocks, Page page, int d if (span.Chars != null && span.Chars.Count > 0) { - // Rebuild chars array List newChars = new List(); int minLen = Math.Min(newText.Length, span.Chars.Count); for (int i = 0; i < minLen; i++) @@ -102,7 +115,6 @@ public static List RepairBlocks(List inputBlocks, Page page, int d C = newText[i], Origin = oldChar.Origin, Bbox = oldChar.Bbox, - // Copy other properties as needed }; newChars.Add(newChar); } @@ -118,42 +130,25 @@ public static List RepairBlocks(List inputBlocks, Page page, int d } repairedBlocks.Add(block); } - + return repairedBlocks; } - /// - /// Determine whether the page contains text worthwhile to OCR. - /// - /// MuPDF.NET Page object - /// DPI used for rasterization *if* we decide to OCR - /// Area to consider for text presence - /// - /// The full-page transformation matrix, the full-page pixmap and a - /// boolean indicating whether the page is photo-like (True) or - /// text-like (False). 
- /// public static (Matrix matrix, Pixmap pix, bool photo) GetPageImage( - Page page, - int dpi = 150, + Page page, + int dpi = 150, Rect covered = null) { if (covered == null) covered = page.Rect; - IRect irect = new IRect((int)covered.X0, (int)covered.Y0, - (int)covered.X1, (int)covered.Y1); - - // Make a gray pixmap of the covered area Rect clipRect = new Rect(covered); Pixmap pixCovered = page.GetPixmap(colorSpace: "gray", clip: clipRect); - - // Convert to byte array for image quality analysis (convert to numpy array) + int width = pixCovered.W; int height = pixCovered.H; byte[] samples = pixCovered.SAMPLES; - - // Create 2D array for image quality analysis + byte[,] gray = new byte[height, width]; int sampleIndex = 0; for (int y = 0; y < height; y++) @@ -164,19 +159,17 @@ public static (Matrix matrix, Pixmap pix, bool photo) GetPageImage( } } - // Run photo checks - var scores = ImageQuality.AnalyzeImage(gray); + var scores = MuPDF.NET4LLM.Helpers.ImageQuality.AnalyzeImage(gray); double score = scores.ContainsKey("score") ? scores["score"].value : 0; - + if (score >= 3) { pixCovered.Dispose(); - return (new Matrix(1, 0, 0, 1, 0, 0), null, true); // Identity matrix + return (new Matrix(1, 0, 0, 1, 0, 0), null, true); } else { Pixmap pix = page.GetPixmap(dpi: dpi); - IRect pixRect = new IRect(0, 0, pix.W, pix.H); Matrix matrix = new Matrix( page.Rect.Width / pix.W, 0, @@ -190,16 +183,6 @@ public static (Matrix matrix, Pixmap pix, bool photo) GetPageImage( } } - /// - /// Decide whether a MuPDF.NET page should be OCR'd. 
- /// - /// MuPDF.NET page object - /// DPI used for rasterization - /// Minimum number of vector paths to suggest glyph simulation - /// Fraction of page area covered by images to trigger OCR - /// Fraction of readable characters to skip OCR - /// Output of page.get_text("dict") if already available - /// Dictionary with decision and diagnostic flags public static Dictionary ShouldOcrPage( Page page, int dpi = 150, @@ -216,25 +199,19 @@ public static Dictionary ShouldOcrPage( ["readable_text"] = false, ["image_covers_page"] = false, ["has_vector_chars"] = false, - ["transform"] = new Matrix(1, 0, 0, 1, 0, 0), // Identity matrix + ["transform"] = new Matrix(1, 0, 0, 1, 0, 0), ["pixmap"] = null, }; - Rect pageRect = page.Rect; - float pageArea = Math.Abs(pageRect.Width * pageRect.Height); - - // Analyze the page - var analysis = Utils.AnalyzePage(page, blocks); + var analysis = MuPDF.NET4LLM.Helpers.Utils.AnalyzePage(page, blocks); - // Return if page is completely blank Rect covered = analysis["covered"] as Rect; - if (Utils.BboxIsEmpty(covered)) + if (MuPDF.NET4LLM.Helpers.Utils.BboxIsEmpty(covered)) { decision["should_ocr"] = false; return decision; } - // Return if page has been OCR'd already int ocrSpans = (int)analysis["ocr_spans"]; if (ocrSpans > 0) { @@ -250,8 +227,6 @@ public static Dictionary ShouldOcrPage( float imgArea = (float)analysis["img_area"]; int charsBad = (int)analysis["chars_bad"]; - // Preset OCR if very little text area exists - // Less than 5% text area in covered area if (txtArea < 0.05f && charsTotal < 200 && txtJoins < 0.3f) { if (vecArea >= vectorThresh) @@ -287,15 +262,12 @@ public static Dictionary ShouldOcrPage( if (!(bool)decision["readable_text"] && (bool)decision["has_text"]) return decision; - // We need OCR and do a final check for potential text presence if (!(bool)decision["has_text"]) { - // Rasterize and check for photo versus text-heaviness var (matrix, pix, photo) = GetPageImage(page, dpi, covered); if (photo) { - // This 
seems to be a non-text picture page decision["should_ocr"] = false; decision["pixmap"] = null; }