ProcessDataStandardFormat

run.json descriptions.json MissingMethodException Infineon.Mesa.PDF.Text.Stripper 4.8.0.2 MSTEST0037
2025-03-03 11:32:29 -07:00
parent 1ae00ffd41
commit 3966b75da7
10 changed files with 139 additions and 90 deletions
--- a/Adaptation/FileHandlers/pcl/Convert.cs
+++ b/Adaptation/FileHandlers/pcl/Convert.cs
@@ -4,6 +4,7 @@ using System.Collections.Generic;
 using System.Collections.ObjectModel;
 using System.Diagnostics;
 using System.IO;
+using System.Linq;

 namespace Adaptation.FileHandlers.pcl;

@@ -31,7 +32,35 @@ internal class Convert
        return result;
    }

-    internal static ReadOnlyDictionary<string, string> PDF(Logistics logistics, string ghostPCLFileName, List<FileInfo> fileInfoCollection)
+    /// <summary>
+    /// Runs the external PDF text-stripper executable against a PDF and collects the text files found next to the resulting PDFs.
+    /// </summary>
+    /// <remarks>
+    /// The executable is started with the arguments <c>s "sourceFileNamePdf"</c> and is given up to 30 seconds to exit.
+    /// Afterwards every <c>*.pdf</c> directly inside <paramref name="sourcePath"/>, other than the source file itself,
+    /// that has a sibling <c>.txt</c> file is added to the result.
+    /// </remarks>
+    /// <param name="pdfTextStripperFileName">Path of the executable to launch.</param>
+    /// <param name="sourcePath">Directory scanned (top level only) for <c>*.pdf</c> and <c>*.txt</c> files; presumably where the tool writes its output - confirm against the tool.</param>
+    /// <param name="sourceFileNamePdf">PDF handed to the executable; it is excluded from the returned map.</param>
+    /// <returns>Map from PDF file path to the full content of its matching text file.</returns>
+    private static Dictionary<string, string> PortableDocumentFormatSplit(string pdfTextStripperFileName, string sourcePath, string sourceFileNamePdf)
+    {
+        Dictionary<string, string> results = new();
+        ProcessStartInfo processStartInfo = new(pdfTextStripperFileName, $"s \"{sourceFileNamePdf}\"")
+        {
+            UseShellExecute = false,
+            RedirectStandardError = true,
+            RedirectStandardOutput = true,
+        };
+        // Dispose the Process so its operating-system handle is released as soon as we are done with it.
+        using (Process process = Process.Start(processStartInfo))
+        {
+            if (process is not null)
+            {
+                // Both streams are redirected, so they have to be drained: an unread pipe can fill up
+                // and block the child, which would then never exit within the timeout.
+                process.BeginOutputReadLine();
+                process.BeginErrorReadLine();
+                // Best effort: carry on with whatever was produced even if the tool is still running after 30 seconds.
+                _ = process.WaitForExit(30000);
+            }
+        }
+        string checkFile;
+        string[] pdfFiles = Directory.GetFiles(sourcePath, "*.pdf", SearchOption.TopDirectoryOnly);
+        // Hash set gives constant-time membership checks with the same ordinal comparison as before.
+        HashSet<string> textFiles = new(Directory.GetFiles(sourcePath, "*.txt", SearchOption.TopDirectoryOnly));
+        foreach (string pdfFile in pdfFiles)
+        {
+            // The input document itself is not a result.
+            if (pdfFile == sourceFileNamePdf)
+                continue;
+            checkFile = Path.ChangeExtension(pdfFile, ".txt");
+            // Only PDFs that have a matching text file are reported.
+            if (!textFiles.Contains(checkFile))
+                continue;
+            results.Add(pdfFile, File.ReadAllText(checkFile));
+        }
+        return results;
+    }
+
+    internal static ReadOnlyDictionary<string, string> PDF(Logistics logistics, string ghostPCLFileName, string pdfTextStripperFileName, List<FileInfo> fileInfoCollection)
    {
        Dictionary<string, string> results = new();
        object item;
@@ -58,51 +87,59 @@ internal class Convert
        }
        if (results.Count == 0)
        {
-            java.io.File file = new(sourceFileNamePdf);
-            org.apache.pdfbox.util.Splitter splitter = new();
-            org.apache.pdfbox.pdmodel.PDDocument pdDocument = org.apache.pdfbox.pdmodel.PDDocument.load(file);
-            java.util.List list = splitter.split(pdDocument);
-            java.util.ListIterator iterator = list.listIterator();
-            org.apache.pdfbox.util.PDFTextStripper dataStripper = new();
-            for (short i = 1; i < short.MaxValue; i++)
+            try
            {
-                if (!iterator.hasNext())
-                    break;
-                item = iterator.next();
-                pagePDFFile = string.Concat(sourcePath, @"\", sourceFileNameWithoutExtension, "_", i, ".pdf");
-                pageTextFile = Path.ChangeExtension(pagePDFFile, ".txt");
-                if (File.Exists(pageTextFile))
+                java.io.File file = new(sourceFileNamePdf);
+                org.apache.pdfbox.util.Splitter splitter = new();
+                org.apache.pdfbox.pdmodel.PDDocument pdDocument = org.apache.pdfbox.pdmodel.PDDocument.load(file);
+                java.util.List list = splitter.split(pdDocument);
+                java.util.ListIterator iterator = list.listIterator();
+                org.apache.pdfbox.util.PDFTextStripper dataStripper = new();
+                for (short i = 1; i < short.MaxValue; i++)
                {
-                    pageText = File.ReadAllText(pageTextFile);
-                    sourceFiles.Add(pageTextFile);
-                    if (item is not org.apache.pdfbox.pdmodel.PDDocument pd)
-                        continue;
-                    pd.close();
+                    if (!iterator.hasNext())
+                        break;
+                    item = iterator.next();
+                    pagePDFFile = string.Concat(sourcePath, @"\", sourceFileNameWithoutExtension, "_", i, ".pdf");
+                    pageTextFile = Path.ChangeExtension(pagePDFFile, ".txt");
+                    if (File.Exists(pageTextFile))
+                    {
+                        pageText = File.ReadAllText(pageTextFile);
+                        sourceFiles.Add(pageTextFile);
+                        if (item is not org.apache.pdfbox.pdmodel.PDDocument pd)
+                            continue;
+                        pd.close();
+                    }
+                    else if (File.Exists(pagePDFFile))
+                    {
+                        org.apache.pdfbox.pdmodel.PDDocument document = org.apache.pdfbox.pdmodel.PDDocument.load(pagePDFFile);
+                        pageText = dataStripper.getText(document);
+                        document.close();
+                        sourceFiles.Add(pagePDFFile);
+                        if (item is not org.apache.pdfbox.pdmodel.PDDocument pd)
+                            continue;
+                        pd.close();
+                    }
+                    else
+                    {
+                        if (item is not org.apache.pdfbox.pdmodel.PDDocument pd)
+                            continue;
+                        pageText = dataStripper.getText(pd);
+                        pd.save(pagePDFFile);
+                        sourceFiles.Add(pagePDFFile);
+                        pd.close();
+                        File.WriteAllText(pageTextFile, pageText);
+                        sourceFiles.Add(pageTextFile);
+                    }
+                    results.Add(pagePDFFile, pageText);
                }
-                else if (File.Exists(pagePDFFile))
-                {
-                    org.apache.pdfbox.pdmodel.PDDocument document = org.apache.pdfbox.pdmodel.PDDocument.load(pagePDFFile);
-                    pageText = dataStripper.getText(document);
-                    document.close();
-                    sourceFiles.Add(pagePDFFile);
-                    if (item is not org.apache.pdfbox.pdmodel.PDDocument pd)
-                        continue;
-                    pd.close();
-                }
-                else
-                {
-                    if (item is not org.apache.pdfbox.pdmodel.PDDocument pd)
-                        continue;
-                    pageText = dataStripper.getText(pd);
-                    pd.save(pagePDFFile);
-                    sourceFiles.Add(pagePDFFile);
-                    pd.close();
-                    File.WriteAllText(pageTextFile, pageText);
-                    sourceFiles.Add(pageTextFile);
-                }
-                results.Add(pagePDFFile, pageText);
+                pdDocument.close();
+            }
+            catch (MissingMethodException)
+            {
+                if (results.Count == 0)
+                    results = PortableDocumentFormatSplit(pdfTextStripperFileName, sourcePath, sourceFileNamePdf);
            }
-            pdDocument.close();
        }
        foreach (string sourceFile in sourceFiles)
            fileInfoCollection.Add(new FileInfo(sourceFile));