ProcessDataStandardFormat

run.json
descriptions.json
MissingMethodException
Infineon.Mesa.PDF.Text.Stripper 4.8.0.2
MSTEST0037
This commit is contained in:
2025-03-03 11:32:29 -07:00
parent 1ae00ffd41
commit 3966b75da7
10 changed files with 139 additions and 90 deletions

View File

@ -4,6 +4,7 @@ using System.Collections.Generic;
using System.Collections.ObjectModel;
using System.Diagnostics;
using System.IO;
using System.Linq;
namespace Adaptation.FileHandlers.pcl;
@ -31,7 +32,35 @@ internal class Convert
return result;
}
internal static ReadOnlyDictionary<string, string> PDF(Logistics logistics, string ghostPCLFileName, List<FileInfo> fileInfoCollection)
private static Dictionary<string, string> PortableDocumentFormatSplit(string pdfTextStripperFileName, string sourcePath, string sourceFileNamePdf)
{
Dictionary<string, string> results = new();
ProcessStartInfo processStartInfo = new(pdfTextStripperFileName, $"s \"{sourceFileNamePdf}\"")
{
UseShellExecute = false,
RedirectStandardError = true,
RedirectStandardOutput = true,
};
Process process = Process.Start(processStartInfo);
_ = process.WaitForExit(30000);
string text;
string checkFile;
string[] pdfFiles = Directory.GetFiles(sourcePath, "*.pdf", SearchOption.TopDirectoryOnly);
string[] textFiles = Directory.GetFiles(sourcePath, "*.txt", SearchOption.TopDirectoryOnly);
foreach (string pdfFile in pdfFiles)
{
if (pdfFile == sourceFileNamePdf)
continue;
checkFile = Path.ChangeExtension(pdfFile, ".txt");
if (!textFiles.Contains(checkFile))
continue;
text = File.ReadAllText(checkFile);
results.Add(pdfFile, text);
}
return results;
}
internal static ReadOnlyDictionary<string, string> PDF(Logistics logistics, string ghostPCLFileName, string pdfTextStripperFileName, List<FileInfo> fileInfoCollection)
{
Dictionary<string, string> results = new();
object item;
@ -58,51 +87,59 @@ internal class Convert
}
if (results.Count == 0)
{
java.io.File file = new(sourceFileNamePdf);
org.apache.pdfbox.util.Splitter splitter = new();
org.apache.pdfbox.pdmodel.PDDocument pdDocument = org.apache.pdfbox.pdmodel.PDDocument.load(file);
java.util.List list = splitter.split(pdDocument);
java.util.ListIterator iterator = list.listIterator();
org.apache.pdfbox.util.PDFTextStripper dataStripper = new();
for (short i = 1; i < short.MaxValue; i++)
try
{
if (!iterator.hasNext())
break;
item = iterator.next();
pagePDFFile = string.Concat(sourcePath, @"\", sourceFileNameWithoutExtension, "_", i, ".pdf");
pageTextFile = Path.ChangeExtension(pagePDFFile, ".txt");
if (File.Exists(pageTextFile))
java.io.File file = new(sourceFileNamePdf);
org.apache.pdfbox.util.Splitter splitter = new();
org.apache.pdfbox.pdmodel.PDDocument pdDocument = org.apache.pdfbox.pdmodel.PDDocument.load(file);
java.util.List list = splitter.split(pdDocument);
java.util.ListIterator iterator = list.listIterator();
org.apache.pdfbox.util.PDFTextStripper dataStripper = new();
for (short i = 1; i < short.MaxValue; i++)
{
pageText = File.ReadAllText(pageTextFile);
sourceFiles.Add(pageTextFile);
if (item is not org.apache.pdfbox.pdmodel.PDDocument pd)
continue;
pd.close();
if (!iterator.hasNext())
break;
item = iterator.next();
pagePDFFile = string.Concat(sourcePath, @"\", sourceFileNameWithoutExtension, "_", i, ".pdf");
pageTextFile = Path.ChangeExtension(pagePDFFile, ".txt");
if (File.Exists(pageTextFile))
{
pageText = File.ReadAllText(pageTextFile);
sourceFiles.Add(pageTextFile);
if (item is not org.apache.pdfbox.pdmodel.PDDocument pd)
continue;
pd.close();
}
else if (File.Exists(pagePDFFile))
{
org.apache.pdfbox.pdmodel.PDDocument document = org.apache.pdfbox.pdmodel.PDDocument.load(pagePDFFile);
pageText = dataStripper.getText(document);
document.close();
sourceFiles.Add(pagePDFFile);
if (item is not org.apache.pdfbox.pdmodel.PDDocument pd)
continue;
pd.close();
}
else
{
if (item is not org.apache.pdfbox.pdmodel.PDDocument pd)
continue;
pageText = dataStripper.getText(pd);
pd.save(pagePDFFile);
sourceFiles.Add(pagePDFFile);
pd.close();
File.WriteAllText(pageTextFile, pageText);
sourceFiles.Add(pageTextFile);
}
results.Add(pagePDFFile, pageText);
}
else if (File.Exists(pagePDFFile))
{
org.apache.pdfbox.pdmodel.PDDocument document = org.apache.pdfbox.pdmodel.PDDocument.load(pagePDFFile);
pageText = dataStripper.getText(document);
document.close();
sourceFiles.Add(pagePDFFile);
if (item is not org.apache.pdfbox.pdmodel.PDDocument pd)
continue;
pd.close();
}
else
{
if (item is not org.apache.pdfbox.pdmodel.PDDocument pd)
continue;
pageText = dataStripper.getText(pd);
pd.save(pagePDFFile);
sourceFiles.Add(pagePDFFile);
pd.close();
File.WriteAllText(pageTextFile, pageText);
sourceFiles.Add(pageTextFile);
}
results.Add(pagePDFFile, pageText);
pdDocument.close();
}
catch (MissingMethodException)
{
if (results.Count == 0)
results = PortableDocumentFormatSplit(pdfTextStripperFileName, sourcePath, sourceFileNamePdf);
}
pdDocument.close();
}
foreach (string sourceFile in sourceFiles)
fileInfoCollection.Add(new FileInfo(sourceFile));