150 lines
		
	
	
		
			6.5 KiB
		
	
	
	
		
			C#
		
	
	
	
	
	
			
		
		
	
	
			150 lines
		
	
	
		
			6.5 KiB
		
	
	
	
		
			C#
		
	
	
	
	
	
| using Adaptation.Shared;
 | |
| using System;
 | |
| using System.Collections.Generic;
 | |
| using System.Collections.ObjectModel;
 | |
| using System.Diagnostics;
 | |
| using System.IO;
 | |
| using System.Linq;
 | |
| 
 | |
| namespace Adaptation.FileHandlers.pcl;
 | |
| 
 | |
/// <summary>
/// Turns a PCL report into a PDF and then into per-page PDF / text pairs.
/// </summary>
internal class Convert
{

    /// <summary>
    /// Upper bound, in milliseconds, on how long each external tool is waited for.
    /// </summary>
    private const int ProcessTimeoutMilliseconds = 30000;

    /// <summary>
    /// Convert the raw data file to parsable file format - in this case from PCL to PDF
    /// </summary>
    /// <param name="ghostPCLFileName">executable that is started to perform the conversion</param>
    /// <param name="logistics">supplies <c>ReportFullPath</c>, the file to be converted to PDF</param>
    /// <returns>path of the PDF: the report path with its extension changed to ".pdf"</returns>
    /// <exception cref="Exception">the PDF does not exist after the converter was run</exception>
    private static string ConvertSourceFileToPdf(string ghostPCLFileName, Logistics logistics)
    {
        string result = Path.ChangeExtension(logistics.ReportFullPath, ".pdf");
        // A PDF left behind by an earlier run is reused instead of being regenerated
        if (File.Exists(result))
            return result;
        string arguments = string.Concat("-dSAFER -dBATCH -dNOPAUSE -sOutputFile=\"", result, "\" -sDEVICE=pdfwrite \"", logistics.ReportFullPath, "\"");
        using (Process process = Process.Start(ghostPCLFileName, arguments))
        {
            // A timeout is not treated as a failure by itself; the existence check below decides
            if (process is not null)
                _ = process.WaitForExit(ProcessTimeoutMilliseconds);
        }
        if (!File.Exists(result))
            throw new Exception("PDF file wasn't created");
        return result;
    }

    /// <summary>
    /// Fallback splitter: runs the external text-stripper tool against the PDF and then collects
    /// every other PDF in <paramref name="sourcePath"/> that has a same-named ".txt" file next to it.
    /// </summary>
    /// <param name="pdfTextStripperFileName">executable that is started with the arguments <c>s "&lt;pdf&gt;"</c></param>
    /// <param name="sourcePath">directory that is scanned (top level only) for "*.pdf" and "*.txt" files</param>
    /// <param name="sourceFileNamePdf">full path of the PDF handed to the tool; excluded from the result</param>
    /// <returns>PDF path mapped to the content of its companion text file</returns>
    private static Dictionary<string, string> PortableDocumentFormatSplit(string pdfTextStripperFileName, string sourcePath, string sourceFileNamePdf)
    {
        Dictionary<string, string> results = new();
        ProcessStartInfo processStartInfo = new(pdfTextStripperFileName, $"s \"{sourceFileNamePdf}\"")
        {
            UseShellExecute = false,
            RedirectStandardError = true,
            RedirectStandardOutput = true,
        };
        using (Process process = Process.Start(processStartInfo))
        {
            if (process is not null)
            {
                // Both streams are redirected, so they have to be drained; otherwise a child that
                // writes more than the pipe buffer holds blocks and never exits on its own.
                process.BeginOutputReadLine();
                process.BeginErrorReadLine();
                _ = process.WaitForExit(ProcessTimeoutMilliseconds);
            }
        }
        string checkFile;
        string[] pdfFiles = Directory.GetFiles(sourcePath, "*.pdf", SearchOption.TopDirectoryOnly);
        // Set lookup (ordinal, same comparison as before) instead of scanning the array for every PDF
        HashSet<string> textFiles = new(Directory.GetFiles(sourcePath, "*.txt", SearchOption.TopDirectoryOnly));
        foreach (string pdfFile in pdfFiles)
        {
            if (pdfFile == sourceFileNamePdf)
                continue;
            checkFile = Path.ChangeExtension(pdfFile, ".txt");
            if (!textFiles.Contains(checkFile))
                continue;
            results.Add(pdfFile, File.ReadAllText(checkFile));
        }
        return results;
    }

    /// <summary>
    /// Collects page texts that already exist on disk as "&lt;name&gt;_*.txt" files.
    /// </summary>
    /// <param name="sourcePath">directory that is searched (top level only)</param>
    /// <param name="sourceFileNameWithoutExtension">file name prefix of the page files</param>
    /// <param name="sourceFiles">receives every text file that was read, whether or not its PDF exists</param>
    /// <returns>page PDF path mapped to page text, only for text files whose ".pdf" sibling exists</returns>
    private static Dictionary<string, string> ReadCachedPages(string sourcePath, string sourceFileNameWithoutExtension, List<string> sourceFiles)
    {
        Dictionary<string, string> results = new();
        string pageText;
        string pagePDFFile;
        string[] txtFiles = Directory.GetFiles(sourcePath, $"{sourceFileNameWithoutExtension}_*.txt", SearchOption.TopDirectoryOnly);
        // Shorter paths first, then by name, so that "_2" is handled before "_10"
        foreach (string txtFile in txtFiles.OrderBy(l => l.Length).ThenBy(l => l))
        {
            sourceFiles.Add(txtFile);
            pageText = File.ReadAllText(txtFile);
            pagePDFFile = Path.ChangeExtension(txtFile, ".pdf");
            if (!File.Exists(pagePDFFile))
                continue;
            results.Add(pagePDFFile, pageText);
        }
        return results;
    }

    /// <summary>
    /// Splits the PDF with PDFBox and, for every split item, adds the page text to
    /// <paramref name="results"/> under the path "&lt;sourcePath&gt;/&lt;name&gt;_&lt;n&gt;.pdf" (n starts at 1).
    /// Page files that are already on disk are reused; missing ones are written.
    /// </summary>
    /// <param name="sourceFileNamePdf">PDF to be split</param>
    /// <param name="sourcePath">directory the page files live in</param>
    /// <param name="sourceFileNameWithoutExtension">file name prefix of the page files</param>
    /// <param name="sourceFiles">receives every page file that was read or written</param>
    /// <param name="results">receives page PDF path mapped to page text</param>
    private static void SplitWithPdfBox(string sourceFileNamePdf, string sourcePath, string sourceFileNameWithoutExtension, List<string> sourceFiles, Dictionary<string, string> results)
    {
        object item;
        string pageText;
        string pagePDFFile;
        string pageTextFile;
        java.io.File file = new(sourceFileNamePdf);
        org.apache.pdfbox.util.Splitter splitter = new();
        org.apache.pdfbox.pdmodel.PDDocument pdDocument = org.apache.pdfbox.pdmodel.PDDocument.load(file);
        try
        {
            java.util.List list = splitter.split(pdDocument);
            java.util.ListIterator iterator = list.listIterator();
            org.apache.pdfbox.util.PDFTextStripper dataStripper = new();
            // The page number advances for every item, including the ones that are skipped below
            for (int i = 1; iterator.hasNext(); i++)
            {
                item = iterator.next();
                try
                {
                    pagePDFFile = Path.Combine(sourcePath, $"{sourceFileNameWithoutExtension}_{i}.pdf");
                    pageTextFile = Path.ChangeExtension(pagePDFFile, ".txt");
                    if (File.Exists(pageTextFile))
                    {
                        // Text was extracted by an earlier run
                        pageText = File.ReadAllText(pageTextFile);
                        sourceFiles.Add(pageTextFile);
                        if (item is not org.apache.pdfbox.pdmodel.PDDocument)
                            continue;
                    }
                    else if (File.Exists(pagePDFFile))
                    {
                        // Page PDF is on disk without its text: extract from the file on disk
                        org.apache.pdfbox.pdmodel.PDDocument document = org.apache.pdfbox.pdmodel.PDDocument.load(pagePDFFile);
                        try
                        {
                            pageText = dataStripper.getText(document);
                        }
                        finally
                        {
                            document.close();
                        }
                        sourceFiles.Add(pagePDFFile);
                        if (item is not org.apache.pdfbox.pdmodel.PDDocument)
                            continue;
                    }
                    else
                    {
                        // Nothing on disk yet: extract from the split page and persist both files
                        if (item is not org.apache.pdfbox.pdmodel.PDDocument pd)
                            continue;
                        pageText = dataStripper.getText(pd);
                        pd.save(pagePDFFile);
                        sourceFiles.Add(pagePDFFile);
                        File.WriteAllText(pageTextFile, pageText);
                        sourceFiles.Add(pageTextFile);
                    }
                    results.Add(pagePDFFile, pageText);
                }
                finally
                {
                    // Release the split page on every path, including exceptions
                    if (item is org.apache.pdfbox.pdmodel.PDDocument splitPage)
                        splitPage.close();
                }
            }
        }
        finally
        {
            pdDocument.close();
        }
    }

    /// <summary>
    /// Converts the report to PDF and returns the text of each page keyed by the page's PDF path.
    /// Cached page files are preferred, then an in-process PDFBox split, and - when that raises
    /// <see cref="MissingMethodException"/> without having produced a page - the external text-stripper tool.
    /// </summary>
    /// <param name="logistics">supplies <c>ReportFullPath</c>, the report to be processed</param>
    /// <param name="ghostPCLFileName">executable used for the PCL to PDF conversion</param>
    /// <param name="pdfTextStripperFileName">executable used by the fallback splitter</param>
    /// <param name="fileInfoCollection">receives a <see cref="FileInfo"/> for the PDF and every page file that was read or written</param>
    /// <returns>read-only map of page PDF path to page text</returns>
    /// <exception cref="Exception">the PDF was not created, or the report path has no directory</exception>
    internal static ReadOnlyDictionary<string, string> PDF(Logistics logistics, string ghostPCLFileName, string pdfTextStripperFileName, List<FileInfo> fileInfoCollection)
    {
        List<string> sourceFiles = new();
        string sourceFileNamePdf = ConvertSourceFileToPdf(ghostPCLFileName, logistics);
        sourceFiles.Add(sourceFileNamePdf);
        string sourcePath = Path.GetDirectoryName(logistics.ReportFullPath) ?? throw new Exception($"Unable to determine the directory of <{logistics.ReportFullPath}>");
        string sourceFileNameWithoutExtension = Path.GetFileNameWithoutExtension(logistics.ReportFullPath);
        Dictionary<string, string> results = ReadCachedPages(sourcePath, sourceFileNameWithoutExtension, sourceFiles);
        if (results.Count == 0)
        {
            try
            {
                SplitWithPdfBox(sourceFileNamePdf, sourcePath, sourceFileNameWithoutExtension, sourceFiles, results);
            }
            catch (MissingMethodException)
            {
                // Pages gathered before the failure are kept; the external tool is only used when there are none
                if (results.Count == 0)
                    results = PortableDocumentFormatSplit(pdfTextStripperFileName, sourcePath, sourceFileNamePdf);
            }
        }
        foreach (string sourceFile in sourceFiles)
            fileInfoCollection.Add(new FileInfo(sourceFile));
        return new(results);
    }

}