met08ddupsfs6420/Adaptation/FileHandlers/pcl/Convert.cs

using Adaptation.Shared;
using System;
using System.Collections.Generic;
using System.Collections.ObjectModel;
using System.Diagnostics;
using System.IO;
using System.Linq;

namespace Adaptation.FileHandlers.pcl;

internal class Convert
{

    /// <summary>
    /// Maximum time, in milliseconds, to wait for an external converter process to exit.
    /// </summary>
    private const int ProcessTimeoutMilliseconds = 30000;

    /// <summary>
    /// Convert the raw data file to parsable file format - in this case from PCL to PDF
    /// </summary>
    /// <param name="ghostPCLFileName">executable that is started to perform the conversion</param>
    /// <param name="logistics">supplies <c>ReportFullPath</c>, the source file to be converted to PDF</param>
    /// <returns>the source file path with its extension changed to <c>.pdf</c></returns>
    /// <exception cref="Exception">the PDF file does not exist after the converter was run</exception>
    private static string ConvertSourceFileToPdf(string ghostPCLFileName, Logistics logistics)
    {
        string result = Path.ChangeExtension(logistics.ReportFullPath, ".pdf");
        // A PDF left over from an earlier run is reused as-is.
        if (File.Exists(result))
            return result;
        string arguments = $"-dSAFER -dBATCH -dNOPAUSE -sOutputFile=\"{result}\" -sDEVICE=pdfwrite \"{logistics.ReportFullPath}\"";
        // Dispose the Process so its OS handle is released; Process.Start may return null when no process was started.
        using (Process process = Process.Start(ghostPCLFileName, arguments))
        {
            // The timeout result is intentionally ignored - success is judged by the output file below.
            _ = process?.WaitForExit(ProcessTimeoutMilliseconds);
        }
        if (!File.Exists(result))
            throw new Exception("PDF file wasn't created");
        return result;
    }

    /// <summary>
    /// Runs the external text-stripper executable against the PDF and collects the page files found in the source directory afterwards.
    /// </summary>
    /// <param name="pdfTextStripperFileName">executable that is started with the arguments <c>s "&lt;pdf&gt;"</c></param>
    /// <param name="sourcePath">directory that is scanned (top level only) for <c>*.pdf</c> and <c>*.txt</c> files</param>
    /// <param name="sourceFileNamePdf">full path of the PDF handed to the executable; it is excluded from the results</param>
    /// <returns>map of PDF file path to the content of the text file with the same name; PDF files without a text file are skipped</returns>
    private static Dictionary<string, string> PortableDocumentFormatSplit(string pdfTextStripperFileName, string sourcePath, string sourceFileNamePdf)
    {
        Dictionary<string, string> results = new();
        ProcessStartInfo processStartInfo = new(pdfTextStripperFileName, $"s \"{sourceFileNamePdf}\"")
        {
            UseShellExecute = false,
            RedirectStandardError = true,
            RedirectStandardOutput = true,
        };
        using (Process process = Process.Start(processStartInfo))
        {
            if (process is not null)
            {
                // The streams are redirected, so they must be drained; otherwise a child that
                // fills the pipe buffer blocks until the timeout elapses. The output is discarded.
                process.BeginOutputReadLine();
                process.BeginErrorReadLine();
                _ = process.WaitForExit(ProcessTimeoutMilliseconds);
            }
        }
        string checkFile;
        string[] pdfFiles = Directory.GetFiles(sourcePath, "*.pdf", SearchOption.TopDirectoryOnly);
        // Snapshot of the text files as a set for constant-time membership checks.
        HashSet<string> textFiles = new(Directory.GetFiles(sourcePath, "*.txt", SearchOption.TopDirectoryOnly));
        foreach (string pdfFile in pdfFiles)
        {
            if (pdfFile == sourceFileNamePdf)
                continue;
            checkFile = Path.ChangeExtension(pdfFile, ".txt");
            if (!textFiles.Contains(checkFile))
                continue;
            results.Add(pdfFile, File.ReadAllText(checkFile));
        }
        return results;
    }

    /// <summary>
    /// Collects pages from <c>{name}_*.txt</c> files that already exist in the source directory.
    /// </summary>
    /// <param name="sourcePath">directory to scan (top level only)</param>
    /// <param name="sourceFileNameWithoutExtension">file name prefix of the page files</param>
    /// <param name="sourceFiles">receives every text file found, whether or not a matching PDF exists</param>
    /// <param name="results">receives PDF path / page text for every text file that has a matching PDF</param>
    private static void AddPreviouslySplitPages(string sourcePath, string sourceFileNameWithoutExtension, List<string> sourceFiles, Dictionary<string, string> results)
    {
        string pageText;
        string pagePDFFile;
        string[] txtFiles = Directory.GetFiles(sourcePath, $"{sourceFileNameWithoutExtension}_*.txt", SearchOption.TopDirectoryOnly);
        // Shorter names first so that _2 sorts before _10.
        foreach (string txtFile in txtFiles.OrderBy(l => l.Length).ThenBy(l => l))
        {
            sourceFiles.Add(txtFile);
            pageText = File.ReadAllText(txtFile);
            pagePDFFile = Path.ChangeExtension(txtFile, ".pdf");
            if (!File.Exists(pagePDFFile))
                continue;
            results.Add(pagePDFFile, pageText);
        }
    }

    /// <summary>
    /// Splits the PDF into one document per page with PDFBox and collects the text of each page.
    /// Page files are named <c>{name}_{n}.pdf</c> / <c>{name}_{n}.txt</c> with n starting at 1;
    /// existing page files are reused, missing ones are written.
    /// </summary>
    /// <param name="sourceFileNamePdf">full path of the PDF to split</param>
    /// <param name="sourcePath">directory in which the page files live</param>
    /// <param name="sourceFileNameWithoutExtension">file name prefix of the page files</param>
    /// <param name="sourceFiles">receives every page file that was read or written</param>
    /// <param name="results">receives PDF path / page text for every page</param>
    private static void SplitWithPdfBox(string sourceFileNamePdf, string sourcePath, string sourceFileNameWithoutExtension, List<string> sourceFiles, Dictionary<string, string> results)
    {
        object item;
        string pageText;
        string pagePDFFile;
        string pageTextFile;
        java.io.File file = new(sourceFileNamePdf);
        org.apache.pdfbox.util.Splitter splitter = new();
        org.apache.pdfbox.pdmodel.PDDocument pdDocument = org.apache.pdfbox.pdmodel.PDDocument.load(file);
        try
        {
            java.util.List list = splitter.split(pdDocument);
            java.util.ListIterator iterator = list.listIterator();
            org.apache.pdfbox.util.PDFTextStripper dataStripper = new();
            for (short i = 1; i < short.MaxValue; i++)
            {
                if (!iterator.hasNext())
                    break;
                item = iterator.next();
                org.apache.pdfbox.pdmodel.PDDocument pd = item as org.apache.pdfbox.pdmodel.PDDocument;
                try
                {
                    pagePDFFile = Path.Combine(sourcePath, $"{sourceFileNameWithoutExtension}_{i}.pdf");
                    pageTextFile = Path.ChangeExtension(pagePDFFile, ".txt");
                    if (File.Exists(pageTextFile))
                    {
                        // Text already extracted on an earlier run.
                        pageText = File.ReadAllText(pageTextFile);
                        sourceFiles.Add(pageTextFile);
                    }
                    else if (File.Exists(pagePDFFile))
                    {
                        // Page PDF exists without its text - extract from the file on disk.
                        org.apache.pdfbox.pdmodel.PDDocument document = org.apache.pdfbox.pdmodel.PDDocument.load(pagePDFFile);
                        try
                        {
                            pageText = dataStripper.getText(document);
                        }
                        finally
                        {
                            document.close();
                        }
                        sourceFiles.Add(pagePDFFile);
                    }
                    else
                    {
                        // Neither file exists - write both from the in-memory page.
                        if (pd is null)
                            continue;
                        pageText = dataStripper.getText(pd);
                        pd.save(pagePDFFile);
                        sourceFiles.Add(pagePDFFile);
                        File.WriteAllText(pageTextFile, pageText);
                        sourceFiles.Add(pageTextFile);
                    }
                    // Items that are not a PDDocument are never reported as a page.
                    if (pd is null)
                        continue;
                    results.Add(pagePDFFile, pageText);
                }
                finally
                {
                    // Runs on every path, including continue and exceptions.
                    pd?.close();
                }
            }
        }
        finally
        {
            pdDocument.close();
        }
    }

    /// <summary>
    /// Converts the report file to PDF (unless the PDF already exists) and returns the text of each page.
    /// Pages are taken from existing <c>{name}_*.txt</c> / <c>.pdf</c> pairs when present; otherwise the PDF
    /// is split with PDFBox. If that raises <see cref="MissingMethodException"/> and no page was collected,
    /// the external text-stripper executable is used instead.
    /// </summary>
    /// <param name="logistics">supplies <c>ReportFullPath</c>, the file to convert</param>
    /// <param name="ghostPCLFileName">executable used for the PCL to PDF conversion</param>
    /// <param name="pdfTextStripperFileName">executable used by the fallback split</param>
    /// <param name="fileInfoCollection">receives a <see cref="FileInfo"/> for the PDF and for every page file that was read or written</param>
    /// <returns>read-only map of page PDF path to page text</returns>
    /// <exception cref="Exception">the PDF was not created, or the directory of the report file cannot be determined</exception>
    internal static ReadOnlyDictionary<string, string> PDF(Logistics logistics, string ghostPCLFileName, string pdfTextStripperFileName, List<FileInfo> fileInfoCollection)
    {
        Dictionary<string, string> results = new();
        List<string> sourceFiles = new();
        string sourceFileNamePdf = ConvertSourceFileToPdf(ghostPCLFileName, logistics);
        sourceFiles.Add(sourceFileNamePdf);
        string sourcePath = Path.GetDirectoryName(logistics.ReportFullPath) ?? throw new Exception($"Unable to determine the directory of <{logistics.ReportFullPath}>");
        string sourceFileNameWithoutExtension = Path.GetFileNameWithoutExtension(logistics.ReportFullPath);
        AddPreviouslySplitPages(sourcePath, sourceFileNameWithoutExtension, sourceFiles, results);
        if (results.Count == 0)
        {
            try
            {
                // All PDFBox usage lives in the helper so it is only reached from inside this try block.
                SplitWithPdfBox(sourceFileNamePdf, sourcePath, sourceFileNameWithoutExtension, sourceFiles, results);
            }
            catch (MissingMethodException)
            {
                if (results.Count == 0)
                    results = PortableDocumentFormatSplit(pdfTextStripperFileName, sourcePath, sourceFileNamePdf);
            }
        }
        foreach (string sourceFile in sourceFiles)
            fileInfoCollection.Add(new FileInfo(sourceFile));
        return new(results);
    }

}