package edu.berkeley.cs.nlp.ocular.main;

import edu.berkeley.cs.nlp.ocular.data.Document;
import edu.berkeley.cs.nlp.ocular.data.LazyRawImageLoader;
import edu.berkeley.cs.nlp.ocular.eval.BasicMultiDocumentTranscriber;
import edu.berkeley.cs.nlp.ocular.eval.MultiDocumentTranscriber;
import edu.berkeley.cs.nlp.ocular.eval.SingleDocumentEvaluatorAndOutputPrinter;
import edu.berkeley.cs.nlp.ocular.font.Font;
import edu.berkeley.cs.nlp.ocular.gsm.BasicGlyphSubstitutionModel;
import edu.berkeley.cs.nlp.ocular.gsm.GlyphSubstitutionModel;
import edu.berkeley.cs.nlp.ocular.gsm.NoSubGlyphSubstitutionModel;
import edu.berkeley.cs.nlp.ocular.lm.CodeSwitchLanguageModel;
import edu.berkeley.cs.nlp.ocular.model.DecoderEM;
import edu.berkeley.cs.nlp.ocular.model.em.CUDAInnerLoop;
import edu.berkeley.cs.nlp.ocular.model.em.DefaultInnerLoop;
import edu.berkeley.cs.nlp.ocular.model.em.EmissionCacheInnerLoop;
import edu.berkeley.cs.nlp.ocular.model.em.JOCLInnerLoop;
import edu.berkeley.cs.nlp.ocular.model.emission.CachingEmissionModel;
import edu.berkeley.cs.nlp.ocular.model.emission.CachingEmissionModelExplicitOffset;
import edu.berkeley.cs.nlp.ocular.model.emission.EmissionModel;
import edu.berkeley.cs.nlp.ocular.util.CollectionHelper;
import edu.berkeley.cs.nlp.ocular.util.StringHelper;
import java.io.File;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import org.jocl.CL;
import tberg.murphy.fig.Option;
import tberg.murphy.indexer.Indexer;

/* loaded from: input_file:main/ocular_2.12-0.3-SNAPSHOT.jar:edu/berkeley/cs/nlp/ocular/main/FonttrainTranscribeShared.class */
public abstract class FonttrainTranscribeShared extends LineExtractionOptions {

    // ---- Output, language model and font paths ----

    @Option(gloss = "Path of the directory that will contain output transcriptions.")
    public static String outputPath = null;

    @Option(gloss = "Output formats to be generated. Choose from one or multiple of {dipl,norm,normlines,comp,html,alto}, comma-separated.  dipl = diplomatic, norm = normalized (lines joined), normlines = normalized (separate lines), comp = comparisons.  Default: dipl,norm if -allowGlyphSubstitution=true; dipl otherwise.")
    public static String outputFormats = "";

    @Option(gloss = "Path to the input language model file.")
    public static String inputLmPath = null;

    @Option(gloss = "Path of the input font file.")
    public static String inputFontPath = null;

    @Option(gloss = "Path to write the learned font file to. Required if updateFont is set to true, otherwise ignored.")
    public static String outputFontPath = null;

    @Option(gloss = "Number of documents to process for each parameter update.  This is useful if you are transcribing a large number of documents, and want to have Ocular slowly improve the model as it goes, which you would achieve with updateFont=true.  Default: Update only after each full pass over the document set.")
    public static int updateDocBatchSize = -1;

    @Option(gloss = "Should the language model be updated along with the font?")
    public static boolean updateLM = false;

    @Option(gloss = "Path to write the retrained language model file to. Required if updateLM is set to true, otherwise ignored.")
    public static String outputLmPath = null;

    // ---- Glyph substitution model (GSM) ----

    @Option(gloss = "Should the model allow glyph substitutions? This includes substituted letters as well as letter elisions.")
    public static boolean allowGlyphSubstitution = false;

    @Option(gloss = "Path to the input glyph substitution model file. (Only relevant if allowGlyphSubstitution is set to true.) Default: Don't use a pre-initialized GSM. (Learn one from scratch).")
    public static String inputGsmPath = null;

    @Option(gloss = "Exponent on GSM scores.")
    public static double gsmPower = 4.0d;

    @Option(gloss = "The prior probability of not-substituting the LM char. This includes substituted letters as well as letter elisions.")
    public static double gsmNoCharSubPrior = 0.9d;

    @Option(gloss = "Should the GSM be allowed to elide letters even without the presence of an elision-marking tilde?")
    public static boolean gsmElideAnything = false;

    @Option(gloss = "Should the glyph substitution model be trained (or updated) along with the font? (Only relevant if allowGlyphSubstitution is set to true.)")
    public static boolean updateGsm = false;

    @Option(gloss = "Path to write the retrained glyph substitution model file to. Required if updateGsm is set to true, otherwise ignored.")
    public static String outputGsmPath = null;

    @Option(gloss = "The default number of counts that every glyph gets in order to smooth the glyph substitution model estimation.")
    public static double gsmSmoothingCount = 1.0d;

    @Option(gloss = "gsmElisionSmoothingCountMultiplier.")
    public static double gsmElisionSmoothingCountMultiplier = 100.0d;

    // ---- Execution, decoding and emission model ----

    @Option(gloss = "Should documents that cause errors be skipped instead of stopping the whole program?")
    public static boolean skipFailedDocs = false;

    @Option(gloss = "Engine to use for inner loop of emission cache computation. `DEFAULT`: Uses Java on CPU, which works on any machine but is the slowest method. `OPENCL`: Faster engine that uses either the CPU or integrated GPU (depending on processor) and requires OpenCL installation. `CUDA`: Fastest method, but requires a discrete NVIDIA GPU and CUDA installation.")
    public static EmissionCacheInnerLoopType emissionEngine = EmissionCacheInnerLoopType.DEFAULT;

    @Option(gloss = "Size of beam for Viterbi inference. (Usually in range 10-50. Increasing beam size can improve accuracy, but will reduce speed.)")
    public static int beamSize = 10;

    @Option(gloss = "GPU ID when using CUDA emission engine.")
    public static int cudaDeviceID = 0;

    @Option(gloss = "Number of threads to use for L-BFGS during m-step.")
    public static int numMstepThreads = 8;

    @Option(gloss = "Number of threads to use during emission cache computation. (Only has effect when emissionEngine is set to DEFAULT.)")
    public static int numEmissionCacheThreads = 8;

    @Option(gloss = "Number of threads to use for decoding. (More threads may increase speed, but may cause a loss of continuity across lines.)")
    public static int numDecodeThreads = 1;

    @Option(gloss = "Number of lines that compose a single decode batch. (Smaller batch size can reduce memory consumption.)")
    public static int decodeBatchSize = 32;

    @Option(gloss = "Min horizontal padding between characters in pixels. (Best left at default value.)")
    public static int paddingMinWidth = 1;

    @Option(gloss = "Max horizontal padding between characters in pixels (Best left at default value.)")
    public static int paddingMaxWidth = 5;

    @Option(gloss = "Use Markov chain to generate vertical offsets. (Slower, but more accurate. Turning on Markov offsets may require larger beam size for good results.)")
    public static boolean markovVerticalOffset = false;

    // The previous help text here described an unrelated language-model option; this is a boolean
    // flag that is handed to the decoder.
    @Option(gloss = "Should the model allow language switching on punctuation?")
    public static boolean allowLanguageSwitchOnPunct = true;

    // ---- Evaluation during training ----

    @Option(gloss = "When evaluation should be done during training (after each parameter update in EM), this is the path of the directory that contains the evaluation input document images. The entire directory will be recursively searched for any files that do not end in `.txt` (and that do not start with `.`). (Only relevant if updateFont is set to true.)")
    public static String evalInputDocPath = null;

    @Option(gloss = "When using -evalInputDocPath, this is the path of the directory where the evaluation line-extraction images should be read/written.  If the line files exist here, they will be used; if not, they will be extracted and then written here.  Useful if: 1) you plan to run Ocular on the same documents multiple times and you want to save some time by not re-extracting the lines, or 2) you use an alternate line extractor (such as Tesseract) to pre-process the document.  If ignored, the document will simply be read from the original document image file, and no line images will be written.")
    public static String evalExtractedLinesPath = null;

    // Integer.MAX_VALUE (rather than an OpenCL constant of the same value) so that this default
    // does not depend on the JOCL library.
    @Option(gloss = "When using -evalInputDocPath, this is the number of documents that will be evaluated on. Ignore or use 0 to use all documents. Default: Use all documents in the specified path.")
    public static int evalNumDocs = Integer.MAX_VALUE;

    @Option(gloss = "When using -evalInputDocPath, on iterations in which we run the evaluation, should the evaluation be run after each batch, as determined by -updateDocBatchSize (in addition to after each iteration)?")
    public static boolean evalBatches = false;

    /* loaded from: input_file:main/ocular_2.12-0.3-SNAPSHOT.jar:edu/berkeley/cs/nlp/ocular/main/FonttrainTranscribeShared$EmissionCacheInnerLoopType.class */
    /**
     * Selects the implementation used for the inner loop of the emission cache computation
     * (see the {@code emissionEngine} option and {@code getEmissionInnerLoop()}).
     */
    public enum EmissionCacheInnerLoopType {
        /** Maps to {@code DefaultInnerLoop}; described in the option help as plain Java on the CPU. */
        DEFAULT,
        /** Maps to {@code JOCLInnerLoop}; described in the option help as requiring an OpenCL installation. */
        OPENCL,
        /** Maps to {@code CUDAInnerLoop}; described in the option help as requiring an NVIDIA GPU and CUDA. */
        CUDA
    }

    /* loaded from: input_file:main/ocular_2.12-0.3-SNAPSHOT.jar:edu/berkeley/cs/nlp/ocular/main/FonttrainTranscribeShared$OutputFormat.class */
    public enum OutputFormat {
        DIPL,
        NORM,
        NORMLINES,
        COMP,
        HTML,
        ALTO,
        WHITESPACE
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Parses the comma-separated {@code outputFormats} option into a set of {@link OutputFormat}s.
     *
     * <p>All whitespace is stripped and empty entries are ignored. Names are matched
     * case-insensitively, using locale-independent upper-casing so that the result does not depend
     * on the JVM's default locale. If no format is requested, the result defaults to
     * {@code DIPL}, plus {@code NORM} when {@code allowGlyphSubstitution} is true.
     *
     * @return the requested (or default) output formats; never empty
     * @throws IllegalArgumentException if any entry is not a valid format name, or if
     *         {@code norm}/{@code normlines} is requested while {@code allowGlyphSubstitution} is false
     */
    public static Set<OutputFormat> parseOutputFormats() {
        Set<OutputFormat> formats = new HashSet<>();
        List<String> invalidNames = new ArrayList<>();

        // Treat an unset option the same as an empty one.
        String requested = (outputFormats == null) ? "" : outputFormats;
        for (String name : requested.replaceAll("\\s+", "").split(",")) {
            if (name.isEmpty()) {
                continue;
            }
            OutputFormat format;
            try {
                // Locale.ROOT: the default-locale variant maps 'i' to a dotted capital I under
                // e.g. a Turkish locale, which would make "dipl" fail to match DIPL.
                format = OutputFormat.valueOf(name.toUpperCase(Locale.ROOT));
            } catch (IllegalArgumentException e) {
                // Collect every bad name so they can all be reported in one message below.
                invalidNames.add(name);
                continue;
            }
            if ((format == OutputFormat.NORM || format == OutputFormat.NORMLINES) && !allowGlyphSubstitution) {
                throw new IllegalArgumentException("-outputFormats 'norm' and 'normlines' are not valid if -allowGlyphSubstitution is false");
            }
            formats.add(format);
        }

        if (!invalidNames.isEmpty()) {
            throw new IllegalArgumentException("Invalid output formats: {" + StringHelper.join(invalidNames, ", ") + "}");
        }
        if (formats.isEmpty()) {
            formats.add(OutputFormat.DIPL);
            if (allowGlyphSubstitution) {
                formats.add(OutputFormat.NORM);
            }
        }
        return formats;
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Validates the shared font-training/transcription options and creates the output directory.
     *
     * <p>Runs the superclass validation first, then checks that required paths are set, that
     * input files exist, and that the update/allow flags are consistent with the input/output
     * paths that were supplied. The output directory is created only after every check has passed.
     *
     * @throws IllegalArgumentException if a required option is missing or options are inconsistent
     * @throws RuntimeException if an input file does not exist or the output directory cannot be created
     */
    @Override // edu.berkeley.cs.nlp.ocular.main.LineExtractionOptions, edu.berkeley.cs.nlp.ocular.main.OcularRunnable
    public void validateOptions() {
        super.validateOptions();

        if (outputPath == null) {
            throw new IllegalArgumentException("-outputPath not set");
        }
        // Called only for validation: throws if -outputFormats contains an invalid entry.
        parseOutputFormats();

        // Font and language model inputs are mandatory.
        if (inputFontPath == null) {
            throw new IllegalArgumentException("-inputFontPath is required");
        }
        requireInputFileExists("inputFontPath", inputFontPath);
        if (inputLmPath == null) {
            throw new IllegalArgumentException("-inputLmPath is required");
        }
        requireInputFileExists("inputLmPath", inputLmPath);

        // Language model retraining: -updateLM and -outputLmPath must be given together.
        if (updateLM && outputLmPath == null) {
            throw new IllegalArgumentException("-outputLmPath required when -updateLM is true.");
        }
        if (!updateLM && outputLmPath != null) {
            throw new IllegalArgumentException("-outputLmPath not permitted when -updateLM is false.");
        }
        if (outputLmPath != null && outputFontPath == null) {
            throw new IllegalArgumentException("It is not possible to retrain the LM (-updateLM=true) when not retraining the font (-updateFont=false).");
        }

        // Glyph substitution model: every GSM option requires -allowGlyphSubstitution, and
        // -updateGsm and -outputGsmPath must be given together.
        if (updateGsm && !allowGlyphSubstitution) {
            throw new IllegalArgumentException("-updateGsm not permitted if -allowGlyphSubstitution is false.");
        }
        if (inputGsmPath != null) {
            requireInputFileExists("inputGsmPath", inputGsmPath);
        }
        if (inputGsmPath != null && !allowGlyphSubstitution) {
            throw new IllegalArgumentException("-inputGsmPath not permitted if -allowGlyphSubstitution is false.");
        }
        if (outputGsmPath != null && !allowGlyphSubstitution) {
            throw new IllegalArgumentException("-outputGsmPath not permitted if -allowGlyphSubstitution is false.");
        }
        if (updateGsm && outputGsmPath == null) {
            throw new IllegalArgumentException("-outputGsmPath required when -updateGsm is true.");
        }
        if (!updateGsm && outputGsmPath != null) {
            throw new IllegalArgumentException("-outputGsmPath not permitted when -updateGsm is false.");
        }
        if (allowGlyphSubstitution && inputGsmPath == null && outputGsmPath == null) {
            throw new IllegalArgumentException("If -allowGlyphSubstitution=true, either an -inputGsmPath must be given, or a GSM must be trained by giving an -outputGsmPath.");
        }
        if (outputGsmPath != null && outputFontPath == null) {
            throw new IllegalArgumentException("It is not possible to retrain the GSM (-updateGsm=true) when not retraining the font (-updateFont=false).");
        }

        if (evalExtractedLinesPath != null && evalInputDocPath == null) {
            throw new IllegalArgumentException("-evalExtractedLinesPath not permitted without -evalInputDocPath.");
        }

        // mkdirs() returns false both on failure and when the directory already exists, so only
        // fail if the directory is still missing afterwards.
        File outputDir = new File(outputPath);
        if (!outputDir.mkdirs() && !outputDir.isDirectory()) {
            throw new RuntimeException("Could not create output directory " + outputPath + " [looking in " + new File(".").getAbsolutePath() + "]");
        }
    }

    /**
     * Throws if {@code path} does not refer to an existing file-system entry.
     *
     * @param optionName option name (without leading dash) used in the error message
     * @param path non-null path to check
     * @throws RuntimeException if nothing exists at {@code path}
     */
    private static void requireInputFileExists(String optionName, String path) {
        if (!new File(path).exists()) {
            throw new RuntimeException(optionName + " " + path + " does not exist [looking in " + new File(".").getAbsolutePath() + "]");
        }
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Loads the code-switch language model from {@code inputLmPath} and prints a summary of it.
     *
     * <p>The summary written to standard output lists, for each language, its active characters
     * in sorted order, followed by the sorted full character inventory and its size.
     *
     * @return the loaded language model
     */
    public static CodeSwitchLanguageModel loadInputLM() {
        System.out.println("Loading initial LM from " + inputLmPath);
        CodeSwitchLanguageModel lm = InitializeLanguageModel.readCodeSwitchLM(inputLmPath);
        System.out.println("Loaded CodeSwitchLanguageModel from " + inputLmPath);

        Indexer<String> characterIndexer = lm.getCharacterIndexer();
        int numLanguages = lm.getLanguageIndexer().size();
        for (int langIndex = 0; langIndex < numLanguages; langIndex++) {
            // Map the language's active character indices back to their string form.
            List<String> activeCharacters = new ArrayList<>();
            for (int charIndex : lm.get(langIndex).getActiveCharacters()) {
                activeCharacters.add(characterIndexer.getObject(charIndex));
            }
            Collections.sort(activeCharacters);
            System.out.println("    " + lm.getLanguageIndexer().getObject(langIndex) + ": " + activeCharacters);
        }

        // Sort a copy so the indexer's own ordering is left untouched.
        List<String> allCharacters = CollectionHelper.makeList(characterIndexer.getObjects());
        Collections.sort(allCharacters);
        System.out.println("Characters: " + allCharacters);
        System.out.println("Num characters: " + characterIndexer.size());
        return lm;
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Reads the font stored at {@code inputFontPath}, announcing the load on standard output.
     *
     * @return the font read from {@code inputFontPath}
     */
    public static Font loadInputFont() {
        final String fontPath = inputFontPath;
        System.out.println("Loading font from " + fontPath);
        return InitializeFont.readFont(fontPath);
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Builds a glyph substitution model factory configured from the GSM options and the
     * indexers and per-language active character sets of the given language model.
     *
     * @param codeSwitchLanguageModel language model supplying the language and character indexers
     * @return a new factory
     */
    public static BasicGlyphSubstitutionModel.BasicGlyphSubstitutionModelFactory makeGsmFactory(CodeSwitchLanguageModel codeSwitchLanguageModel) {
        Indexer<String> charIndexer = codeSwitchLanguageModel.getCharacterIndexer();
        Set<Integer>[] activeCharacterSets = makeActiveCharacterSets(codeSwitchLanguageModel);
        return new BasicGlyphSubstitutionModel.BasicGlyphSubstitutionModelFactory(
                gsmSmoothingCount,
                gsmElisionSmoothingCountMultiplier,
                codeSwitchLanguageModel.getLanguageIndexer(),
                charIndexer,
                activeCharacterSets,
                gsmPower,
                0, // NOTE(review): meaning of this constant is defined by the factory constructor — confirm
                outputPath);
    }

    /**
     * Collects the active character set of every language in the model.
     *
     * @param codeSwitchLanguageModel the language model to read from
     * @return an array indexed by language index; element {@code i} is the set returned by
     *         {@code codeSwitchLanguageModel.get(i).getActiveCharacters()}
     */
    public static Set<Integer>[] makeActiveCharacterSets(CodeSwitchLanguageModel codeSwitchLanguageModel) {
        final int numLanguages = codeSwitchLanguageModel.getLanguageIndexer().size();
        // Generic arrays cannot be created directly; the raw array holds only Set<Integer> values.
        @SuppressWarnings("unchecked")
        final Set<Integer>[] activeByLanguage = new Set[numLanguages];
        int langIndex = 0;
        while (langIndex < numLanguages) {
            activeByLanguage[langIndex] = codeSwitchLanguageModel.get(langIndex).getActiveCharacters();
            langIndex++;
        }
        return activeByLanguage;
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Chooses the initial glyph substitution model according to the options.
     *
     * <ul>
     *   <li>Glyph substitution disabled: a {@code NoSubGlyphSubstitutionModel}.</li>
     *   <li>Enabled with {@code inputGsmPath} set: the model read from that path.</li>
     *   <li>Enabled without an input path: the factory's uniform model.</li>
     * </ul>
     *
     * @param basicGlyphSubstitutionModelFactory factory used only for the uniform fallback
     * @return the initial GSM
     */
    public static GlyphSubstitutionModel loadInitialGSM(BasicGlyphSubstitutionModel.BasicGlyphSubstitutionModelFactory basicGlyphSubstitutionModelFactory) {
        final GlyphSubstitutionModel initialGsm;
        if (allowGlyphSubstitution && inputGsmPath != null) {
            System.out.println("Loading initial GSM from " + inputGsmPath);
            initialGsm = InitializeGlyphSubstitutionModel.readGSM(inputGsmPath);
        } else if (allowGlyphSubstitution) {
            System.out.println("No initial GSM provided; initializing to uniform model.");
            initialGsm = basicGlyphSubstitutionModelFactory.uniform();
        } else {
            System.out.println("Glyph substitution not allowed; constructing no-sub GSM.");
            initialGsm = new NoSubGlyphSubstitutionModel();
        }
        return initialGsm;
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Creates a decoder wired to a fresh emission model factory and the current decoding options.
     *
     * @param indexer character indexer passed on to the emission model factory
     * @return a new {@code DecoderEM}
     */
    public static DecoderEM makeDecoder(Indexer<String> indexer) {
        EmissionModel.EmissionModelFactory emissionModelFactory = makeEmissionModelFactory(indexer);
        return new DecoderEM(
                emissionModelFactory,
                allowGlyphSubstitution,
                gsmNoCharSubPrior,
                gsmElideAnything,
                allowLanguageSwitchOnPunct,
                markovVerticalOffset,
                beamSize,
                numDecodeThreads,
                numMstepThreads,
                decodeBatchSize);
    }

    /**
     * Creates the emission model factory selected by {@code markovVerticalOffset}: the
     * explicit-offset caching factory when it is true, the plain caching factory otherwise.
     * Both are given the padding bounds and the configured emission inner loop.
     *
     * @param indexer character indexer handed to the factory
     * @return the emission model factory
     */
    protected static EmissionModel.EmissionModelFactory makeEmissionModelFactory(Indexer<String> indexer) {
        final EmissionCacheInnerLoop innerLoop = getEmissionInnerLoop();
        if (markovVerticalOffset) {
            return new CachingEmissionModelExplicitOffset.CachingEmissionModelExplicitOffsetFactory(indexer, paddingMinWidth, paddingMaxWidth, innerLoop);
        }
        return new CachingEmissionModel.CachingEmissionModelFactory(indexer, paddingMinWidth, paddingMaxWidth, innerLoop);
    }

    /**
     * Instantiates the emission cache inner loop that corresponds to {@code emissionEngine}.
     *
     * @return a new inner loop for the selected engine
     * @throws RuntimeException if the engine value has no corresponding implementation
     */
    protected static EmissionCacheInnerLoop getEmissionInnerLoop() {
        final EmissionCacheInnerLoop innerLoop;
        switch (emissionEngine) {
            case CUDA:
                innerLoop = new CUDAInnerLoop(numEmissionCacheThreads, cudaDeviceID);
                break;
            case OPENCL:
                innerLoop = new JOCLInnerLoop(numEmissionCacheThreads);
                break;
            case DEFAULT:
                innerLoop = new DefaultInnerLoop(numEmissionCacheThreads);
                break;
            default:
                throw new RuntimeException("emissionEngine=" + emissionEngine + " not supported");
        }
        return innerLoop;
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Creates the transcriber used to evaluate on the held-out evaluation set.
     *
     * <p>When {@code evalInputDocPath} is unset a no-op transcriber is returned. Otherwise the
     * evaluation documents are loaded and each one must provide at least one gold transcription
     * (diplomatic lines or normalized text).
     *
     * @param indexer character indexer passed to the transcriber
     * @param decoderEM decoder used for transcription
     * @param singleDocumentEvaluatorAndOutputPrinter per-document evaluator/printer
     * @return a no-op transcriber, or a {@code BasicMultiDocumentTranscriber} over the evaluation documents
     * @throws NoDocumentsFoundException if no evaluation documents were found
     * @throws RuntimeException if an evaluation document has neither gold transcription
     */
    public static MultiDocumentTranscriber makeEvalSetEvaluator(Indexer<String> indexer, DecoderEM decoderEM, SingleDocumentEvaluatorAndOutputPrinter singleDocumentEvaluatorAndOutputPrinter) {
        if (evalInputDocPath != null) {
            List<Document> evalDocuments = LazyRawImageLoader.loadDocuments(evalInputDocPath, evalExtractedLinesPath, evalNumDocs, 0, uniformLineHeight, binarizeThreshold, crop);
            if (evalDocuments.isEmpty()) {
                throw new NoDocumentsFoundException("No evaluation documents found! Checked -evalInputDocPath = " + evalInputDocPath);
            }
            for (Document evalDocument : evalDocuments) {
                // Both loaders are always invoked, in this order, before the result is tested.
                boolean missingDiplomatic = evalDocument.loadDiplomaticTextLines() == null;
                boolean missingNormalized = evalDocument.loadNormalizedText() == null;
                if (missingDiplomatic && missingNormalized) {
                    throw new RuntimeException("Evaluation document " + evalDocument.baseName() + " has no gold transcriptions.");
                }
            }
            return new BasicMultiDocumentTranscriber(evalDocuments, evalInputDocPath, outputPath, parseOutputFormats(), decoderEM, singleDocumentEvaluatorAndOutputPrinter, indexer, skipFailedDocs);
        }
        return new MultiDocumentTranscriber.NoOpMultiDocumentTranscriber();
    }
}
