package edu.berkeley.cs.nlp.ocular.main;

import edu.berkeley.cs.nlp.ocular.data.textreader.BasicTextReader;
import edu.berkeley.cs.nlp.ocular.data.textreader.Charset;
import edu.berkeley.cs.nlp.ocular.data.textreader.ConvertLongSTextReader;
import edu.berkeley.cs.nlp.ocular.data.textreader.FlipUVTextReader;
import edu.berkeley.cs.nlp.ocular.data.textreader.RemoveAllDiacriticsTextReader;
import edu.berkeley.cs.nlp.ocular.data.textreader.TextReader;
import edu.berkeley.cs.nlp.ocular.data.textreader.WhitelistCharacterSetTextReader;
import edu.berkeley.cs.nlp.ocular.lm.NgramLanguageModel;
import edu.berkeley.cs.nlp.ocular.lm.SingleLanguageModel;
import edu.berkeley.cs.nlp.ocular.util.CollectionHelper;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.HashSet;
import java.util.Set;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
import tberg.murphy.fig.Option;
import tberg.murphy.fig.OptionsParser;

/* loaded from: input_file:main/ocular_2.12-0.3-SNAPSHOT.jar:edu/berkeley/cs/nlp/ocular/main/LMTrainMain.class */
/**
 * Command-line entry point that builds a character n-gram language model from a
 * text corpus and writes it to disk as a gzip-compressed, Java-serialized object.
 *
 * <p>Options are exposed as public static fields annotated with {@link Option} and
 * are populated by {@link OptionsParser} in {@link #main(String[])}.
 */
public class LMTrainMain implements Runnable {

    @Option(gloss = "Output LM file path.")
    public static String lmPath = null;

    @Option(gloss = "Input corpus path.")
    public static String textPath = null;

    @Option(gloss = "Use separate character type for long s.")
    public static boolean insertLongS = true;

    @Option(gloss = "Allow 'u' and 'v' to interchange.")
    public static boolean allowUVFlip = true;

    @Option(gloss = "Remove diacritics?")
    public static boolean removeDiacritics = false;

    @Option(gloss = "Maximum number of lines to use from corpus.")
    public static int maxLines = 1000000;

    @Option(gloss = "LM character n-gram length.")
    public static int charN = 6;

    @Option(gloss = "Exponent on LM scores.")
    public static double power = 4.0d;

    /**
     * Parses the command-line options into the static option fields and runs the
     * trainer. Exits the JVM with status 1 when option parsing fails.
     *
     * @param strArr raw command-line arguments
     */
    public static void main(String[] strArr) {
        LMTrainMain trainer = new LMTrainMain();
        OptionsParser parser = new OptionsParser();
        parser.doRegisterAll(new Object[]{trainer});
        if (!parser.doParse(strArr)) {
            System.exit(1);
        }
        trainer.run();
    }

    /**
     * Builds the language model from {@link #textPath} and writes it to
     * {@link #lmPath}.
     *
     * <p>The corpus is read through a chain of {@link TextReader} wrappers: a
     * character whitelist (punctuation, digits, ASCII letters, hyphen) around a
     * {@link BasicTextReader}, then optionally diacritic removal, long-s
     * conversion, and u/v flipping, applied in that order.
     *
     * @throws IllegalArgumentException if {@code lmPath} or {@code textPath} is unset
     */
    @Override // java.lang.Runnable
    public void run() {
        if (lmPath == null) {
            throw new IllegalArgumentException("-lmPath not set");
        }
        if (textPath == null) {
            throw new IllegalArgumentException("-textPath not set");
        }

        Set<String> punctuation = CollectionHelper.makeSet("&", ".", ",", ";", ":", "\"", "'", "!", "?", "(", ")", Charset.HYPHEN);
        Set<String> alphanumeric = CollectionHelper.makeSet("0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z");

        Set<String> allowedChars = new HashSet<String>();
        allowedChars.addAll(punctuation);
        allowedChars.addAll(alphanumeric);
        // HYPHEN is also passed to makeSet above; adding it again is harmless and
        // keeps it whitelisted regardless of how makeSet treats its arguments.
        allowedChars.add(Charset.HYPHEN);

        // NOTE(review): the meaning of BasicTextReader's boolean argument is not
        // visible here — confirm against BasicTextReader.
        TextReader reader = new WhitelistCharacterSetTextReader(allowedChars, new BasicTextReader(false));
        if (removeDiacritics) {
            reader = new RemoveAllDiacriticsTextReader(reader);
        }
        if (insertLongS) {
            reader = new ConvertLongSTextReader(reader);
        }
        if (allowUVFlip) {
            // NOTE(review): 0.5 is presumably the flip probability — confirm
            // against FlipUVTextReader.
            reader = new FlipUVTextReader(0.5d, reader);
        }

        SingleLanguageModel lm = NgramLanguageModel.buildFromText(
                textPath, maxLines, charN, NgramLanguageModel.LMType.KNESER_NEY, power, reader);
        writeLM(lm, lmPath);
    }

    /**
     * Reads a gzip-compressed, Java-serialized language model from disk.
     *
     * <p>Uses Java native deserialization; only load files from trusted sources.
     *
     * @param str path of the serialized language model file
     * @return the deserialized model, or {@code null} (after printing a message to
     *     standard output) when the file does not exist
     * @throws RuntimeException wrapping the underlying {@link IOException} or
     *     {@link ClassNotFoundException} when the file cannot be read or decoded
     */
    public static SingleLanguageModel readLM(String str) {
        File file = new File(str);
        if (!file.exists()) {
            System.out.println("Serialized lm file " + str + " not found");
            return null;
        }
        // One resource per layer so the file handle is released even when a
        // wrapping stream's constructor (which reads a header) throws.
        try (FileInputStream fileIn = new FileInputStream(file);
                GZIPInputStream gzipIn = new GZIPInputStream(fileIn);
                ObjectInputStream objectIn = new ObjectInputStream(gzipIn)) {
            return (SingleLanguageModel) objectIn.readObject();
        } catch (IOException | ClassNotFoundException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Writes a language model to disk as a gzip-compressed, Java-serialized object,
     * creating any missing parent directories first.
     *
     * @param singleLanguageModel the model to serialize
     * @param str destination file path
     * @throws RuntimeException wrapping the underlying {@link IOException} when the
     *     file cannot be created or written
     */
    public static void writeLM(SingleLanguageModel singleLanguageModel, String str) {
        // A path with no parent (e.g. a filesystem root) yields null here.
        File parentDir = new File(str).getAbsoluteFile().getParentFile();
        if (parentDir != null) {
            parentDir.mkdirs();
        }
        // Resources close in reverse order: the object stream flushes into the
        // gzip stream, which writes its trailer before the file is closed.
        try (FileOutputStream fileOut = new FileOutputStream(str);
                GZIPOutputStream gzipOut = new GZIPOutputStream(fileOut);
                ObjectOutputStream objectOut = new ObjectOutputStream(gzipOut)) {
            objectOut.writeObject(singleLanguageModel);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }
}
