package edu.berkeley.cs.nlp.ocular.lm;

import edu.berkeley.cs.nlp.ocular.data.textreader.Charset;
import edu.berkeley.cs.nlp.ocular.data.textreader.TextReader;
import edu.berkeley.cs.nlp.ocular.util.CollectionHelper;
import edu.berkeley.cs.nlp.ocular.util.FileUtil;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import tberg.murphy.indexer.Indexer;

/* loaded from: input_file:main/ocular_2.12-0.3-SNAPSHOT.jar:edu/berkeley/cs/nlp/ocular/lm/CorpusCounter.class */
/**
 * Accumulates n-gram counts over sequences of character indices.
 *
 * <p>One {@link CountDbBig} is kept per n-gram order: {@code counts[k]} stores the
 * {@code (k + 1)}-grams. Alongside the n-gram databases the counter tracks the total
 * number of tokens processed, the set of character indices that were actually seen,
 * and a per-character occurrence count.
 *
 * <p>A character index of {@code -1} marks a character that is unknown to the (locked)
 * indexer. Such a token is never counted itself and no counted n-gram may span it.
 *
 * <p>NOTE(review): nothing in this class synchronizes access to its state, so instances
 * should presumably be confined to a single thread — confirm against callers.
 */
public class CorpusCounter {
    /** Per-order count databases; index {@code k} holds the {@code (k + 1)}-grams. */
    public final CountDbBig[] counts;
    /** Highest n-gram order that is counted. */
    public final int maxNgramOrder;
    /** Number of tokens processed so far, including unknown ({@code -1}) tokens. */
    private long tokenCount;
    public final int MILLION = 1000000;
    /** Initial database size per n-gram order; orders beyond this table start at 100000000. */
    public final int[] INITIAL_CHAR_DB_SIZES = {100, 6000, 60000, 300000, 1000000, 3000000, 6000000, 10000000, 20000000, 40000000, 60000000, 80000000};
    /** Indices of all known characters seen so far, kept in sorted order. */
    private final Set<Integer> activeCharacters;
    /** Occurrence count of every known character index seen so far. */
    private final Map<Integer, Integer> unigramCounts;

    /**
     * Creates a counter for n-grams of order {@code 1..maxNgramOrder}.
     *
     * @param maxNgramOrder highest n-gram order to count; must be at least 2 because the two
     *                      highest orders are set up with dedicated databases
     * @throws IllegalArgumentException if {@code maxNgramOrder < 2}
     */
    public CorpusCounter(int maxNgramOrder) {
        if (maxNgramOrder < 2) {
            // Previously this surfaced as an ArrayIndexOutOfBounds/NegativeArraySize error below.
            throw new IllegalArgumentException("maxNgramOrder must be at least 2.  was maxNgramOrder=" + maxNgramOrder);
        }
        this.maxNgramOrder = maxNgramOrder;
        this.tokenCount = 0L;
        this.activeCharacters = new TreeSet<>();
        this.unigramCounts = new HashMap<>();
        this.counts = new CountDbBig[maxNgramOrder];
        for (int order = 0; order < maxNgramOrder; order++) {
            int initialSize = order < this.INITIAL_CHAR_DB_SIZES.length ? this.INITIAL_CHAR_DB_SIZES[order] : 100000000;
            // The highest order only ever receives token counts and the second highest never
            // receives the lower-order normalizer (see incrementCounts), so they are created
            // with 1 and 3 instead of 4. Presumably the second argument is the number of
            // count slots kept per n-gram — TODO confirm against CountDbBig.
            int countTypes;
            if (order == maxNgramOrder - 1) {
                countTypes = 1;
            } else if (order == maxNgramOrder - 2) {
                countTypes = 3;
            } else {
                countTypes = 4;
            }
            this.counts[order] = new CountDbBig(initialSize, countTypes);
        }
    }

    /**
     * Returns the per-order count databases.
     *
     * @return the internal array itself (not a copy); index {@code k} holds the {@code (k + 1)}-grams
     */
    public CountDbBig[] getCounts() {
        return this.counts;
    }

    /**
     * Counts every file returned by {@link FileUtil#recursiveFiles} for the given path.
     *
     * @param rootPath    path handed to {@code FileUtil.recursiveFiles}
     * @param maxLines    maximum number of lines to read from each individual file
     * @param charIndexer indexer used to map characters to integer indices
     * @param textReader  reader that splits a line of text into characters
     */
    public void countRecursive(String rootPath, int maxLines, Indexer<String> charIndexer, TextReader textReader) {
        System.out.println("CorpusCounter:  Count recursive starting from " + rootPath);
        for (File file : FileUtil.recursiveFiles(rootPath)) {
            System.out.println("    counting file: " + file);
            count(file.getPath(), maxLines, charIndexer, textReader);
        }
    }

    /**
     * Counts the n-grams of up to {@code maxLines} lines of a UTF-8 encoded text file.
     *
     * @param fileName    path of the file to read
     * @param maxLines    maximum number of lines to read
     * @param charIndexer indexer used to map characters to integer indices
     * @param textReader  reader that splits a line of text into characters
     * @throws RuntimeException wrapping the {@link IOException} if the file cannot be read
     */
    public void count(String fileName, int maxLines, Indexer<String> charIndexer, TextReader textReader) {
        // try-with-resources guarantees the file handle is released even when counting fails.
        try (FileInputStream in = new FileInputStream(fileName);
                BufferedReader reader = new BufferedReader(new InputStreamReader(in, "UTF-8"))) {
            String line;
            // A null from readLine() is the reliable end-of-stream signal; the line cap is
            // tested first so that no line beyond the cap is consumed.
            for (int lineNumber = 0; lineNumber < maxLines && (line = reader.readLine()) != null; lineNumber++) {
                countLine(line, charIndexer, textReader, lineNumber);
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Splits one line of text into characters and counts its n-grams.
     *
     * @param line        raw line of text
     * @param charIndexer indexer used to map characters to integer indices
     * @param textReader  reader that splits the line into characters
     * @param lineNumber  zero-based number of the line within its source
     */
    public void countLine(String line, Indexer<String> charIndexer, TextReader textReader, int lineNumber) {
        countChars(textReader.readCharacters(line), charIndexer, lineNumber);
    }

    /**
     * Maps a sequence of characters to indices and counts its n-grams.
     *
     * <p>A lone backslash is looked up as a doubled backslash. When the indexer is locked,
     * characters it does not contain are mapped to {@code -1}; otherwise every character
     * is passed to {@code getIndex}.
     *
     * @param chars       characters of one line, in order
     * @param charIndexer indexer used to map characters to integer indices
     * @param lineNumber  zero-based number of the line within its source
     */
    public void countChars(List<String> chars, Indexer<String> charIndexer, int lineNumber) {
        int[] indexed = new int[chars.size()];
        int position = 0;
        for (String character : chars) {
            String symbol = "\\".equals(character) ? "\\\\" : character;
            boolean known = !charIndexer.locked() || charIndexer.contains(symbol);
            indexed[position++] = known ? charIndexer.getIndex(symbol) : -1;
        }
        countLine(indexed, lineNumber);
    }

    /**
     * Counts the n-grams of several index-encoded lines.
     *
     * @param lines one array of character indices per line
     */
    public void count(int[][] lines) {
        for (int lineNumber = 0; lineNumber < lines.length; lineNumber++) {
            countLine(lines[lineNumber], lineNumber);
        }
    }

    /**
     * Counts the n-grams of one index-encoded line.
     *
     * <p>A sliding window holding the last {@code maxNgramOrder} tokens is advanced over the
     * line. For every known token, the longest n-gram ending at that token that does not
     * reach back past an unknown token or the start of the line is counted, together with
     * all of its shorter suffixes. Unknown tokens ({@code -1}) are not counted but still
     * advance the window and the token counter.
     *
     * @param line       character indices of the line; {@code -1} marks an unknown character
     * @param lineNumber zero-based number of the line (not used by the counting itself)
     */
    public void countLine(int[] line, int lineNumber) {
        int[] window = new int[this.maxNgramOrder];
        Arrays.fill(window, -1);
        for (int symbol : line) {
            // Slide the window one position to the left and append the current token.
            System.arraycopy(window, 1, window, 0, window.length - 1);
            window[window.length - 1] = symbol;
            if (symbol != -1) {
                // Everything after the most recent -1 is usable context.
                int order = this.maxNgramOrder - (firstMinusOneLookingBack(window) + 1);
                incrementCounts(window, order);
                Integer key = symbol;
                this.activeCharacters.add(key);
                int previous = ((Integer) CollectionHelper.getOrElse(this.unigramCounts, key, 0)).intValue();
                this.unigramCounts.put(key, previous + 1);
            }
            this.tokenCount++;
            for (CountDbBig db : this.counts) {
                db.maybeResize();
            }
        }
    }

    /**
     * Finds the position of the last {@code -1} entry in the window.
     *
     * @param window sliding window of character indices
     * @return index of the right-most {@code -1}, or {@code -1} if the window contains none
     */
    private int firstMinusOneLookingBack(int[] window) {
        for (int pos = window.length - 1; pos >= 0; pos--) {
            if (window[pos] == -1) {
                return pos;
            }
        }
        return -1;
    }

    /**
     * Prints the token count and a per-order summary of the count databases to standard output.
     *
     * @param lineNumber value printed after the {@code "Line "} label
     */
    public void printStats(int lineNumber) {
        System.out.println("=============================================");
        System.out.println("Line " + lineNumber);
        System.out.println("Number of tokens: train: " + this.tokenCount);
        for (int k = 0; k < this.counts.length; k++) {
            int order = k + 1;
            System.out.println(order + "-gram DB:\n\t" + this.counts[k].getStringAnalysis());
            System.out.println("\t" + order + "-grams total and curr: " + this.counts[k].totalSize() + Charset.SPACE + this.counts[k].currSize());
        }
    }

    /**
     * Increments the token count of the n-gram formed by the last {@code order} window entries
     * and then, recursively, of each of its shorter suffixes.
     *
     * <p>When {@code incrementCount} returns 0 for an n-gram of order greater than one, the
     * type counts of the related lower-order n-grams are updated as well.
     * NOTE(review): a return value of 0 presumably means the n-gram had not been seen
     * before — confirm against {@code CountDbBig.incrementCount}.
     *
     * @param window sliding window of character indices; the n-gram is its trailing part
     * @param order  length of the n-gram to count, at least 1
     * @throws RuntimeException if {@code order < 1}
     */
    private void incrementCounts(int[] window, int order) {
        if (order < 1) {
            throw new RuntimeException("order < 1.  was order=" + order);
        }
        NgramWrapper ngram = NgramWrapper.getNew(window, window.length - order, window.length);
        if (this.counts[order - 1].incrementCount(ngram, CountType.TOKEN_INDEX) == 0 && order > 1) {
            NgramWrapper lowerOrder = ngram.getLowerOrder();
            NgramWrapper history = ngram.getHistory();
            this.counts[order - 2].incrementCount(lowerOrder, CountType.LOWER_ORDER_TYPE_INDEX);
            this.counts[order - 2].incrementCount(history, CountType.HISTORY_TYPE_INDEX);
            if (order > 2) {
                this.counts[order - 3].incrementCount(history.getLowerOrder(), CountType.LOWER_ORDER_TYPE_NORMALIZER);
            } else {
                this.counts[order - 2].incrementBigramTypes();
            }
        }
        if (order > 1) {
            incrementCounts(window, order - 1);
        }
    }

    /**
     * Returns the number of tokens processed so far.
     *
     * @return token count, including unknown ({@code -1}) tokens
     */
    public long getTokenCount() {
        return this.tokenCount;
    }

    /**
     * Returns the indices of all known characters seen so far.
     *
     * @return the internal, mutable set itself (not a copy)
     */
    public Set<Integer> getActiveCharacters() {
        return this.activeCharacters;
    }

    /**
     * Returns how often each known character index has been seen so far.
     *
     * @return the internal, mutable map itself (not a copy)
     */
    public Map<Integer, Integer> getUnigramCounts() {
        return this.unigramCounts;
    }
}
