/*
 * Decompiled with CFR 0.152.
 */
package org.apache.tika.eval.core.tokens;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.LinkOption;
import java.nio.file.OpenOption;
import java.nio.file.Path;
import java.util.Collections;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.mutable.MutableInt;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.tika.eval.core.tokens.AlphaIdeographFilterFactory;
import org.apache.tika.eval.core.tokens.CommonTokenResult;
import org.apache.tika.eval.core.tokens.LangModel;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Loads, caches and queries per-language "common token" models ({@link LangModel}).
 *
 * <p>A model for a language code is loaded lazily the first time that code is requested.
 * The manager first looks for a regular file named after the language code inside
 * {@code commonTokensDir} (when one was supplied) and otherwise falls back to the classpath
 * resource {@code /common_tokens/<langCode>}. Successfully loaded models are cached in
 * {@link #commonTokenMap}; language codes whose file could not be found, or whose file has
 * already been opened for reading, are remembered in {@link #alreadyTriedToLoad} so that the
 * load is not attempted again.
 *
 * <p>Common tokens file format, as read by this class (UTF-8, one entry per line):
 * <ul>
 *   <li>lines starting with {@code #} are comments; a comment starting with
 *       {@code #SUM_TERM_FREQS} followed by a tab supplies the number (first run of digits
 *       on the line) passed to the {@link LangModel} constructor;</li>
 *   <li>every other line is split on tabs: column 0 is the token and column 2 is parsed as
 *       a {@code long} and handed to {@code LangModel.add}; column 1 is ignored. Lines with
 *       an empty token or fewer than three columns are skipped.</li>
 * </ul>
 */
public class CommonTokenCountManager {
    private static final Logger LOG = LoggerFactory.getLogger(CommonTokenCountManager.class);
    private static final Charset COMMON_TOKENS_CHARSET = StandardCharsets.UTF_8;
    private static final String TERM_FREQS = "#SUM_TERM_FREQS\t";
    /** Extracts the first run of digits from the {@code #SUM_TERM_FREQS} line. */
    private static final Pattern DIGITS = Pattern.compile("(\\d+)");

    private final Path commonTokensDir;
    private final String defaultLangCode;
    /** Cache of loaded models keyed by language code. */
    Map<String, LangModel> commonTokenMap = new ConcurrentHashMap<String, LangModel>();
    /** Language codes for which a load was already attempted; only touched inside {@link #tryToLoad}. */
    Set<String> alreadyTriedToLoad = new HashSet<String>();

    /**
     * Creates a manager with no external tokens directory and an empty default language code;
     * the empty code is mapped to {@code LangModel.EMPTY_MODEL}.
     */
    public CommonTokenCountManager() {
        this(null, null);
    }

    /**
     * Creates a manager and eagerly tries to load the model for the default language.
     *
     * @param commonTokensDir directory holding one common tokens file per language code;
     *                        may be {@code null}, in which case only classpath resources are used
     * @param defaultLangCode language code to fall back to when a requested language has no
     *                        model; {@code null} is treated as the empty string
     */
    public CommonTokenCountManager(Path commonTokensDir, String defaultLangCode) {
        if (defaultLangCode == null) {
            defaultLangCode = "";
        }
        this.defaultLangCode = defaultLangCode;
        this.commonTokensDir = commonTokensDir;
        if (defaultLangCode.isEmpty()) {
            this.commonTokenMap.put(defaultLangCode, LangModel.EMPTY_MODEL);
            return;
        }
        this.tryToLoad(defaultLangCode);
        if (this.commonTokenMap.get(defaultLangCode) == null) {
            LOG.warn("No common tokens for default language: '{}'", defaultLangCode);
            // Guarantee that the default language always resolves to some model.
            this.commonTokenMap.put(defaultLangCode, LangModel.EMPTY_MODEL);
        }
    }

    /**
     * Counts how many of the supplied tokens are alphabetic and how many are contained in the
     * common tokens model of the given language (or of the default language when the given
     * one has no model).
     *
     * @param langCode requested language code; {@code null} or empty selects the default language
     * @param tokens   token to occurrence-count map
     * @return the counts together with the language code that was actually used
     * @throws IOException declared for interface compatibility
     */
    @Deprecated
    public CommonTokenResult countTokenOverlaps(String langCode, Map<String, MutableInt> tokens) throws IOException {
        String actualLangCode = this.getActualLangCode(langCode);
        int numUniqueCommonTokens = 0;
        int numCommonTokens = 0;
        int numUniqueAlphabeticTokens = 0;
        int numAlphabeticTokens = 0;
        LangModel model = this.commonTokenMap.get(actualLangCode);
        for (Map.Entry<String, MutableInt> e : tokens.entrySet()) {
            String token = e.getKey();
            int count = e.getValue().intValue();
            if (AlphaIdeographFilterFactory.isAlphabetic(token.toCharArray(), token.length())) {
                numAlphabeticTokens += count;
                ++numUniqueAlphabeticTokens;
            }
            if (model.contains(token)) {
                numCommonTokens += count;
                ++numUniqueCommonTokens;
            }
        }
        return new CommonTokenResult(actualLangCode, numUniqueCommonTokens, numCommonTokens,
                numUniqueAlphabeticTokens, numAlphabeticTokens);
    }

    /**
     * Returns an unmodifiable snapshot of the common tokens for the given language, falling
     * back to the default language when the given one has no model.
     *
     * @param lang requested language code; {@code null} or empty selects the default language
     * @return unmodifiable copy of the model's tokens
     */
    public Set<String> getTokens(String lang) {
        LangModel model = this.commonTokenMap.get(this.getActualLangCode(lang));
        return Collections.unmodifiableSet(new HashSet<String>(model.getTokens()));
    }

    /**
     * Returns the language codes that currently have a model in the cache.
     *
     * @return read-only view of the cached language codes; callers cannot use it to evict
     *         models from the cache
     */
    public Set<String> getLangs() {
        return Collections.unmodifiableSet(this.commonTokenMap.keySet());
    }

    /**
     * Resolves the model for a language.
     *
     * @param lang requested language code; {@code null} or empty selects the default language
     * @return pair of the language code actually used and the model mapped to it
     */
    public Pair<String, LangModel> getLangTokens(String lang) {
        String actualLangCode = this.getActualLangCode(lang);
        return Pair.of(actualLangCode, this.commonTokenMap.get(actualLangCode));
    }

    /**
     * Maps a requested language code to the code whose model should be used: the requested
     * code when a model is cached or can be loaded for it, the default code otherwise.
     */
    private String getActualLangCode(String langCode) {
        if (langCode == null || langCode.isEmpty()) {
            return this.defaultLangCode;
        }
        if (this.commonTokenMap.containsKey(langCode)) {
            return langCode;
        }
        this.tryToLoad(langCode);
        return this.commonTokenMap.get(langCode) == null ? this.defaultLangCode : langCode;
    }

    /**
     * Drops every cached model.
     *
     * @throws IOException declared for interface compatibility
     */
    public void close() throws IOException {
        this.commonTokenMap.clear();
    }

    /**
     * Attempts to load the model for {@code langCode} into {@link #commonTokenMap}.
     *
     * <p>Does nothing when a load was already attempted or a model is already cached. A missing
     * file, an I/O failure, or a file that yields no model leaves the cache untouched (a warning
     * is logged), so callers fall back to the default language.
     *
     * @throws IllegalArgumentException if a token row appears before any usable
     *                                  {@code #SUM_TERM_FREQS} line
     * @throws NumberFormatException    if a frequency value cannot be parsed as a {@code long}
     */
    private synchronized void tryToLoad(String langCode) {
        if (this.alreadyTriedToLoad.contains(langCode)) {
            return;
        }
        // Check once more now that the lock is held.
        if (this.commonTokenMap.get(langCode) != null) {
            return;
        }
        // NOTE(review): langCode is resolved against commonTokensDir without normalisation;
        // confirm callers never pass path-like values.
        Path p = this.commonTokensDir == null ? null : this.commonTokensDir.resolve(langCode);
        // A null resource is permitted by try-with-resources and is simply not closed.
        try (InputStream is = this.openCommonTokens(p, langCode)) {
            if (is == null) {
                String path = p == null
                        ? "resource on class path: /common_tokens/" + langCode
                        : p.toAbsolutePath().toString();
                LOG.warn("Couldn't find common tokens file for: '{}' tried here: {}", langCode, path);
                this.alreadyTriedToLoad.add(langCode);
                return;
            }
            this.alreadyTriedToLoad.add(langCode);
            LangModel model = readModel(is);
            if (model == null) {
                // ConcurrentHashMap rejects null values; without this guard an empty or
                // comment-only file would surface as a NullPointerException.
                LOG.warn("Common tokens file for: '{}' contained no {} line; ignoring it",
                        langCode, TERM_FREQS.trim());
                return;
            }
            this.commonTokenMap.put(langCode, model);
        } catch (IOException e) {
            LOG.warn("IOException trying to read: '{}'", langCode, e);
        }
    }

    /**
     * Opens the common tokens source for a language: the file {@code p} when it is a regular
     * file, otherwise the classpath resource {@code /common_tokens/<langCode>}.
     *
     * @return the opened stream, or {@code null} when the classpath resource does not exist
     */
    private InputStream openCommonTokens(Path p, String langCode) throws IOException {
        if (p != null && Files.isRegularFile(p)) {
            return Files.newInputStream(p);
        }
        return this.getClass().getResourceAsStream("/common_tokens/" + langCode);
    }

    /**
     * Parses a common tokens stream into a model; the stream is closed on return.
     *
     * @return the populated model, or {@code null} when the stream held no usable
     *         {@code #SUM_TERM_FREQS} line and no token rows
     */
    private static LangModel readModel(InputStream is) throws IOException {
        LangModel model = null;
        Matcher digits = DIGITS.matcher("");
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(is, COMMON_TOKENS_CHARSET))) {
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                line = line.trim();
                if (line.startsWith("#")) {
                    if (line.startsWith(TERM_FREQS) && digits.reset(line).find()) {
                        model = new LangModel(Long.parseLong(digits.group(1)));
                    }
                    continue;
                }
                String[] cols = line.split("\t");
                String token = cols[0].trim();
                if (token.length() > 0 && cols.length > 2) {
                    if (model == null) {
                        throw new IllegalArgumentException("Common tokens file must have included comment line  with #SUM_TERM_FREQS\t");
                    }
                    model.add(token, Long.parseLong(cols[2]));
                }
            }
        }
        return model;
    }
}

