/*
 * Decompiled with CFR 0.152.
 */
package org.apache.nutch.util;

import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.NutchConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class EncodingDetector {
    /** Class logger. */
    private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

    /**
     * Confidence value meaning "no confidence attached to this clue". It is also the
     * fallback for {@link #MIN_CONFIDENCE_KEY}; a negative minimum confidence switches
     * byte-level detection off.
     */
    public static final int NO_THRESHOLD = -1;

    /** Configuration key holding the minimum confidence a detected charset must reach. */
    public static final String MIN_CONFIDENCE_KEY = "encodingdetector.charset.min.confidence";

    /** Maps a canonical charset name to the charset name that is reported in its place. */
    private static final HashMap<String, String> ALIASES = new HashMap<>();

    /** Content types for which byte-level charset detection is attempted. */
    private static final HashSet<String> DETECTABLES = new HashSet<>();

    /** Content of this many bytes or fewer is never handed to the detector. */
    private static final int MIN_LENGTH = 4;

    /** Minimum confidence read from the configuration; negative disables detection. */
    private final int minConfidence;

    /** ICU4J detector instance reused for every detection run of this object. */
    private final CharsetDetector detector;

    /** Clues collected so far, in the order they were added. */
    private final List<EncodingClue> clues;

    /**
     * Creates a detector with an empty clue list and a fresh ICU4J detector.
     *
     * @param conf configuration consulted for {@link #MIN_CONFIDENCE_KEY}; when the key
     *             is not set the threshold falls back to {@link #NO_THRESHOLD}
     */
    public EncodingDetector(Configuration conf) {
        clues = new ArrayList<>();
        detector = new CharsetDetector();
        minConfidence = conf.getInt(MIN_CONFIDENCE_KEY, NO_THRESHOLD);
    }

    /**
     * Collects encoding clues for the given content.
     *
     * <p>When a non-negative minimum confidence is configured, the content type is one
     * of the detectable types and the content is longer than the minimum length, every
     * ICU4J match is recorded as a "detect" clue with its confidence. Afterwards the
     * charset named in the "Content-Type" metadata entry (if any) is recorded as a
     * "header" clue.
     *
     * @param content content whose bytes and metadata are inspected
     * @param filter  forwarded to {@code CharsetDetector.enableInputFilter}
     */
    public void autoDetectClues(Content content, boolean filter) {
        final byte[] raw = content.getContent();

        final boolean runDetection = minConfidence >= 0
                && DETECTABLES.contains(content.getContentType())
                && raw.length > MIN_LENGTH;
        if (runDetection) {
            final CharsetMatch[] candidates = runIcuDetection(raw, filter);
            if (candidates != null) {
                for (CharsetMatch candidate : candidates) {
                    addClue(candidate.getName(), "detect", candidate.getConfidence());
                }
            }
        }

        // The header clue is always appended after any detection clues.
        final String headerValue = content.getMetadata().get("Content-Type");
        addClue(parseCharacterEncoding(headerValue), "header");
    }

    /**
     * Runs the ICU4J detector on the given bytes.
     *
     * @return the detector's matches, or {@code null} when the detector threw
     */
    private CharsetMatch[] runIcuDetection(byte[] raw, boolean filter) {
        try {
            detector.enableInputFilter(filter);
            detector.setText(raw);
            return detector.detectAll();
        } catch (Exception e) {
            // Best effort: a detector failure only means no "detect" clues are added.
            LOG.debug("Exception from ICU4J (ignoring): ", e);
            return null;
        }
    }

    /**
     * Records a clue after resolving its charset name.
     *
     * <p>Nothing is recorded when {@code value} is null or empty, or when
     * {@link #resolveEncodingAlias(String)} cannot resolve it.
     *
     * @param value      charset name as reported by the source
     * @param source     label describing where the clue came from
     * @param confidence confidence attached to the clue
     */
    public void addClue(String value, String source, int confidence) {
        if (value != null && !value.isEmpty()) {
            final String resolved = resolveEncodingAlias(value);
            if (resolved != null) {
                clues.add(new EncodingClue(resolved, source, confidence));
            }
        }
    }

    /**
     * Records a clue that carries no confidence value.
     *
     * @param value  charset name as reported by the source
     * @param source label describing where the clue came from
     */
    public void addClue(String value, String source) {
        addClue(value, source, NO_THRESHOLD);
    }

    /**
     * Picks an encoding from the clues collected so far.
     *
     * <p>Clues are visited in insertion order. The first clue whose confidence reaches
     * the configured minimum (only when that minimum is non-negative) wins immediately.
     * Otherwise the first clue without a confidence value is used, and if there is none
     * the supplied default is returned.
     *
     * @param content      content the clues belong to; only its base URL is read, for
     *                     trace logging
     * @param defaultValue encoding to fall back on; may be {@code null}
     * @return the chosen encoding in lower case, or {@code null} when no clue applies
     *         and {@code defaultValue} is {@code null}
     */
    public String guessEncoding(Content content, String defaultValue) {
        final String base = content.getBaseUrl();
        if (LOG.isTraceEnabled()) {
            findDisagreements(base, clues);
        }

        EncodingClue firstUnscored = null;
        for (EncodingClue clue : clues) {
            LOG.trace("{}: charset {}", base, clue);
            if (minConfidence >= 0 && clue.confidence >= minConfidence) {
                final String charset = clue.value;
                LOG.trace("{}: Choosing encoding: {} with confidence {}", base, charset, clue.confidence);
                // The stored value may fail to resolve again; fall back to it as-is
                // instead of dereferencing null.
                final String resolved = resolveEncodingAlias(charset);
                return (resolved != null ? resolved : charset).toLowerCase(Locale.ROOT);
            }
            if (firstUnscored == null && clue.confidence == NO_THRESHOLD) {
                firstUnscored = clue;
            }
        }

        if (firstUnscored != null) {
            LOG.trace("{}: Choosing encoding: {}", base, firstUnscored);
            return firstUnscored.value.toLowerCase(Locale.ROOT);
        }

        // Locale.ROOT keeps charset names ASCII; a Turkish default locale would turn "I" into a dotless i.
        final String fallback = defaultValue == null ? null : defaultValue.toLowerCase(Locale.ROOT);
        LOG.trace("{}: Choosing encoding: {} (default)", base, fallback);
        return fallback;
    }

    /** Discards every clue collected so far. */
    public void clearClues() {
        clues.clear();
    }

    /**
     * Trace-logs the full clue list when clues from different sources name different
     * encodings.
     *
     * <p>Only the first non-empty clue of each source is considered, and only clues
     * that meet the threshold count towards a disagreement.
     *
     * @param url      URL included in the log message
     * @param newClues clues to compare
     */
    private void findDisagreements(String url, List<EncodingClue> newClues) {
        final HashSet<String> seenValues = new HashSet<String>();
        final HashSet<String> seenSources = new HashSet<String>();
        boolean conflict = false;

        for (EncodingClue candidate : newClues) {
            // Set.add returns false for a source that was already recorded.
            if (candidate.isEmpty() || !seenSources.add(candidate.source)) {
                continue;
            }
            if (candidate.meetsThreshold()) {
                final boolean hadEarlierValue = !seenValues.isEmpty();
                final boolean isNewValue = seenValues.add(candidate.value);
                if (hadEarlierValue && isNewValue) {
                    conflict = true;
                }
            }
        }

        if (!conflict) {
            return;
        }

        final StringBuilder message = new StringBuilder();
        message.append("Disagreement: ").append(url).append("; ");
        String separator = "";
        for (EncodingClue candidate : newClues) {
            message.append(separator).append(candidate);
            separator = ", ";
        }
        LOG.trace(message.toString());
    }

    /**
     * Resolves a charset name to its canonical name, or to the configured replacement
     * when the canonical name has one.
     *
     * @param encoding charset name to resolve; may be {@code null}
     * @return the resolved name, or {@code null} when {@code encoding} is {@code null},
     *         not supported, or rejected by the charset lookup
     */
    public static String resolveEncodingAlias(String encoding) {
        if (encoding == null) {
            return null;
        }
        try {
            if (!Charset.isSupported(encoding)) {
                return null;
            }
            final String canonical = Charset.forName(encoding).name();
            final String replacement = ALIASES.get(canonical);
            return replacement != null ? replacement : canonical;
        } catch (Exception e) {
            LOG.warn("Invalid encoding {} detected, using default.", encoding);
            return null;
        }
    }

    /**
     * Extracts the charset parameter from a Content-Type header value.
     *
     * <p>The parameter name is matched case-insensitively, so {@code Charset=} and
     * {@code CHARSET=} are recognised as well as {@code charset=}. The value ends at the
     * next {@code ;} (or the end of the string), is trimmed, and a surrounding pair of
     * double quotes is removed.
     *
     * @param contentType header value; may be {@code null}
     * @return the charset value (possibly empty), or {@code null} when
     *         {@code contentType} is {@code null} or has no charset parameter
     */
    public static String parseCharacterEncoding(String contentType) {
        if (contentType == null) {
            return null;
        }

        final String key = "charset=";
        int start = -1;
        // regionMatches with ignoreCase avoids building a lower-cased copy, whose
        // indices would not be guaranteed to line up with the original string.
        for (int i = 0; i + key.length() <= contentType.length(); i++) {
            if (contentType.regionMatches(true, i, key, 0, key.length())) {
                start = i;
                break;
            }
        }
        if (start < 0) {
            return null;
        }

        String encoding = contentType.substring(start + key.length());
        final int end = encoding.indexOf(';');
        if (end >= 0) {
            encoding = encoding.substring(0, end);
        }
        encoding = encoding.trim();
        if (encoding.length() > 2 && encoding.startsWith("\"") && encoding.endsWith("\"")) {
            encoding = encoding.substring(1, encoding.length() - 1);
        }
        return encoding.trim();
    }

    /**
     * Command-line entry point: reads the named file, treats it as "text/html" content
     * and prints the guessed encoding.
     *
     * @param args exactly one argument, the path of the file to inspect
     * @throws IOException if the file cannot be opened or read
     */
    public static void main(String[] args) throws IOException {
        if (args.length != 1) {
            System.err.println("Usage: EncodingDetector <file>");
            System.exit(1);
        }
        Configuration conf = NutchConfiguration.create();
        EncodingDetector detector = new EncodingDetector(conf);

        ByteArrayOutputStream ostr = new ByteArrayOutputStream();
        // try-with-resources closes the file even when reading fails.
        try (BufferedInputStream istr = new BufferedInputStream(new FileInputStream(args[0]))) {
            byte[] buffer = new byte[1000];
            int len;
            // A short read does not mean end of stream; only -1 does.
            while ((len = istr.read(buffer)) != -1) {
                ostr.write(buffer, 0, len);
            }
        }

        byte[] data = ostr.toByteArray();
        Content content = new Content("", "", data, "text/html", new Metadata(), conf);
        detector.autoDetectClues(content, true);
        String encoding = detector.guessEncoding(content, conf.get("parser.character.encoding.default"));
        System.out.println("Guessed encoding: " + encoding);
    }

    static {
        DETECTABLES.add("text/html");
        DETECTABLES.add("text/plain");
        DETECTABLES.add("text/richtext");
        DETECTABLES.add("text/rtf");
        DETECTABLES.add("text/sgml");
        DETECTABLES.add("text/tab-separated-values");
        DETECTABLES.add("text/xml");
        DETECTABLES.add("application/rss+xml");
        DETECTABLES.add("application/xhtml+xml");
        ALIASES.put("ISO-8859-1", "windows-1252");
        ALIASES.put("EUC-KR", "x-windows-949");
        ALIASES.put("x-EUC-CN", "GB18030");
        ALIASES.put("GBK", "GB18030");
    }

    /**
     * One piece of evidence about the encoding of a document: a charset name, the
     * source that suggested it, and an optional confidence.
     *
     * <p>This is an inner (non-static) class because {@link #meetsThreshold()} reads the
     * enclosing detector's minimum confidence.
     */
    private class EncodingClue {
        /** Charset name in lower case, or {@code null} when none was supplied. */
        private final String value;
        /** Label describing where the clue came from. */
        private final String source;
        /** Confidence attached to the clue; negative when there is none. */
        private final int confidence;

        /** Creates a clue without a confidence value. */
        public EncodingClue(String value, String source) {
            this(value, source, NO_THRESHOLD);
        }

        /**
         * @param value      charset name; lower-cased with {@link Locale#ROOT} so the
         *                   result does not depend on the default locale; may be
         *                   {@code null}, in which case {@link #isEmpty()} returns true
         * @param source     label describing where the clue came from
         * @param confidence confidence attached to the clue; negative for none
         */
        public EncodingClue(String value, String source, int confidence) {
            this.value = value == null ? null : value.toLowerCase(Locale.ROOT);
            this.source = source;
            this.confidence = confidence;
        }

        public String getSource() {
            return this.source;
        }

        public String getValue() {
            return this.value;
        }

        /** Renders as {@code value (source)} or {@code value (source, N% confidence)}. */
        @Override
        public String toString() {
            String confidencePart = this.confidence >= 0 ? ", " + this.confidence + "% confidence" : "";
            return this.value + " (" + this.source + confidencePart + ")";
        }

        /** @return true when the clue carries no charset name */
        public boolean isEmpty() {
            return this.value == null || this.value.isEmpty();
        }

        /**
         * @return true when the clue has no confidence value, or when a non-negative
         *         minimum confidence is configured and this clue reaches it
         */
        public boolean meetsThreshold() {
            if (this.confidence < 0) {
                return true;
            }
            int required = EncodingDetector.this.minConfidence;
            return required >= 0 && this.confidence >= required;
        }
    }
}

