/*
 * Decompiled with CFR 0.152.
 */
package org.apache.nutch.analysis.lang;

import java.io.InputStream;
import java.lang.invoke.MethodHandles;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.HtmlParseFilter;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.NodeWalker;
import org.apache.tika.langdetect.optimaize.OptimaizeLangDetector;
import org.apache.tika.language.detect.LanguageResult;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;

public class HTMLLanguageParser
implements HtmlParseFilter {
    /** Class logger; assigned in the static initializer. */
    private static final Logger LOG;
    /** Index of "detect" in the lang.extraction.policy list, or -1 when it is not listed. */
    private int detect = -1;
    /** Index of "identify" in the lang.extraction.policy list, or -1 when it is not listed. */
    private int identify = -1;
    /** Value of lang.analyze.max.length: cap on the text handed to the identifier; -1 means no cap. */
    private int contentMaxlength = -1;
    /** Value of lang.identification.only.certain: when true, uncertain identification results are discarded. */
    private boolean onlyCertain = false;
    /** Statistical detector; only created by setConf when "identify" is part of the policy. */
    private OptimaizeLangDetector languageDetector;
    /** Maps language keys and their aliases from langmappings.properties to the language key; filled by the static initializer. */
    private static Map<String, String> LANGUAGES_MAP;
    /** Configuration most recently passed to setConf. */
    private Configuration conf;

    /**
     * Determines the language of the page and, when one is found, stores it
     * under the {@code "language"} key of the parse metadata.
     *
     * <p>The order in which the two strategies run follows the positions
     * recorded in {@code detect} and {@code identify}: the strategy with the
     * lower index runs first and the other one is used only as a fallback
     * when the first returns {@code null}. A strategy with a negative index
     * is not used at all.
     *
     * @param content     fetched content; only its URL is used, to look up the parse
     * @param parseResult parse results to annotate
     * @param metaTags    not used by this filter
     * @param doc         DOM of the page, inspected by the "detect" strategy
     * @return the same {@code parseResult} instance, possibly with the
     *         language added to the metadata of the parse for this URL
     */
    @Override
    public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
        boolean detectEnabled = this.detect >= 0;
        boolean identifyEnabled = this.identify >= 0;
        if (!detectEnabled && !identifyEnabled) {
            LOG.warn("No configuration for language extraction policy is provided");
            return parseResult;
        }

        Parse parse = parseResult.get(content.getUrl());
        if (parse == null) {
            // Without a parse for this URL there is nothing to inspect or
            // annotate; previously the detect strategy dereferenced it and
            // failed with a NullPointerException.
            LOG.debug("No parse available for {}, skipping language extraction", content.getUrl());
            return parseResult;
        }

        // "detect" goes first when it is the only strategy or is listed
        // before "identify" in the policy.
        boolean detectFirst = !identifyEnabled || (detectEnabled && this.detect < this.identify);

        String lang;
        if (detectFirst) {
            lang = this.detectLanguage(parse, doc);
            if (lang == null && identifyEnabled) {
                lang = this.identifyLanguage(parse);
            }
        } else {
            lang = this.identifyLanguage(parse);
            if (lang == null && detectEnabled) {
                lang = this.detectLanguage(parse, doc);
            }
        }

        if (lang != null) {
            parse.getData().getParseMeta().set("language", lang);
        }
        return parseResult;
    }

    /**
     * Looks for a declared language, trying three sources in order: the
     * parse metadata, the document markup (via {@link LanguageParser}) and
     * finally the {@code Content-Language} entry of the content metadata.
     *
     * @param page parse whose metadata is consulted
     * @param doc  DOM of the page handed to {@link LanguageParser}
     * @return the first non-null language found, or {@code null} if none of
     *         the sources provides one
     */
    private String detectLanguage(Parse page, DocumentFragment doc) {
        String fromParseMeta = HTMLLanguageParser.getLanguageFromMetadata(page.getData().getParseMeta());
        if (fromParseMeta != null) {
            return fromParseMeta;
        }

        String fromMarkup = new LanguageParser(doc).getLanguage();
        if (fromMarkup != null) {
            return fromMarkup;
        }

        // Last resort: whatever the content metadata holds (may be null).
        return page.getData().getContentMeta().get("Content-Language");
    }

    /**
     * Runs statistical language identification on the title and text of a
     * parse.
     *
     * <p>The title (when present) and the text are concatenated, separated
     * by a single space, and truncated to {@code contentMaxlength}
     * characters when that limit is non-negative.
     *
     * @param parse parse to analyse; may be {@code null}
     * @return the identified language, or {@code null} when {@code parse} is
     *         {@code null} or the identifier yields no usable result
     */
    private String identifyLanguage(Parse parse) {
        if (parse == null) {
            return null;
        }
        StringBuilder text = new StringBuilder();
        String title = parse.getData().getTitle();
        if (title != null) {
            text.append(title).append(' ');
        }
        String content = parse.getText();
        if (content != null) {
            text.append(content);
        }
        // Any negative limit means "no limit". Checking only for -1 let other
        // negative values reach setLength(), which rejects negative lengths.
        if (this.contentMaxlength >= 0 && text.length() > this.contentMaxlength) {
            text.setLength(this.contentMaxlength);
        }
        return this.identifyLanguage(text);
    }

    /**
     * Identifies the language of the given text with the configured
     * detector.
     *
     * <p>The detector is reset and queried while holding this instance's
     * monitor, so the reset/detect pair is never interleaved with another
     * call on the same filter instance.
     *
     * @param text text to analyse
     * @return the language reported by the detector, or {@code null} when
     *         {@code onlyCertain} is set and the result is not reasonably
     *         certain
     */
    String identifyLanguage(CharSequence text) {
        final LanguageResult detected;
        synchronized (this) {
            languageDetector.reset();
            detected = languageDetector.detect(text);
        }
        boolean rejected = onlyCertain && !detected.isReasonablyCertain();
        return rejected ? null : detected.getLanguage();
    }

    /**
     * Reads a language value from metadata, checking the keys
     * {@code dc.language}, {@code content-language} and {@code lang} in that
     * order.
     *
     * @param meta metadata to inspect; may be {@code null}
     * @return the value of the first key that is present, or {@code null}
     *         when {@code meta} is {@code null} or none of the keys is set
     */
    private static String getLanguageFromMetadata(Metadata meta) {
        if (meta == null) {
            return null;
        }
        final String[] keysByPriority = {"dc.language", "content-language", "lang"};
        for (String key : keysByPriority) {
            String value = meta.get(key);
            if (value != null) {
                return value;
            }
        }
        return null;
    }

    /**
     * Stores the configuration and derives the filter settings from it.
     *
     * <p>Properties read:
     * <ul>
     *   <li>{@code lang.analyze.max.length} - text length limit for
     *       identification (default -1, no limit)</li>
     *   <li>{@code lang.identification.only.certain} - discard uncertain
     *       identification results (default {@code false})</li>
     *   <li>{@code lang.extraction.policy} - list containing
     *       {@code detect} and/or {@code identify}; the position of each
     *       entry decides which strategy runs first</li>
     * </ul>
     *
     * <p>The language detector and its models are only loaded when
     * {@code identify} is part of the policy.
     *
     * @param conf configuration to read the settings from
     */
    public void setConf(Configuration conf) {
        this.conf = conf;
        this.contentMaxlength = conf.getInt("lang.analyze.max.length", -1);
        this.onlyCertain = conf.getBoolean("lang.identification.only.certain", false);

        // Start from a clean slate so that a second call with a different
        // policy does not keep positions from the previous configuration.
        this.detect = -1;
        this.identify = -1;

        // getStrings() yields null when the property is not set at all; in
        // that case both strategies stay disabled.
        String[] policy = conf.getStrings("lang.extraction.policy");
        if (policy != null) {
            for (int i = 0; i < policy.length; ++i) {
                // Trim so that "detect, identify" is accepted even if the
                // configuration does not strip whitespace around entries.
                String strategy = policy[i].trim();
                if ("detect".equals(strategy)) {
                    this.detect = i;
                } else if ("identify".equals(strategy)) {
                    this.identify = i;
                }
            }
        }

        if (this.identify >= 0) {
            this.languageDetector = new OptimaizeLangDetector();
            this.languageDetector.loadModels();
        }
    }

    /**
     * Returns the configuration most recently passed to
     * {@link #setConf(Configuration)}.
     *
     * @return the stored configuration, or {@code null} if none was set
     */
    public Configuration getConf() {
        return conf;
    }

    static {
        block4: {
            LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
            LANGUAGES_MAP = new HashMap<String, String>();
            try {
                Properties p = new Properties();
                p.load(HTMLLanguageParser.class.getResourceAsStream("langmappings.properties"));
                Enumeration<Object> keys = p.keys();
                while (keys.hasMoreElements()) {
                    String key = (String)keys.nextElement();
                    String[] values = p.getProperty(key).split(",", -1);
                    LANGUAGES_MAP.put(key, key);
                    for (int i = 0; i < values.length; ++i) {
                        LANGUAGES_MAP.put(values[i].trim().toLowerCase(), key);
                    }
                }
            }
            catch (Exception e) {
                if (!LOG.isErrorEnabled()) break block4;
                LOG.error(e.toString());
            }
        }
    }

    /**
     * Extracts a declared language from the markup of a document.
     *
     * <p>Three declarations are recognised while walking the DOM:
     * <ol>
     *   <li>the {@code lang} attribute of an element,</li>
     *   <li>{@code <meta name="dc.language" content="...">},</li>
     *   <li>{@code <meta http-equiv="content-language" content="...">}.</li>
     * </ol>
     * When several are present, the result is chosen in exactly that order
     * of preference. Raw values are normalised with
     * {@link #parseLanguage(String)}.
     */
    static class LanguageParser {
        /** Characters that separate candidate tokens inside a raw language value. */
        private static final Pattern SEPARATORS = Pattern.compile(",| |;|\\.|\\(|\\)|=");

        private String dublinCore = null;
        private String htmlAttribute = null;
        private String httpEquiv = null;
        private String language = null;

        /**
         * Walks the tree below {@code node} and resolves the language.
         *
         * @param node root of the DOM (sub)tree to inspect
         */
        LanguageParser(Node node) {
            this.parse(node);
            if (this.htmlAttribute != null) {
                this.language = this.htmlAttribute;
            } else if (this.dublinCore != null) {
                this.language = this.dublinCore;
            } else {
                this.language = this.httpEquiv;
            }
        }

        /**
         * @return the language resolved by the constructor, or {@code null}
         *         when the markup declares none that is known
         */
        String getLanguage() {
            return this.language;
        }

        /**
         * Scans the tree below {@code node} for language declarations and
         * records the first usable value of each kind. Stops early once all
         * three kinds have been found.
         *
         * @param node root of the tree to scan; {@code null} is treated as
         *             an empty tree
         */
        void parse(Node node) {
            if (node == null) {
                return;
            }
            NodeWalker walker = new NodeWalker(node);
            while (walker.hasNext()) {
                Node currentNode = walker.nextNode();
                if (currentNode.getNodeType() == Node.ELEMENT_NODE) {
                    if (this.htmlAttribute == null) {
                        this.htmlAttribute = LanguageParser.parseLanguage(((Element) currentNode).getAttribute("lang"));
                    }
                    if ("meta".equalsIgnoreCase(currentNode.getNodeName())) {
                        NamedNodeMap attrs = currentNode.getAttributes();
                        if (this.dublinCore == null) {
                            this.dublinCore = LanguageParser.metaLanguage(attrs, "name", "dc.language");
                        }
                        if (this.httpEquiv == null) {
                            this.httpEquiv = LanguageParser.metaLanguage(attrs, "http-equiv", "content-language");
                        }
                    }
                }
                if (this.dublinCore != null && this.htmlAttribute != null && this.httpEquiv != null) {
                    // Every source has produced a value; nothing left to find.
                    return;
                }
            }
        }

        /**
         * Returns the normalised language of a {@code meta} element whose
         * attribute {@code attrName} has the value {@code attrValue} (both
         * compared case-insensitively) and which carries a {@code content}
         * attribute.
         *
         * @return the parsed language, or {@code null} when the element does
         *         not match or its content is not a known language
         */
        private static String metaLanguage(NamedNodeMap attrs, String attrName, String attrValue) {
            for (int i = 0; i < attrs.getLength(); ++i) {
                Node attr = attrs.item(i);
                if (!attrName.equalsIgnoreCase(attr.getNodeName()) || !attrValue.equalsIgnoreCase(attr.getNodeValue())) {
                    continue;
                }
                Node content = attrs.getNamedItem("content");
                if (content != null) {
                    return LanguageParser.parseLanguage(content.getNodeValue());
                }
            }
            return null;
        }

        /**
         * Normalises a raw language declaration such as {@code "en-US"} or
         * {@code "de_DE, en"} to a key of the language map.
         *
         * <p>The value is split into tokens on {@code , ; . ( ) =} and
         * space. Each token is cut at its first {@code '-'} or {@code '_'},
         * lower-cased and looked up; the first hit wins.
         *
         * @param lang raw value; may be {@code null}
         * @return the mapped language, or {@code null} when {@code lang} is
         *         {@code null} or contains no known code
         */
        static final String parseLanguage(String lang) {
            if (lang == null) {
                return null;
            }
            for (String token : SEPARATORS.split(lang, -1)) {
                String code = LanguageParser.primarySubtag(token);
                if (code.isEmpty()) {
                    continue;
                }
                // Locale.ROOT: the lookup must not depend on the JVM default
                // locale (under a Turkish locale "IT" would not become "it").
                String language = LANGUAGES_MAP.get(code.toLowerCase(Locale.ROOT));
                if (language != null) {
                    return language;
                }
            }
            return null;
        }

        /**
         * Returns the part of {@code token} before its first {@code '-'} or
         * {@code '_'}, or the whole token when it contains neither. Unlike
         * {@code token.split("-")[0]}, this never fails on tokens that
         * consist only of separators such as {@code "-"}.
         */
        private static String primarySubtag(String token) {
            for (int i = 0; i < token.length(); ++i) {
                char c = token.charAt(i);
                if (c == '-' || c == '_') {
                    return token.substring(0, i);
                }
            }
            return token;
        }
    }
}

