/*
 * Decompiled with CFR 0.152.
 */
package org.apache.nutch.net.urlnormalizer.regex;

import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.lang.invoke.MethodHandles;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import javax.xml.parsers.DocumentBuilderFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.nutch.net.URLNormalizer;
import org.apache.nutch.util.NutchConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.Text;
import org.xml.sax.InputSource;

/**
 * URL normalizer that rewrites a URL by applying an ordered list of
 * regular-expression substitution rules.
 *
 * <p>Rules are read from an XML document whose root element is
 * {@code <regex-normalize>}. Each {@code <regex>} child contributes one rule
 * made of a {@code <pattern>} and a {@code <substitution>} element; an empty
 * {@code <substitution/>} means "replace with the empty string".
 *
 * <p>The default rule set is loaded in {@link #setConf(Configuration)} from the
 * inline property {@code urlnormalizer.regex.rules} or, when that is absent,
 * from the resource named by {@code urlnormalizer.regex.file}. Scope-specific
 * rule sets are loaded lazily from the resource named by
 * {@code urlnormalizer.regex.file.<scope>} and cached per thread; a scope
 * without rules of its own falls back to the default rules.
 */
public class RegexURLNormalizer
extends Configured
implements URLNormalizer {
    private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

    /** Shared marker for "no rules"; also what {@link #readConfiguration(Reader)} returns on failure. */
    private static final List<Rule> EMPTY_RULES = Collections.emptyList();

    /** Per-thread cache mapping a scope name to the rules loaded for it. */
    private final ThreadLocal<HashMap<String, List<Rule>>> scopedRulesThreadLocal = new ThreadLocal<HashMap<String, List<Rule>>>(){

        @Override
        protected HashMap<String, List<Rule>> initialValue() {
            return new HashMap<String, List<Rule>>();
        }
    };

    /*
     * Rules used when a scope has none of its own. Deliberately declared
     * without an initializer.
     * NOTE(review): the Configured superclass constructor appears to invoke
     * setConf(); a field initializer here would run afterwards and discard
     * the rules loaded by that call - confirm before adding one.
     */
    private List<Rule> defaultRules;

    /**
     * Returns the calling thread's scope-to-rules cache.
     *
     * @return the mutable map held for the current thread, never {@code null}
     */
    public HashMap<String, List<Rule>> getScopedRules() {
        return this.scopedRulesThreadLocal.get();
    }

    /** Creates a normalizer without a configuration; call {@link #setConf(Configuration)} to load rules. */
    public RegexURLNormalizer() {
        super(null);
    }

    /**
     * Creates a normalizer bound to the given configuration.
     *
     * @param conf configuration handed to the superclass constructor
     */
    public RegexURLNormalizer(Configuration conf) {
        super(conf);
    }

    /**
     * Creates a normalizer whose default rules are read from a local file.
     *
     * @param conf configuration handed to the superclass constructor
     * @param filename path of the XML rule file to read
     * @throws IOException declared for interface compatibility; read failures
     *         are logged and result in an empty rule set
     * @throws PatternSyntaxException declared for interface compatibility;
     *         invalid patterns are logged and skipped
     */
    public RegexURLNormalizer(Configuration conf, String filename) throws IOException, PatternSyntaxException {
        super(conf);
        List<Rule> rules = this.readConfigurationFile(filename);
        if (rules != null) {
            this.defaultRules = rules;
        }
    }

    /**
     * Stores the configuration and (re)loads the default rules from it.
     *
     * <p>The inline property {@code urlnormalizer.regex.rules} takes
     * precedence over the resource named by {@code urlnormalizer.regex.file}.
     * When no source is available or it cannot be read, the default rule set
     * becomes empty and a warning is logged.
     *
     * @param conf the configuration; when {@code null} no rules are loaded
     */
    public void setConf(Configuration conf) {
        super.setConf(conf);
        if (conf == null) {
            return;
        }
        String filename = conf.get("urlnormalizer.regex.file");
        String stringRules = conf.get("urlnormalizer.regex.rules");

        Reader source;
        if (stringRules != null) {
            source = new StringReader(stringRules);
        } else if (filename != null) {
            source = conf.getConfResourceAsReader(filename);
        } else {
            // Neither property is set: do not hand a null resource name to the configuration.
            source = null;
        }
        if (source == null) {
            LOG.warn("Can't load the default rules! ");
            this.defaultRules = EMPTY_RULES;
            return;
        }

        List<Rule> rules;
        // try-with-resources so the reader is released even when parsing fails.
        try (Reader reader = source) {
            rules = this.readConfiguration(reader);
        }
        catch (Exception e) {
            LOG.warn("Couldn't read default config: " + e);
            rules = EMPTY_RULES;
        }
        this.defaultRules = rules;
    }

    /**
     * Reads rules from {@code reader} and installs them for {@code scope} in
     * the calling thread's cache. The reader is not closed by this method.
     *
     * @param reader source of the XML rule document
     * @param scope scope name the rules are registered under
     */
    void setConfiguration(Reader reader, String scope) {
        List<Rule> rules = this.readConfiguration(reader);
        this.getScopedRules().put(scope, rules);
        LOG.debug("Set config for scope '{}': {} rules.", scope, rules.size());
    }

    /**
     * Applies the rules of {@code scope} (or the default rules when the scope
     * has none) to {@code urlString}, one after another.
     *
     * @param urlString the URL to normalize
     * @param scope name of the rule scope to use
     * @return the rewritten URL; the input unchanged when no rules are available
     */
    public String regexNormalize(String urlString, String scope) {
        HashMap<String, List<Rule>> scopedRules = this.getScopedRules();
        List<Rule> curRules = scopedRules.get(scope);
        if (curRules == null) {
            // First request for this scope on this thread: load once and cache,
            // caching EMPTY_RULES too so a missing resource is not retried.
            curRules = this.loadScopedRules(scope);
            scopedRules.put(scope, curRules);
        }
        if (curRules.isEmpty()) {
            curRules = this.defaultRules;
        }
        if (curRules == null) {
            // No configuration was ever applied, so there is nothing to rewrite with.
            return urlString;
        }
        for (Rule r : curRules) {
            urlString = r.pattern.matcher(urlString).replaceAll(r.substitution);
        }
        return urlString;
    }

    /**
     * Loads the rules configured for {@code scope} via the property
     * {@code urlnormalizer.regex.file.<scope>}.
     *
     * @return the loaded rules, or {@code EMPTY_RULES} when there is no
     *         configuration, no such property, or the resource yields no rules
     */
    private List<Rule> loadScopedRules(String scope) {
        Configuration conf = this.getConf();
        String configFile = conf == null ? null : conf.get("urlnormalizer.regex.file." + scope);
        List<Rule> loaded = null;
        if (configFile != null) {
            LOG.debug("resource for scope '{}': {}", scope, configFile);
            // A null resource is permitted by try-with-resources; readConfiguration rejects it.
            try (Reader reader = conf.getConfResourceAsReader(configFile)) {
                loaded = this.readConfiguration(reader);
            }
            catch (Exception e) {
                LOG.warn("Couldn't load resource '" + configFile + "': " + e);
            }
        }
        if (loaded == null || loaded.isEmpty()) {
            LOG.info("can't find rules for scope '{}', using default", scope);
            return EMPTY_RULES;
        }
        return loaded;
    }

    /**
     * Normalizes {@code urlString} by delegating to
     * {@link #regexNormalize(String, String)}.
     *
     * @param urlString the URL to normalize
     * @param scope name of the rule scope to use
     * @return the rewritten URL
     * @throws MalformedURLException declared by the interface; not thrown here
     */
    public String normalize(String urlString, String scope) throws MalformedURLException {
        return this.regexNormalize(urlString, scope);
    }

    /**
     * Reads rules from a file on the local file system.
     *
     * @param filename path of the XML rule file
     * @return the parsed rules, or {@code EMPTY_RULES} when the file cannot be
     *         opened or parsed
     */
    private List<Rule> readConfigurationFile(String filename) {
        LOG.info("loading {}", filename);
        try (FileReader reader = new FileReader(filename)) {
            return this.readConfiguration(reader);
        }
        catch (Exception e) {
            LOG.error("Error loading rules from '" + filename + "': " + e);
            return EMPTY_RULES;
        }
    }

    /**
     * Parses an XML rule document into a list of rules.
     *
     * <p>Elements directly below the root each describe one rule; rules that
     * are incomplete or whose pattern does not compile are skipped. The reader
     * is left open for the caller to close.
     *
     * @param reader source of the XML document
     * @return the rules in document order, or {@code EMPTY_RULES} when the
     *         reader is {@code null}, parsing fails, or no usable rule is found
     */
    private List<Rule> readConfiguration(Reader reader) {
        if (reader == null) {
            LOG.error("error parsing conf file: no reader available");
            return EMPTY_RULES;
        }
        List<Rule> rules = new ArrayList<Rule>();
        try {
            Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(new InputSource(reader));
            Element root = doc.getDocumentElement();
            if (!"regex-normalize".equals(root.getTagName())) {
                LOG.error("bad conf file: top-level element not <regex-normalize>");
            }
            NodeList regexes = root.getChildNodes();
            for (int i = 0; i < regexes.getLength(); ++i) {
                Node regexNode = regexes.item(i);
                if (!(regexNode instanceof Element)) {
                    continue;
                }
                Element regex = (Element)regexNode;
                if (!"regex".equals(regex.getTagName())) {
                    // Only warned about: the element is still parsed as a rule.
                    LOG.warn("bad conf file: element not <regex>");
                }
                Rule rule = parseRule(regex);
                if (rule != null) {
                    rules.add(rule);
                }
            }
        }
        catch (Exception e) {
            LOG.error("error parsing conf file: " + e);
            return EMPTY_RULES;
        }
        // Callers compare against the shared marker, so never hand back a distinct empty list.
        return rules.isEmpty() ? EMPTY_RULES : rules;
    }

    /**
     * Builds one rule from the {@code <pattern>} and {@code <substitution>}
     * children of a rule element.
     *
     * @param regex the element describing the rule
     * @return the rule, or {@code null} when the pattern or substitution is
     *         missing or the pattern is not a valid regular expression
     */
    private static Rule parseRule(Element regex) {
        String patternValue = null;
        String subValue = null;
        NodeList fields = regex.getChildNodes();
        for (int j = 0; j < fields.getLength(); ++j) {
            Node fieldNode = fields.item(j);
            if (!(fieldNode instanceof Element)) {
                continue;
            }
            Element field = (Element)fieldNode;
            String tag = field.getTagName();
            if ("pattern".equals(tag)) {
                if (field.hasChildNodes()) {
                    patternValue = ((Text)field.getFirstChild()).getData();
                }
            } else if ("substitution".equals(tag)) {
                // Only an empty <substitution/> means "replace with nothing"; other
                // empty elements must not overwrite a substitution already read.
                subValue = field.hasChildNodes() ? ((Text)field.getFirstChild()).getData() : "";
            }
        }
        if (patternValue == null || subValue == null) {
            return null;
        }
        try {
            return new Rule(Pattern.compile(patternValue), subValue);
        }
        catch (PatternSyntaxException e) {
            LOG.error("skipped rule: " + patternValue + " -> " + subValue + " : invalid regular expression pattern: " + e);
            return null;
        }
    }

    /**
     * Command-line entry point: prints the default rules, optionally loads and
     * prints the rules of a scope, and normalizes a URL.
     *
     * @param args optional {@code [url [scope]]}; without arguments only the
     *        default rules are printed
     * @throws PatternSyntaxException declared for interface compatibility
     * @throws IOException declared for interface compatibility
     */
    public static void main(String[] args) throws PatternSyntaxException, IOException {
        RegexURLNormalizer normalizer = new RegexURLNormalizer();
        normalizer.setConf(NutchConfiguration.create());
        HashMap<String, List<Rule>> scopedRules = normalizer.getScopedRules();
        System.out.println("* Rules for 'DEFAULT' scope:");
        for (Rule r : normalizer.defaultRules) {
            System.out.print("  " + r.pattern.pattern() + " -> ");
            System.out.println(r.substitution);
        }
        if (args.length > 1) {
            // Normalizing a dummy URL forces the requested scope's rules to be loaded.
            normalizer.normalize("http://test.com", args[1]);
        }
        // The cache only holds explicitly requested scopes (defaults live in
        // defaultRules), so a single entry must be printed as well.
        if (!scopedRules.isEmpty()) {
            for (String scope : scopedRules.keySet()) {
                if ("default".equals(scope)) {
                    continue;
                }
                System.out.println("* Rules for '" + scope + "' scope:");
                for (Rule r : scopedRules.get(scope)) {
                    System.out.print("  " + r.pattern.pattern() + " -> ");
                    System.out.println(r.substitution);
                }
            }
        }
        if (args.length > 0) {
            System.out.println("\n---------- Normalizer test -----------");
            String scope = "default";
            if (args.length > 1) {
                scope = args[1];
            }
            System.out.println("Scope: " + scope);
            System.out.println("Input url:  '" + args[0] + "'");
            System.out.println("Output url: '" + normalizer.normalize(args[0], scope) + "'");
        }
        System.exit(0);
    }

    /** One compiled pattern together with the replacement text applied to every match. */
    private static class Rule {
        public final Pattern pattern;
        public final String substitution;

        private Rule(Pattern pattern, String substitution) {
            this.pattern = pattern;
            this.substitution = substitution;
        }
    }
}

