/*
 * Decompiled with CFR 0.152.
 */
package org.apache.nutch.indexer.replace;

import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.indexer.NutchField;
import org.apache.nutch.indexer.replace.FieldReplacer;
import org.apache.nutch.parse.Parse;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Indexing filter that rewrites document field values using regular-expression
 * replacements read from the {@code index.replace.regexp} configuration
 * property.
 *
 * <p>The property value is processed line by line. Every non-empty line is
 * split at its first {@code =} into a name and a value:
 * <ul>
 * <li>{@code hostmatch=REGEX}: replacement lines that follow are applied to
 * documents whose {@code host} field matches {@code REGEX}; this also cancels
 * any {@code urlmatch} currently in effect.</li>
 * <li>{@code urlmatch=REGEX}: replacement lines that follow are applied to
 * documents whose {@code url} field matches {@code REGEX}.</li>
 * <li>{@code field[:toField]=/pattern/replacement/[flags]}: a replacement
 * rule. The first character of the value is taken as the separator (shown
 * here as {@code /}); {@code flags} is an optional integer handed to
 * {@link FieldReplacer}.</li>
 * </ul>
 * Replacement lines that appear before any {@code hostmatch}/{@code urlmatch}
 * line are filed under the host pattern {@code .*}.
 *
 * <p>The parsed rules are held per instance, so configuring one filter never
 * alters the rules used by another.
 */
public class ReplaceIndexer
implements IndexingFilter {
    private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
    private static final String HOSTMATCH = "hostmatch";
    private static final String URLMATCH = "urlmatch";
    private static final String CONF_PROPERTY = "index.replace.regexp";

    /** Matches one non-empty line at a time ('.' never crosses a line terminator). */
    private static final Pattern LINE_SPLIT = Pattern.compile("(^.+$)+", Pattern.MULTILINE);
    /** Splits a line at its first '=' into name (group 1) and value (group 2). */
    private static final Pattern NAME_VALUE_SPLIT = Pattern.compile("(.*?)=(.*)");

    /** Rules keyed by the pattern tested against the document's "host" field, in configuration order. */
    private final Map<Pattern, List<FieldReplacer>> replacersByHost = new LinkedHashMap<>();
    /** Rules keyed by the pattern tested against the document's "url" field, in configuration order. */
    private final Map<Pattern, List<FieldReplacer>> replacersByUrl = new LinkedHashMap<>();

    private Configuration conf;

    /**
     * Stores the configuration and rebuilds this instance's replacement rules
     * from the {@code index.replace.regexp} property.
     *
     * @param conf the configuration to read; when {@code null} the rules are
     *             simply cleared
     */
    public void setConf(Configuration conf) {
        this.conf = conf;
        replacersByHost.clear();
        replacersByUrl.clear();
        if (conf == null) {
            return;
        }
        String value = conf.get(CONF_PROPERTY, null);
        if (value != null) {
            LOG.debug("Parsing index.replace.regexp property");
            parseConf(value);
        }
    }

    /**
     * @return the configuration last passed to {@link #setConf(Configuration)},
     *         or {@code null} if none was set
     */
    public Configuration getConf() {
        return conf;
    }

    /**
     * Parses the property value and fills the host/url rule tables.
     *
     * @param propertyValue raw value of the configuration property; blank or
     *                      {@code null} values are ignored
     */
    private void parseConf(String propertyValue) {
        if (propertyValue == null || propertyValue.trim().isEmpty()) {
            return;
        }
        // Rules that precede any hostmatch/urlmatch line apply to every host.
        Pattern hostPattern = Pattern.compile(".*");
        Pattern urlPattern = null;
        Matcher lineMatcher = LINE_SPLIT.matcher(propertyValue);
        while (lineMatcher.find()) {
            String line = lineMatcher.group();
            Matcher nameValueMatcher = NAME_VALUE_SPLIT.matcher(line.trim());
            if (!nameValueMatcher.find()) {
                // Not a name=value line (e.g. whitespace only); ignore it.
                continue;
            }
            String fieldName = nameValueMatcher.group(1).trim();
            String value = nameValueMatcher.group(2);

            if (HOSTMATCH.equals(fieldName)) {
                // A new host section ends any url section in effect.
                urlPattern = null;
                hostPattern = compileMatchPattern(HOSTMATCH, value, "willnotmatchanyhost");
            } else if (URLMATCH.equals(fieldName)) {
                urlPattern = compileMatchPattern(URLMATCH, value, "willnotmatchanyurl");
            } else {
                FieldReplacer replacer = parseReplacer(line, fieldName, value);
                if (replacer == null) {
                    continue;
                }
                // An active urlmatch takes precedence over the current hostmatch.
                if (urlPattern != null) {
                    register(replacersByUrl, urlPattern, replacer);
                } else {
                    register(replacersByHost, hostPattern, replacer);
                }
            }
        }
    }

    /**
     * Compiles a hostmatch/urlmatch expression; on a syntax error the problem
     * is logged and a pattern built from {@code fallback} is returned instead.
     */
    private static Pattern compileMatchPattern(String kind, String regex, String fallback) {
        try {
            return Pattern.compile(regex);
        }
        catch (PatternSyntaxException pse) {
            LOG.error("{} pattern {} does not compile: {}", kind, regex, pse.getMessage());
            return Pattern.compile(fallback);
        }
    }

    /**
     * Builds a {@link FieldReplacer} from one replacement line.
     *
     * @param line      the original line, used only in log messages
     * @param fieldName the name part of the line, optionally {@code from:to}
     * @param spec      the value part: separator, pattern, separator,
     *                  replacement and optionally separator plus integer flags
     * @return the replacer, or {@code null} when the line is too short or malformed
     */
    private static FieldReplacer parseReplacer(String line, String fieldName, String spec) {
        if (spec.length() <= 3) {
            return null;
        }
        String sourceField = fieldName;
        String targetField = fieldName;
        int colon = fieldName.indexOf(':');
        if (colon > 0) {
            sourceField = fieldName.substring(0, colon);
            targetField = fieldName.substring(colon + 1);
        }

        // The first character of the value is the separator for the remaining parts.
        String sep = spec.substring(0, 1);
        String rest = spec.substring(1);
        int patternEnd = rest.indexOf(sep);
        if (patternEnd < 0) {
            LOG.error("Pattern '{}', not parseable.  Missing separator {}", line, sep);
            return null;
        }
        String pattern = rest.substring(0, patternEnd);
        rest = rest.substring(patternEnd + 1);

        // The replacement runs up to the next separator, or to the end of the line.
        int replacementEnd = rest.indexOf(sep);
        String replacement = replacementEnd >= 0 ? rest.substring(0, replacementEnd) : rest;

        // Anything after the closing separator is the integer flags component.
        int flags = 0;
        if (rest.length() > replacement.length() + 1) {
            String flagText = rest.substring(replacement.length() + 1).trim();
            try {
                flags = Integer.parseInt(flagText);
            }
            catch (NumberFormatException e) {
                LOG.error("Pattern {}, has invalid flags component", line);
                return null;
            }
        }
        // Non-positive flags are passed on as "no flags" (null).
        Integer flagsOrNull = flags > 0 ? Integer.valueOf(flags) : null;
        return new FieldReplacer(sourceField, targetField, pattern, replacement, flagsOrNull);
    }

    /** Appends {@code replacer} to the list stored under {@code key}, creating the list if needed. */
    private static void register(Map<Pattern, List<FieldReplacer>> table, Pattern key, FieldReplacer replacer) {
        List<FieldReplacer> replacers = table.get(key);
        if (replacers == null) {
            replacers = new ArrayList<>();
            table.put(key, replacers);
        }
        replacers.add(replacer);
    }

    /**
     * Applies the configured replacements to {@code doc}: first the rules
     * selected by its {@code host} field, then those selected by its
     * {@code url} field.
     *
     * @return the same {@code doc} instance (possibly modified), or
     *         {@code null} if {@code doc} was {@code null}
     */
    public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException {
        if (doc != null) {
            if (!replacersByHost.isEmpty()) {
                doReplace(doc, "host", replacersByHost);
            }
            if (!replacersByUrl.isEmpty()) {
                doReplace(doc, "url", replacersByUrl);
            }
        }
        return doc;
    }

    /**
     * Runs every rule list whose key pattern is found in a String value of the
     * document's {@code keyName} field.
     *
     * @param doc        document to modify in place
     * @param keyName    name of the field whose values select the rules
     * @param replaceMap rule lists keyed by the pattern tested against the key field
     */
    private void doReplace(NutchDocument doc, String keyName, Map<Pattern, List<FieldReplacer>> replaceMap) {
        if (doc == null || replaceMap.isEmpty()) {
            return;
        }
        Collection<?> docFieldNames = doc.getFieldNames();
        NutchField keyField = doc.getField(keyName);
        if (keyField == null) {
            return;
        }
        List<?> keyFieldValues = keyField.getValues();
        if (keyFieldValues == null || keyFieldValues.isEmpty()) {
            return;
        }
        for (Object keyValue : keyFieldValues) {
            if (!(keyValue instanceof String)) {
                continue;
            }
            String key = (String) keyValue;
            for (Map.Entry<Pattern, List<FieldReplacer>> entry : replaceMap.entrySet()) {
                if (!entry.getKey().matcher(key).find()) {
                    continue;
                }
                for (FieldReplacer replacer : entry.getValue()) {
                    applyReplacer(doc, docFieldNames, replacer);
                }
            }
        }
    }

    /**
     * Applies one replacer: transforms every String value of its source field
     * and stores the results as the new content of its target field. Values
     * that are not Strings are not carried over to the target field.
     */
    private static void applyReplacer(NutchDocument doc, Collection<?> docFieldNames, FieldReplacer replacer) {
        String sourceName = replacer.getFieldName();
        if (!docFieldNames.contains(sourceName)) {
            return;
        }
        NutchField sourceField = doc.getField(sourceName);
        if (sourceField == null) {
            // The field may have been removed by an earlier replacer in this pass.
            return;
        }
        // Collect all results first so that the source field can also be the target.
        List<Object> replaced = new ArrayList<>();
        for (Object fieldValue : sourceField.getValues()) {
            if (fieldValue instanceof String) {
                replaced.add(replacer.replace((String) fieldValue));
            }
        }
        String targetName = replacer.getToFieldName();
        doc.removeField(targetName);
        for (Object newValue : replaced) {
            doc.add(targetName, newValue);
        }
    }
}

