/*
 * Decompiled with CFR 0.152.
 */
package org.apache.nutch.crawl;

import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.HashMap;
import java.util.Map;
import java.util.Random;
import java.util.concurrent.TimeUnit;
import org.apache.commons.lang3.time.StopWatch;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.CrawlDb;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.service.NutchServer;
import org.apache.nutch.util.LockUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.NutchTool;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class Injector
extends NutchTool
implements Tool {
    private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
    public static final String URL_FILTER_NORMALIZE_ALL = "crawldb.inject.filter.normalize.all";
    public static String nutchScoreMDName = "nutch.score";
    public static String nutchFetchIntervalMDName = "nutch.fetchInterval";
    public static String nutchFixedFetchIntervalMDName = "nutch.fetchInterval.fixed";

    /**
     * Creates an injector without assigning a configuration.
     * NOTE(review): {@link #inject(Path, Path, boolean, boolean, boolean, boolean, boolean)}
     * reads the configuration through {@code getConf()}, so one presumably has to be
     * supplied (e.g. by {@code ToolRunner}, as {@code main} does) before injecting — confirm.
     */
    public Injector() {
    }

    /**
     * Creates an injector and hands the given configuration to {@code setConf}.
     *
     * @param conf configuration this tool should operate with
     */
    public Injector(Configuration conf) {
        setConf(conf);
    }

    /**
     * Injects the seeds found at {@code urlDir} into {@code crawlDb} with both the
     * overwrite and the update option switched off.
     *
     * @param crawlDb path of the CrawlDb directory
     * @param urlDir  path of the seed file or directory
     * @throws IOException            propagated from the delegated inject call
     * @throws ClassNotFoundException propagated from the delegated inject call
     * @throws InterruptedException   propagated from the delegated inject call
     */
    public void inject(Path crawlDb, Path urlDir) throws IOException, ClassNotFoundException, InterruptedException {
        final boolean overwrite = false;
        final boolean update = false;
        inject(crawlDb, urlDir, overwrite, update);
    }

    /**
     * Injects the seeds found at {@code urlDir} into {@code crawlDb} with URL
     * normalization and URL filtering enabled for the injected URLs, and with
     * "filter/normalize all" disabled.
     *
     * @param crawlDb   path of the CrawlDb directory
     * @param urlDir    path of the seed file or directory
     * @param overwrite forwarded unchanged to the seven-argument overload
     * @param update    forwarded unchanged to the seven-argument overload
     * @throws IOException            propagated from the delegated inject call
     * @throws ClassNotFoundException propagated from the delegated inject call
     * @throws InterruptedException   propagated from the delegated inject call
     */
    public void inject(Path crawlDb, Path urlDir, boolean overwrite, boolean update) throws IOException, ClassNotFoundException, InterruptedException {
        final boolean normalize = true;
        final boolean filter = true;
        final boolean filterNormalizeAll = false;
        inject(crawlDb, urlDir, overwrite, update, normalize, filter, filterNormalizeAll);
    }

    /**
     * Runs the "Nutch Injector" MapReduce job that merges seed URLs into a CrawlDb.
     *
     * <p>The job reads the existing {@code <crawlDb>/current} directory (created when
     * missing) together with every regular file listed for {@code urlDir}, writes its
     * output to a randomly named temporary directory below {@code crawlDb}, and on
     * success hands the result to {@code CrawlDb.install}. The options are passed to
     * the mapper and reducer through properties set on the configuration returned by
     * {@code getConf()}, which is therefore modified by this call.
     *
     * <p>The CrawlDb is locked for the duration of the call. The lock is released
     * when no seed file is found and also when configuring the job or listing the
     * seed files fails, so that an unreadable {@code urlDir} does not leave a stale
     * lock behind.
     *
     * @param crawlDb            path of the CrawlDb directory
     * @param urlDir             path of the seed file or directory
     * @param overwrite          stored as {@code db.injector.overwrite}
     * @param update             stored as {@code db.injector.update}
     * @param normalize          stored as {@code crawldb.url.normalizers}
     * @param filter             stored as {@code crawldb.url.filters}
     * @param filterNormalizeAll stored as {@code crawldb.inject.filter.normalize.all}
     * @throws IOException            on file system or job failures
     * @throws ClassNotFoundException propagated from the job execution
     * @throws InterruptedException   propagated from the job execution
     * @throws RuntimeException       if the job completes unsuccessfully
     */
    public void inject(Path crawlDb, Path urlDir, boolean overwrite, boolean update, boolean normalize, boolean filter, boolean filterNormalizeAll) throws IOException, ClassNotFoundException, InterruptedException {
        StopWatch stopWatch = new StopWatch();
        stopWatch.start();
        LOG.info("Injector: starting");
        LOG.info("Injector: crawlDb: {}", crawlDb);
        LOG.info("Injector: urlDir: {}", urlDir);
        LOG.info("Injector: Converting injected urls to crawl db entries.");

        // Job options travel to the map and reduce tasks via the configuration.
        Configuration conf = getConf();
        conf.setLong("injector.current.time", System.currentTimeMillis());
        conf.setBoolean("db.injector.overwrite", overwrite);
        conf.setBoolean("db.injector.update", update);
        conf.setBoolean("crawldb.url.normalizers", normalize);
        conf.setBoolean("crawldb.url.filters", filter);
        conf.setBoolean(URL_FILTER_NORMALIZE_ALL, filterNormalizeAll);
        conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

        FileSystem fs = crawlDb.getFileSystem(conf);
        Path current = new Path(crawlDb, "current");
        if (!fs.exists(current)) {
            fs.mkdirs(current);
        }
        Path tempCrawlDb = new Path(crawlDb, "crawldb-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

        Path lock = CrawlDb.lock(conf, crawlDb, false);
        Job job;
        int numSeedFiles;
        try {
            job = Job.getInstance(conf, "Nutch Injector: " + urlDir);
            job.setJarByClass(Injector.class);
            job.setMapperClass(InjectMapper.class);
            job.setReducerClass(InjectReducer.class);
            job.setOutputFormatClass(MapFileOutputFormat.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(CrawlDatum.class);
            job.setSpeculativeExecution(false);
            MultipleInputs.addInputPath(job, current, SequenceFileInputFormat.class);
            numSeedFiles = addSeedInputs(job, urlDir);
            FileOutputFormat.setOutputPath(job, tempCrawlDb);
        } catch (IOException | RuntimeException e) {
            // Nothing has been written yet, but the lock is already held: release it
            // so that a failed setup does not block later runs.
            LOG.error("Injector job setup failed: {}", e.getMessage(), e);
            try {
                LockUtil.removeLockFile(fs, lock);
            } catch (IOException cleanupFailure) {
                e.addSuppressed(cleanupFailure);
            }
            throw e;
        }

        if (numSeedFiles == 0) {
            LOG.error("No seed files to inject found in {}", urlDir);
            LockUtil.removeLockFile(fs, lock);
            return;
        }

        try {
            boolean success = job.waitForCompletion(true);
            if (!success) {
                String message = NutchJob.getJobFailureLogMessage("Injector", job);
                LOG.error(message);
                NutchJob.cleanupAfterFailure(tempCrawlDb, lock, fs);
                throw new RuntimeException(message);
            }
            CrawlDb.install(job, crawlDb);
            if (LOG.isInfoEnabled()) {
                long urlsInjected = injectorCounter(job, "urls_injected");
                long urlsInjectedUniq = injectorCounter(job, "urls_injected_unique");
                long urlsFiltered = injectorCounter(job, "urls_filtered");
                long urlsMerged = injectorCounter(job, "urls_merged");
                long urlsPurged404 = injectorCounter(job, "urls_purged_404");
                long urlsPurgedFilter = injectorCounter(job, "urls_purged_filter");
                LOG.info("Injector: Total urls rejected by filters: {}", urlsFiltered);
                LOG.info("Injector: Total urls injected after normalization and filtering: {} (unique URLs: {})", urlsInjected, urlsInjectedUniq);
                LOG.info("Injector: Total urls injected but already in CrawlDb: {}", urlsMerged);
                LOG.info("Injector: Total new urls injected: {}", urlsInjectedUniq - urlsMerged);
                if (filterNormalizeAll) {
                    LOG.info("Injector: Total urls removed from CrawlDb by filters: {}", urlsPurgedFilter);
                }
                if (conf.getBoolean("db.update.purge.404", false)) {
                    LOG.info("Injector: Total urls with status gone removed from CrawlDb (db.update.purge.404): {}", urlsPurged404);
                }
                stopWatch.stop();
                LOG.info("Injector: finished, elapsed: {} ms", stopWatch.getTime(TimeUnit.MILLISECONDS));
            }
        }
        catch (IOException | ClassNotFoundException | InterruptedException | NullPointerException e) {
            // The trailing throwable argument makes the logger record the stack trace.
            LOG.error("Injector job failed: {}", e.getMessage(), e);
            NutchJob.cleanupAfterFailure(tempCrawlDb, lock, fs);
            throw e;
        }
    }

    /**
     * Adds every regular file listed for {@code urlDir} as a
     * {@code KeyValueTextInputFormat} input of {@code job}; other entries are
     * skipped with a warning.
     *
     * @return the number of seed files added
     */
    private int addSeedInputs(Job job, Path urlDir) throws IOException {
        int numSeedFiles = 0;
        for (FileStatus seedFile : urlDir.getFileSystem(getConf()).listStatus(urlDir)) {
            if (!seedFile.isFile()) {
                LOG.warn("Skipped non-file input in {}: {}", urlDir, seedFile.getPath());
                continue;
            }
            MultipleInputs.addInputPath(job, seedFile.getPath(), KeyValueTextInputFormat.class);
            ++numSeedFiles;
            LOG.info("Injecting seed URL file {}", seedFile.getPath());
        }
        return numSeedFiles;
    }

    /** Returns the value of the named counter in the job's "injector" counter group. */
    private static long injectorCounter(Job job, String name) throws IOException {
        return job.getCounters().findCounter("injector", name).getValue();
    }

    /**
     * Prints the command-line help for this tool to {@code System.err}, one
     * {@code println} per entry of the table below (an empty entry yields a blank line).
     */
    public void usage() {
        final String[] helpLines = {
            "Usage: Injector [-D...] <crawldb> <url_dir> [-overwrite|-update] [-noFilter] [-noNormalize] [-filterNormalizeAll]\n",
            "  <crawldb>\tPath to a crawldb directory. If not present, a new one would be created.",
            "  <url_dir>\tPath to URL file or directory with URL file(s) containing URLs to be injected.",
            "           \tA URL file should have one URL per line, optionally followed by custom metadata.",
            "           \tBlank lines or lines starting with a '#' would be ignored. Custom metadata must",
            "           \tbe of form 'key=value' and separated by tabs.",
            "           \tBelow are reserved metadata keys:\n",
            "           \t\tnutch.score: A custom score for a url",
            "           \t\tnutch.fetchInterval: A custom fetch interval for a url",
            "           \t\tnutch.fetchInterval.fixed: A custom fetch interval for a url that is not changed by AdaptiveFetchSchedule\n",
            "           \tExample:",
            "           \t http://www.apache.org/",
            "           \t http://www.nutch.org/ \\t nutch.score=10 \\t nutch.fetchInterval=2592000 \\t userType=open_source\n",
            // Spelling of "Overwrite" corrected in the user-facing help text.
            " -overwrite\tOverwrite existing crawldb records by the injected records. Has precedence over 'update'",
            " -update   \tUpdate existing crawldb records with the injected records. Old metadata is preserved",
            "",
            " -noNormalize\tDo not normalize URLs before injecting",
            " -noFilter \tDo not apply URL filters to injected URLs",
            " -filterNormalizeAll\n           \tNormalize and filter all URLs including the URLs of existing CrawlDb records",
            "",
            " -D...     \tset or overwrite configuration property (property=value)",
            " -Ddb.update.purge.404=true\n           \tremove URLs with status gone (404) from CrawlDb",
        };
        for (String helpLine : helpLines) {
            System.err.println(helpLine);
        }
    }

    /**
     * Command-line entry point: runs a new {@code Injector} through
     * {@code ToolRunner} with a configuration from {@code NutchConfiguration.create()}
     * and exits the JVM with the returned status.
     *
     * @param args command-line arguments, passed on to {@code ToolRunner.run}
     * @throws Exception propagated from {@code ToolRunner.run}
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = NutchConfiguration.create();
        Injector injector = new Injector();
        System.exit(ToolRunner.run(conf, injector, args));
    }

    /**
     * Parses the command line and runs the injection.
     *
     * <p>{@code args[0]} is the CrawlDb path and {@code args[1]} the seed path; all
     * further arguments must be one of the case-insensitive flags {@code -overwrite},
     * {@code -update}, {@code -noNormalize}, {@code -noFilter} or
     * {@code -filterNormalizeAll}.
     *
     * @param args command-line arguments
     * @return {@code 0} on success; {@code -1} when fewer than two arguments are
     *         given, an unknown flag is found, or the injection throws
     * @throws Exception declared by the tool interface; exceptions raised by the
     *                   injection itself are logged and turned into {@code -1}
     */
    public int run(String[] args) throws Exception {
        if (args.length < 2) {
            usage();
            return -1;
        }

        boolean overwrite = false;
        boolean update = false;
        boolean filterNormalizeAll = false;
        boolean normalize = true;
        boolean filter = true;

        int idx = 2;
        while (idx < args.length) {
            String option = args[idx];
            if (option.equalsIgnoreCase("-overwrite")) {
                overwrite = true;
            } else if (option.equalsIgnoreCase("-update")) {
                update = true;
            } else if (option.equalsIgnoreCase("-noNormalize")) {
                normalize = false;
            } else if (option.equalsIgnoreCase("-noFilter")) {
                filter = false;
            } else if (option.equalsIgnoreCase("-filterNormalizeAll")) {
                filterNormalizeAll = true;
            } else {
                LOG.error("Injector: Found invalid argument \"{}\"", (Object)option);
                usage();
                return -1;
            }
            idx++;
        }

        int exitCode;
        try {
            Path crawlDb = new Path(args[0]);
            Path urlDir = new Path(args[1]);
            inject(crawlDb, urlDir, overwrite, update, normalize, filter, filterNormalizeAll);
            exitCode = 0;
        }
        catch (Exception e) {
            LOG.error("Injector: " + StringUtils.stringifyException((Throwable)e));
            exitCode = -1;
        }
        return exitCode;
    }

    /**
     * Runs the injection from a map of named arguments.
     *
     * <p>The seed location is taken from the {@code url_dir} entry when that key is
     * present; otherwise the {@code seedName} entry is resolved through the
     * {@code NutchServer} seed manager. The CrawlDb is the {@code crawldb} entry when
     * present, else {@code <crawlId>/crawldb}. Values that are not already a
     * {@code Path} are converted through their {@code toString()}.
     *
     * @param args    named arguments; must contain {@code url_dir} or {@code seedName}
     * @param crawlId used to build the default CrawlDb path
     * @return a map whose {@code result} entry is the string {@code "0"}
     * @throws IllegalArgumentException if {@code args} is empty or names neither
     *                                  {@code url_dir} nor {@code seedName}
     * @throws Exception                propagated from the injection
     */
    @Override
    public Map<String, Object> run(Map<String, Object> args, String crawlId) throws Exception {
        if (args.size() < 1) {
            throw new IllegalArgumentException("Required arguments <url_dir> or <seedName>");
        }

        Object seedLocation;
        if (args.containsKey("url_dir")) {
            seedLocation = args.get("url_dir");
        } else if (args.containsKey("seedName")) {
            seedLocation = NutchServer.getInstance().getSeedManager().getSeedList((String)args.get("seedName")).getSeedFilePath();
        } else {
            throw new IllegalArgumentException("Required arguments <url_dir> or <seedName>");
        }

        Path input;
        if (seedLocation instanceof Path) {
            input = (Path)seedLocation;
        } else {
            input = new Path(seedLocation.toString());
        }

        Path crawlDb;
        if (args.containsKey("crawldb")) {
            Object crawlDbArg = args.get("crawldb");
            if (crawlDbArg instanceof Path) {
                crawlDb = (Path)crawlDbArg;
            } else {
                crawlDb = new Path(crawlDbArg.toString());
            }
        } else {
            crawlDb = new Path(crawlId + "/crawldb");
        }

        inject(crawlDb, input);

        HashMap<String, Object> results = new HashMap<String, Object>();
        results.put("result", Integer.toString(0));
        return results;
    }

    public static class InjectReducer
    extends Reducer<Text, CrawlDatum, Text, CrawlDatum> {
        private int interval;
        private float scoreInjected;
        private boolean overwrite = false;
        private boolean update = false;
        private CrawlDatum old = new CrawlDatum();
        private CrawlDatum injected = new CrawlDatum();

        public void setup(Reducer.Context context) {
            Configuration conf = context.getConfiguration();
            this.interval = conf.getInt("db.fetch.interval.default", 2592000);
            this.scoreInjected = conf.getFloat("db.score.injected", 1.0f);
            this.overwrite = conf.getBoolean("db.injector.overwrite", false);
            this.update = conf.getBoolean("db.injector.update", false);
            LOG.info("Injector: overwrite: " + this.overwrite);
            LOG.info("Injector: update: " + this.update);
        }

        public void reduce(Text key, Iterable<CrawlDatum> values, Reducer.Context context) throws IOException, InterruptedException {
            CrawlDatum result;
            boolean oldSet = false;
            boolean injectedSet = false;
            for (CrawlDatum val : values) {
                if (val.getStatus() == 66) {
                    this.injected.set(val);
                    this.injected.setStatus(1);
                    injectedSet = true;
                    continue;
                }
                this.old.set(val);
                oldSet = true;
            }
            if (injectedSet && (!oldSet || this.overwrite)) {
                result = this.injected;
            } else {
                result = this.old;
                if (injectedSet && this.update) {
                    this.old.putAllMetaData(this.injected);
                    this.old.setScore(this.injected.getScore() != this.scoreInjected ? this.injected.getScore() : this.old.getScore());
                    this.old.setFetchInterval(this.injected.getFetchInterval() != this.interval ? this.injected.getFetchInterval() : this.old.getFetchInterval());
                }
            }
            if (injectedSet) {
                context.getCounter("injector", "urls_injected_unique").increment(1L);
                if (oldSet) {
                    context.getCounter("injector", "urls_merged").increment(1L);
                }
            }
            context.write((Object)key, (Object)result);
        }
    }

    public static class InjectMapper
    extends Mapper<Text, Writable, Text, CrawlDatum> {
        public static final String URL_NORMALIZING_SCOPE = "crawldb.url.normalizers.scope";
        public static final String TAB_CHARACTER = "\t";
        public static final String EQUAL_CHARACTER = "=";
        private URLNormalizers urlNormalizers;
        private int interval;
        private float scoreInjected;
        private URLFilters filters;
        private ScoringFilters scfilters;
        private long curTime;
        private boolean url404Purging;
        private String scope;
        private boolean filterNormalizeAll = false;

        public void setup(Mapper.Context context) {
            Configuration conf = context.getConfiguration();
            boolean normalize = conf.getBoolean("crawldb.url.normalizers", true);
            boolean filter = conf.getBoolean("crawldb.url.filters", true);
            this.filterNormalizeAll = conf.getBoolean(Injector.URL_FILTER_NORMALIZE_ALL, false);
            if (normalize) {
                this.scope = conf.get(URL_NORMALIZING_SCOPE, "inject");
                this.urlNormalizers = new URLNormalizers(conf, this.scope);
            }
            this.interval = conf.getInt("db.fetch.interval.default", 2592000);
            if (filter) {
                this.filters = new URLFilters(conf);
            }
            this.scfilters = new ScoringFilters(conf);
            this.scoreInjected = conf.getFloat("db.score.injected", 1.0f);
            this.curTime = conf.getLong("injector.current.time", System.currentTimeMillis());
            this.url404Purging = conf.getBoolean("db.update.purge.404", false);
        }

        private String filterNormalize(String url) {
            if (url != null) {
                try {
                    if (this.urlNormalizers != null) {
                        url = this.urlNormalizers.normalize(url, this.scope);
                    }
                    if (this.filters != null) {
                        url = this.filters.filter(url);
                    }
                }
                catch (Exception e) {
                    LOG.warn("Skipping " + url + ":" + e);
                    url = null;
                }
            }
            return url;
        }

        private void processMetaData(String metadata, CrawlDatum datum, String url) {
            String[] splits;
            for (String split : splits = metadata.split(TAB_CHARACTER)) {
                int indexEquals = split.indexOf(EQUAL_CHARACTER);
                if (indexEquals == -1) continue;
                String metaname = split.substring(0, indexEquals);
                String metavalue = split.substring(indexEquals + 1);
                try {
                    if (metaname.equals(nutchScoreMDName)) {
                        datum.setScore(Float.parseFloat(metavalue));
                        continue;
                    }
                    if (metaname.equals(nutchFetchIntervalMDName)) {
                        datum.setFetchInterval(Integer.parseInt(metavalue));
                        continue;
                    }
                    if (metaname.equals(nutchFixedFetchIntervalMDName)) {
                        int fixedInterval = Integer.parseInt(metavalue);
                        if (fixedInterval <= -1) continue;
                        datum.getMetaData().put((Writable)Nutch.WRITABLE_FIXED_INTERVAL_KEY, (Writable)new FloatWritable((float)fixedInterval));
                        datum.setFetchInterval(fixedInterval);
                        continue;
                    }
                    datum.getMetaData().put((Writable)new Text(metaname), (Writable)new Text(metavalue));
                }
                catch (NumberFormatException nfe) {
                    LOG.error("Invalid number '" + metavalue + "' in metadata '" + metaname + "' for url " + url);
                }
            }
        }

        public void map(Text key, Writable value, Mapper.Context context) throws IOException, InterruptedException {
            if (value instanceof Text) {
                String url = key.toString().trim();
                if (url.length() == 0 || url.startsWith("#")) {
                    return;
                }
                if ((url = this.filterNormalize(url)) == null) {
                    context.getCounter("injector", "urls_filtered").increment(1L);
                } else {
                    CrawlDatum datum = new CrawlDatum();
                    datum.setStatus(66);
                    datum.setFetchTime(this.curTime);
                    datum.setScore(this.scoreInjected);
                    datum.setFetchInterval(this.interval);
                    String metadata = value.toString().trim();
                    if (metadata.length() > 0) {
                        this.processMetaData(metadata, datum, url);
                    }
                    try {
                        key.set(url);
                        this.scfilters.injectedScore(key, datum);
                    }
                    catch (ScoringFilterException e) {
                        LOG.warn("Cannot filter injected score for url {}, using default ({})", (Object)url, (Object)e.getMessage());
                    }
                    context.getCounter("injector", "urls_injected").increment(1L);
                    context.write((Object)key, (Object)datum);
                }
            } else if (value instanceof CrawlDatum) {
                CrawlDatum datum = (CrawlDatum)value;
                if (this.url404Purging && 3 == datum.getStatus()) {
                    context.getCounter("injector", "urls_purged_404").increment(1L);
                    return;
                }
                if (this.filterNormalizeAll) {
                    String url = this.filterNormalize(key.toString());
                    if (url == null) {
                        context.getCounter("injector", "urls_purged_filter").increment(1L);
                    } else {
                        key.set(url);
                        context.write((Object)key, (Object)datum);
                    }
                } else {
                    context.write((Object)key, (Object)datum);
                }
            }
        }
    }
}

