/*
 * Decompiled with CFR 0.152.
 */
package org.apache.nutch.crawl;

import java.io.File;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import org.apache.commons.lang3.time.StopWatch;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.CrawlDbFilter;
import org.apache.nutch.crawl.CrawlDbReducer;
import org.apache.nutch.util.FSUtils;
import org.apache.nutch.util.HadoopFSUtil;
import org.apache.nutch.util.LockUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.NutchTool;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Command-line and programmatic tool that updates a CrawlDb from segment data.
 *
 * <p>An update runs a MapReduce job whose inputs are the {@code current}
 * directory of the CrawlDb (if it exists) plus the {@code crawl_fetch} and
 * {@code crawl_parse} directories of the given segments. The job output is
 * written to a temporary, randomly named directory below the CrawlDb and is
 * then installed as the new {@code current} directory. The CrawlDb is guarded
 * by a lock file ({@link #LOCK_NAME}) for the duration of the update.
 */
public class CrawlDb extends NutchTool implements Tool {
    private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

    /** Configuration property: whether the update may add URLs not yet in the CrawlDb. */
    public static final String CRAWLDB_ADDITIONS_ALLOWED = "db.update.additions.allowed";

    /** Configuration property: its value is only read for logging in {@link #update}. */
    public static final String CRAWLDB_PURGE_404 = "db.update.purge.404";

    /** Configuration property name; not read anywhere in this class. */
    public static final String CRAWLDB_PURGE_ORPHANS = "db.update.purge.orphans";

    /** Name of the sub-directory of a CrawlDb holding the current version. */
    public static final String CURRENT_NAME = "current";

    /** Name of the lock file created inside a CrawlDb while it is being updated. */
    public static final String LOCK_NAME = ".locked";

    /** Creates an instance without a configuration; one must be set before use. */
    public CrawlDb() {
    }

    /**
     * Creates an instance using the given configuration.
     *
     * @param conf configuration passed to {@code setConf}
     */
    public CrawlDb(Configuration conf) {
        this.setConf(conf);
    }

    /**
     * Updates the CrawlDb without forcing the lock, taking the
     * "additions allowed" flag from {@link #CRAWLDB_ADDITIONS_ALLOWED}
     * (default {@code true}).
     *
     * @param crawlDb   CrawlDb directory to update
     * @param segments  segments to merge into the CrawlDb
     * @param normalize value stored under {@code crawldb.url.normalizers} in the job configuration
     * @param filter    value stored under {@code crawldb.url.filters} in the job configuration
     * @throws IOException            see {@link #update(Path, Path[], boolean, boolean, boolean, boolean)}
     * @throws InterruptedException   if waiting for the job is interrupted
     * @throws ClassNotFoundException if thrown while running the job
     */
    public void update(Path crawlDb, Path[] segments, boolean normalize, boolean filter) throws IOException, InterruptedException, ClassNotFoundException {
        boolean additionsAllowed = this.getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true);
        this.update(crawlDb, segments, normalize, filter, additionsAllowed, false);
    }

    /**
     * Locks the CrawlDb, runs the update job over the current CrawlDb and the
     * given segments, and installs the job output as the new current version.
     *
     * <p>If preparing the job fails after the lock has been taken, the lock
     * file is removed again before the exception is propagated. If the job
     * itself fails, {@code NutchJob.cleanupAfterFailure} is invoked with the
     * job output path and the lock.
     *
     * @param crawlDb          CrawlDb directory to update
     * @param segments         segments to merge; a segment without a
     *                         {@code crawl_fetch} directory is skipped
     * @param normalize        value stored under {@code crawldb.url.normalizers}
     * @param filter           value stored under {@code crawldb.url.filters}; when
     *                         {@code true} the "URLs filtered" counter is logged afterwards
     * @param additionsAllowed value stored under {@link #CRAWLDB_ADDITIONS_ALLOWED}
     * @param force            passed to {@code LockUtil.createLockFile} when locking
     * @throws IOException            on file system or job errors
     * @throws InterruptedException   if waiting for the job is interrupted
     * @throws ClassNotFoundException if thrown while running the job
     * @throws RuntimeException       if the job completes unsuccessfully
     */
    public void update(Path crawlDb, Path[] segments, boolean normalize, boolean filter, boolean additionsAllowed, boolean force) throws IOException, InterruptedException, ClassNotFoundException {
        StopWatch stopWatch = new StopWatch();
        stopWatch.start();

        Path lock = CrawlDb.lock(this.getConf(), crawlDb, force);

        Job job;
        FileSystem fs;
        Path outPath;
        try {
            job = CrawlDb.createJob(this.getConf(), crawlDb);
            Configuration conf = job.getConfiguration();
            conf.setBoolean(CRAWLDB_ADDITIONS_ALLOWED, additionsAllowed);
            conf.setBoolean("crawldb.url.filters", filter);
            conf.setBoolean("crawldb.url.normalizers", normalize);
            boolean url404Purging = conf.getBoolean(CRAWLDB_PURGE_404, false);

            LOG.info("CrawlDb update: starting");
            LOG.info("CrawlDb update: db: {}", crawlDb);
            LOG.info("CrawlDb update: segments: {}", Arrays.asList(segments));
            LOG.info("CrawlDb update: additions allowed: {}", additionsAllowed);
            LOG.info("CrawlDb update: URL normalizing: {}", normalize);
            LOG.info("CrawlDb update: URL filtering: {}", filter);
            LOG.info("CrawlDb update: 404 purging: {}", url404Purging);

            this.addSegmentInputs(job, segments);

            LOG.info("CrawlDb update: Merging segment data into db.");
            fs = crawlDb.getFileSystem(this.getConf());
            outPath = FileOutputFormat.getOutputPath(job);
        }
        catch (IOException | RuntimeException e) {
            // The lock is already held at this point; release it so that a
            // failure during job preparation does not leave the CrawlDb locked.
            try {
                LockUtil.removeLockFile(crawlDb.getFileSystem(this.getConf()), lock);
            }
            catch (IOException cleanupFailure) {
                e.addSuppressed(cleanupFailure);
            }
            throw e;
        }

        try {
            boolean success = job.waitForCompletion(true);
            if (!success) {
                String message = NutchJob.getJobFailureLogMessage("CrawlDb update", job);
                LOG.error(message);
                NutchJob.cleanupAfterFailure(outPath, lock, fs);
                throw new RuntimeException(message);
            }
        }
        catch (IOException | ClassNotFoundException | InterruptedException e) {
            LOG.error("CrawlDb update job failed: {}", e.getMessage());
            NutchJob.cleanupAfterFailure(outPath, lock, fs);
            throw e;
        }

        CrawlDb.install(job, crawlDb);

        if (filter) {
            long urlsFiltered = job.getCounters().findCounter("CrawlDB filter", "URLs filtered").getValue();
            LOG.info("CrawlDb update: Total number of existing URLs in CrawlDb rejected by URL filters: {}", urlsFiltered);
        }
        stopWatch.stop();
        LOG.info("CrawlDb update: finished, elapsed: {} ms", stopWatch.getTime(TimeUnit.MILLISECONDS));
    }

    /**
     * Adds the {@code crawl_fetch} and, if present, {@code crawl_parse}
     * directory of every segment as job input. Segments lacking
     * {@code crawl_fetch} are logged and skipped.
     */
    private void addSegmentInputs(Job job, Path[] segments) throws IOException {
        for (Path segment : segments) {
            FileSystem sfs = segment.getFileSystem(this.getConf());
            Path fetch = new Path(segment, "crawl_fetch");
            Path parse = new Path(segment, "crawl_parse");
            if (!sfs.exists(fetch)) {
                LOG.info(" - skipping invalid segment {}", segment);
                continue;
            }
            FileInputFormat.addInputPath(job, fetch);
            if (sfs.exists(parse)) {
                FileInputFormat.addInputPath(job, parse);
            } else {
                LOG.info(" - adding fetched but unparsed segment {}", segment);
            }
        }
    }

    /**
     * Creates the (not yet submitted) update job for a CrawlDb.
     *
     * <p>The job reads sequence files, uses {@code CrawlDbFilter} as mapper and
     * {@code CrawlDbReducer} as reducer, and writes {@code Text} /
     * {@code CrawlDatum} pairs via {@code MapFileOutputFormat} into a
     * randomly numbered directory below {@code crawlDb}. The existing
     * {@code current} directory is added as input only if it exists.
     *
     * @param config base configuration for the job
     * @param crawlDb CrawlDb directory
     * @return the configured job
     * @throws IOException if the job cannot be created or the file system cannot be accessed
     */
    public static Job createJob(Configuration config, Path crawlDb) throws IOException {
        Path newCrawlDb = new Path(crawlDb, Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
        Path current = new Path(crawlDb, CURRENT_NAME);

        Job job = Job.getInstance(config, "Nutch CrawlDb: " + crawlDb);
        if (current.getFileSystem(job.getConfiguration()).exists(current)) {
            FileInputFormat.addInputPath(job, current);
        }
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setMapperClass(CrawlDbFilter.class);
        job.setReducerClass(CrawlDbReducer.class);
        job.setJarByClass(CrawlDb.class);

        FileOutputFormat.setOutputPath(job, newCrawlDb);
        job.setOutputFormatClass(MapFileOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(CrawlDatum.class);
        job.getConfiguration().setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
        return job;
    }

    /**
     * Creates the lock file {@link #LOCK_NAME} inside the CrawlDb.
     *
     * @param job     configuration handed to {@code LockUtil.createLockFile}
     * @param crawlDb CrawlDb directory to lock
     * @param force   handed to {@code LockUtil.createLockFile}
     * @return path of the lock file
     * @throws IOException if thrown by {@code LockUtil.createLockFile}
     */
    public static Path lock(Configuration job, Path crawlDb, boolean force) throws IOException {
        Path lock = new Path(crawlDb, LOCK_NAME);
        LockUtil.createLockFile(job, lock, force);
        return lock;
    }

    /**
     * Installs {@code tempCrawlDb} as the new {@code current} directory.
     *
     * <p>An existing {@code current} is first moved to {@code old}; afterwards
     * the lock file is removed and, unless {@code db.preserve.backup}
     * (default {@code true}) is set, {@code old} is deleted.
     */
    private static void install(Configuration conf, Path crawlDb, Path tempCrawlDb) throws IOException {
        boolean preserveBackup = conf.getBoolean("db.preserve.backup", true);
        FileSystem fs = crawlDb.getFileSystem(conf);
        Path old = new Path(crawlDb, "old");
        Path current = new Path(crawlDb, CURRENT_NAME);
        if (fs.exists(current)) {
            FSUtils.replace(fs, old, current, true);
        }
        FSUtils.replace(fs, current, tempCrawlDb, true);
        Path lock = new Path(crawlDb, LOCK_NAME);
        LockUtil.removeLockFile(fs, lock);
        if (!preserveBackup && fs.exists(old)) {
            fs.delete(old, true);
        }
    }

    /**
     * Installs the output directory of {@code job} as the new current version
     * of the CrawlDb and removes the CrawlDb lock file.
     *
     * @param job     job whose output path holds the new CrawlDb data
     * @param crawlDb CrawlDb directory
     * @throws IOException on file system errors
     */
    public static void install(Job job, Path crawlDb) throws IOException {
        Configuration conf = job.getConfiguration();
        Path tempCrawlDb = FileOutputFormat.getOutputPath(job);
        CrawlDb.install(conf, crawlDb, tempCrawlDb);
    }

    /**
     * Command-line entry point: runs this tool through {@code ToolRunner} and
     * exits the JVM with the tool's return code.
     *
     * @param args command-line arguments, see {@link #run(String[])}
     * @throws Exception if thrown by {@code ToolRunner.run}
     */
    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(NutchConfiguration.create(), new CrawlDb(), args);
        System.exit(res);
    }

    /**
     * Parses the command line and runs the update.
     *
     * <p>{@code args[0]} is the CrawlDb; every further argument is either one
     * of the flags {@code -normalize}, {@code -filter}, {@code -force},
     * {@code -noAdditions}, the option {@code -dir <segments>} or a segment path.
     *
     * @param args command-line arguments
     * @return 0 on success, -1 on usage errors or when the update fails
     * @throws Exception if listing a {@code -dir} directory fails
     */
    @Override
    public int run(String[] args) throws Exception {
        if (args.length < 1) {
            System.err.println("Usage: CrawlDb <crawldb> (-dir <segments> | <seg1> <seg2> ...) [-force] [-normalize] [-filter] [-noAdditions]");
            System.err.println("\tcrawldb\tCrawlDb to update");
            System.err.println("\t-dir segments\tparent directory containing all segments to update from");
            System.err.println("\tseg1 seg2 ...\tlist of segment names to update from");
            System.err.println("\t-force\tforce update even if CrawlDb appears to be locked (CAUTION advised)");
            System.err.println("\t-normalize\tuse URLNormalizer on urls in CrawlDb and segment (usually not needed)");
            System.err.println("\t-filter\tuse URLFilters on urls in CrawlDb and segment");
            System.err.println("\t-noAdditions\tonly update already existing URLs, don't add any newly discovered URLs");
            return -1;
        }
        boolean normalize = this.getConf().getBoolean("crawldb.url.normalizers", false);
        boolean filter = this.getConf().getBoolean("crawldb.url.filters", false);
        boolean additionsAllowed = this.getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true);
        boolean force = false;
        Set<Path> dirs = new HashSet<>();
        for (int i = 1; i < args.length; ++i) {
            String arg = args[i];
            if (arg.equals("-normalize")) {
                normalize = true;
            } else if (arg.equals("-filter")) {
                filter = true;
            } else if (arg.equals("-force")) {
                force = true;
            } else if (arg.equals("-noAdditions")) {
                additionsAllowed = false;
            } else if (arg.equals("-dir")) {
                if (i + 1 >= args.length) {
                    // Guard against "-dir" being the last argument.
                    System.err.println("CrawlDb: option -dir requires a <segments> argument");
                    return -1;
                }
                this.addSegmentsIn(new Path(args[++i]), dirs);
            } else {
                dirs.add(new Path(arg));
            }
        }
        try {
            this.update(new Path(args[0]), dirs.toArray(new Path[dirs.size()]), normalize, filter, additionsAllowed, force);
            return 0;
        }
        catch (Exception e) {
            LOG.error("CrawlDb update: ", e);
            return -1;
        }
    }

    /**
     * Runs the update from a map of arguments.
     *
     * <p>Recognized keys:
     * <ul>
     * <li>{@code normalize}, {@code filter}, {@code force}, {@code noAdditions}:
     * only the presence of the key matters;</li>
     * <li>{@code crawldb}: CrawlDb location ({@code Path} or any object whose
     * {@code toString()} is a path); defaults to {@code crawlId + "/crawldb"};</li>
     * <li>{@code segment_dir}: parent directory whose sub-directories are used
     * as segments;</li>
     * <li>{@code segment}: used only when {@code segment_dir} is absent; a
     * {@code Path}, a {@code CharSequence}, or a {@code Collection} of such values.</li>
     * </ul>
     * If neither segment key is given, the most recently modified entry of
     * {@code crawlId + "/segments"} is used. NOTE: that lookup goes through
     * {@code java.io.File}, i.e. it does not use the Hadoop {@code FileSystem} API.
     *
     * @param args    tool arguments as described above
     * @param crawlId prefix used to derive default CrawlDb and segment locations
     * @return map with key {@code "result"} set to {@code "0"} on success and
     *         {@code "-1"} if the update failed
     * @throws Exception if segment directories cannot be listed, or (as
     *                   {@code IOException}) if no default segment can be found
     */
    @Override
    public Map<String, Object> run(Map<String, Object> args, String crawlId) throws Exception {
        Map<String, Object> results = new HashMap<>();
        boolean normalize = this.getConf().getBoolean("crawldb.url.normalizers", false);
        boolean filter = this.getConf().getBoolean("crawldb.url.filters", false);
        boolean additionsAllowed = this.getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true);
        boolean force = false;
        Set<Path> dirs = new HashSet<>();
        if (args.containsKey("normalize")) {
            normalize = true;
        }
        if (args.containsKey("filter")) {
            filter = true;
        }
        if (args.containsKey("force")) {
            force = true;
        }
        if (args.containsKey("noAdditions")) {
            additionsAllowed = false;
        }

        Path crawlDb = args.containsKey("crawldb")
            ? CrawlDb.toPath(args.get("crawldb"))
            : new Path(crawlId + "/crawldb");

        if (args.containsKey("segment_dir")) {
            this.addSegmentsIn(CrawlDb.toPath(args.get("segment_dir")), dirs);
        } else if (args.containsKey("segment")) {
            Object segments = args.get("segment");
            if (segments instanceof Collection) {
                for (Object segment : (Collection<?>)segments) {
                    dirs.add(CrawlDb.toPath(segment));
                }
            } else if (segments instanceof Path || segments instanceof CharSequence) {
                dirs.add(CrawlDb.toPath(segments));
            } else {
                LOG.warn("CrawlDb update: ignoring unsupported value for argument 'segment': {}", segments);
            }
        } else {
            dirs.add(CrawlDb.newestEntry(crawlId + "/segments"));
        }

        try {
            this.update(crawlDb, dirs.toArray(new Path[dirs.size()]), normalize, filter, additionsAllowed, force);
            results.put("result", Integer.toString(0));
            return results;
        }
        catch (Exception e) {
            LOG.error("CrawlDb update: ", e);
            results.put("result", Integer.toString(-1));
            return results;
        }
    }

    /** Adds all sub-directories of {@code parent} to {@code dirs}. */
    private void addSegmentsIn(Path parent, Set<Path> dirs) throws IOException {
        FileSystem fs = parent.getFileSystem(this.getConf());
        FileStatus[] statuses = fs.listStatus(parent, HadoopFSUtil.getPassDirectoriesFilter(fs));
        dirs.addAll(Arrays.asList(HadoopFSUtil.getPaths(statuses)));
    }

    /** Returns {@code value} itself if it is a {@code Path}, else a path built from its string form. */
    private static Path toPath(Object value) {
        return value instanceof Path ? (Path)value : new Path(value.toString());
    }

    /**
     * Returns the most recently modified entry of the local directory
     * {@code directory}; fails with an {@code IOException} if the directory
     * cannot be listed or is empty.
     */
    private static Path newestEntry(String directory) throws IOException {
        // listFiles() yields null when the path is not a readable directory.
        File[] candidates = new File(directory).listFiles();
        if (candidates == null || candidates.length == 0) {
            throw new IOException("CrawlDb update: no segments found in " + directory);
        }
        File newest = candidates[0];
        for (File candidate : candidates) {
            if (candidate.lastModified() > newest.lastModified()) {
                newest = candidate;
            }
        }
        return new Path(newest.getPath());
    }
}

