/*
 * Decompiled with CFR 0.152.
 */
package org.apache.nutch.crawl;

import java.io.File;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Random;
import java.util.concurrent.TimeUnit;
import org.apache.commons.lang3.time.StopWatch;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.Inlink;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.crawl.LinkDbMerger;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.util.HadoopFSUtil;
import org.apache.nutch.util.LockUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.NutchTool;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class LinkDb
extends NutchTool
implements Tool {
    /** Logger for this class; the class is resolved through a method-handle lookup. */
    private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
    /**
     * Configuration key. When true (this class reads it with a default of {@code true}), links
     * whose target host equals the source host are skipped by the mapper.
     */
    public static final String IGNORE_INTERNAL_LINKS = "linkdb.ignore.internal.links";
    /**
     * Configuration key. When true (this class reads it with a default of {@code false}) and
     * internal links are not ignored, links whose target host differs from the source host are
     * skipped by the mapper.
     */
    public static final String IGNORE_EXTERNAL_LINKS = "linkdb.ignore.external.links";
    /** Name of the sub-directory of the linkdb path that holds the installed link database. */
    public static final String CURRENT_NAME = "current";
    /** Name of the lock file created under the linkdb path while an update is running. */
    public static final String LOCK_NAME = ".locked";

    /** Creates an instance without setting a configuration. */
    public LinkDb() {
        super();
    }

    /**
     * Creates an instance and hands the given configuration to {@code setConf}.
     *
     * @param conf configuration this tool should use
     */
    public LinkDb(Configuration conf) {
        super();
        setConf(conf);
    }

    /**
     * Extracts the lower-cased host name from a URL string.
     *
     * <p>Lower-casing is done with {@code Locale.ROOT} so the result does not depend on the JVM's
     * default locale. With a Turkish default locale, for example, {@code "I"} would otherwise be
     * mapped to a dotless {@code "ı"} and host comparisons would fail.
     *
     * @param url URL to parse
     * @return the host in lower case, or {@code null} if {@code url} cannot be parsed as a URL
     */
    private static String getHost(String url) {
        try {
            return new URL(url).getHost().toLowerCase(Locale.ROOT);
        } catch (MalformedURLException e) {
            // Callers treat null as "host unknown".
            return null;
        }
    }

    /**
     * Inverts the links of the segments listed under {@code segmentsDir}.
     *
     * <p>The directory is listed with the filter returned by
     * {@code HadoopFSUtil.getPassDirectoriesFilter}; the resulting paths are passed to
     * {@link #invert(Path, Path[], boolean, boolean, boolean)}.
     *
     * @param linkDb      link database to create or update
     * @param segmentsDir parent directory of the segments
     * @param normalize   whether link URLs are normalized
     * @param filter      whether link URLs are filtered
     * @param force       passed to the lock-file creation of the array overload
     * @throws IOException            on file-system or job errors
     * @throws InterruptedException   if a job is interrupted
     * @throws ClassNotFoundException if a job class cannot be found
     */
    public void invert(Path linkDb, Path segmentsDir, boolean normalize, boolean filter, boolean force) throws IOException, InterruptedException, ClassNotFoundException {
        FileSystem segmentsFs = segmentsDir.getFileSystem(getConf());
        FileStatus[] listing = segmentsFs.listStatus(segmentsDir, HadoopFSUtil.getPassDirectoriesFilter(segmentsFs));
        Path[] segments = HadoopFSUtil.getPaths(listing);
        invert(linkDb, segments, normalize, filter, force);
    }

    /**
     * Inverts the outlinks of the given segments into inlinks and installs the result as the
     * current link database.
     *
     * <p>Steps: create the inversion job, take the linkdb lock, run the job over each segment's
     * {@code parse_data}, merge with an existing {@code current} linkdb if there is one, and
     * finally {@link #install(Job, Path) install} the output. When a job fails, the temporary
     * output of the inversion job and the lock are cleaned up before the error is propagated.
     *
     * <p>If both {@link #IGNORE_INTERNAL_LINKS} and {@link #IGNORE_EXTERNAL_LINKS} are enabled
     * there is nothing to do: the lock is released and the method returns without running a job.
     *
     * @param linkDb    link database to create or update
     * @param segments  segment directories whose {@code parse_data} is read
     * @param normalize whether link URLs are normalized
     * @param filter    whether link URLs are filtered
     * @param force     passed to {@code LockUtil.createLockFile}
     * @throws IOException            on file-system or job errors
     * @throws InterruptedException   if a job is interrupted
     * @throws ClassNotFoundException if a job class cannot be found
     * @throws RuntimeException       if a job completes unsuccessfully
     */
    public void invert(Path linkDb, Path[] segments, boolean normalize, boolean filter, boolean force) throws IOException, InterruptedException, ClassNotFoundException {
        Job job = LinkDb.createJob(this.getConf(), linkDb, normalize, filter);
        Path lock = new Path(linkDb, LOCK_NAME);
        FileSystem fs = linkDb.getFileSystem(this.getConf());
        LockUtil.createLockFile(fs, lock, force);
        Path currentLinkDb = new Path(linkDb, CURRENT_NAME);
        Configuration conf = job.getConfiguration();
        StopWatch stopWatch = new StopWatch();
        stopWatch.start();
        LOG.info("LinkDb: starting");
        LOG.info("LinkDb: linkdb: {}", linkDb);
        LOG.info("LinkDb: URL normalize: {}", normalize);
        LOG.info("LinkDb: URL filter: {}", filter);

        // Read each flag once so logging and the early exit agree.
        boolean ignoreInternal = conf.getBoolean(IGNORE_INTERNAL_LINKS, true);
        boolean ignoreExternal = conf.getBoolean(IGNORE_EXTERNAL_LINKS, false);
        if (ignoreInternal) {
            LOG.info("LinkDb: internal links will be ignored.");
        }
        if (ignoreExternal) {
            LOG.info("LinkDb: external links will be ignored.");
        }
        if (ignoreInternal && ignoreExternal) {
            LOG.warn("LinkDb: internal and external links are ignored! Nothing to do, actually. Exiting.");
            LockUtil.removeLockFile(fs, lock);
            return;
        }

        for (Path segment : segments) {
            LOG.info("LinkDb: adding segment: {}", segment);
            FileInputFormat.addInputPath(job, new Path(segment, "parse_data"));
        }

        // Temporary output of the inversion job; removed on failure and after a merge.
        Path newLinkDb = FileOutputFormat.getOutputPath(job);
        runLinkDbJob(job, newLinkDb, lock, fs);

        if (fs.exists(currentLinkDb)) {
            LOG.info("LinkDb: merging with existing linkdb: {}", linkDb);
            job = LinkDbMerger.createMergeJob(this.getConf(), linkDb, normalize, filter);
            FileInputFormat.addInputPath(job, currentLinkDb);
            FileInputFormat.addInputPath(job, newLinkDb);
            runLinkDbJob(job, newLinkDb, lock, fs);
            fs.delete(newLinkDb, true);
        }

        LinkDb.install(job, linkDb);
        stopWatch.stop();
        LOG.info("LinkDb: finished, elapsed: {} ms", stopWatch.getTime(TimeUnit.MILLISECONDS));
    }

    /**
     * Runs {@code job} to completion; on any failure cleans up {@code tempDir} and {@code lock}
     * via {@code NutchJob.cleanupAfterFailure} before propagating the error.
     */
    private static void runLinkDbJob(Job job, Path tempDir, Path lock, FileSystem fs) throws IOException, InterruptedException, ClassNotFoundException {
        boolean success;
        try {
            success = job.waitForCompletion(true);
        } catch (IOException | ClassNotFoundException | InterruptedException e) {
            LOG.error("LinkDb job failed: {}", e.getMessage());
            NutchJob.cleanupAfterFailure(tempDir, lock, fs);
            throw e;
        }
        // Handled outside the try block so a failing cleanup is not retried by the catch above.
        if (!success) {
            String message = NutchJob.getJobFailureLogMessage("LinkDb", job);
            LOG.error(message);
            NutchJob.cleanupAfterFailure(tempDir, lock, fs);
            throw new RuntimeException(message);
        }
    }

    /**
     * Builds the link-inversion job.
     *
     * <p>The job reads sequence files, maps with {@link LinkDbMapper}, combines and reduces with
     * {@code LinkDbMerger.LinkDbMergeReducer}, and writes compressed map files of
     * {@code Text -> Inlinks} to a randomly named directory below {@code linkDb}.
     *
     * <p>URL normalization/filtering is switched on inside this job only when no
     * {@code current} linkdb exists. That is exactly the case in which {@code invert} runs no
     * merge job afterwards, so the mapper is the only place left to apply them.
     *
     * @param config    base configuration for the job
     * @param linkDb    link database directory; also the parent of the job's output directory
     * @param normalize whether link URLs should be normalized
     * @param filter    whether link URLs should be filtered
     * @return the configured, not yet submitted job
     * @throws IOException if the job cannot be created
     */
    private static Job createJob(Configuration config, Path linkDb, boolean normalize, boolean filter) throws IOException {
        Path newLinkDb = new Path(linkDb, Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
        Job job = Job.getInstance(config, "Nutch LinkDb: " + linkDb);
        Configuration conf = job.getConfiguration();

        // One call is enough: a later setJarByClass simply overrides an earlier one.
        job.setJarByClass(LinkDb.class);
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setMapperClass(LinkDbMapper.class);
        job.setCombinerClass(LinkDbMerger.LinkDbMergeReducer.class);
        job.setReducerClass(LinkDbMerger.LinkDbMergeReducer.class);

        if (normalize || filter) {
            try {
                FileSystem fs = linkDb.getFileSystem(config);
                // Test "current", not the linkdb directory itself: a directory left without a
                // "current" sub-directory would otherwise get neither mapper-side nor merge-side
                // normalization/filtering.
                if (!fs.exists(new Path(linkDb, CURRENT_NAME))) {
                    conf.setBoolean("linkdb.url.filters", filter);
                    conf.setBoolean("linkdb.url.normalizer", normalize);
                }
            } catch (Exception e) {
                // Best effort: the job is still created, without mapper-side normalize/filter.
                LOG.warn("LinkDb createJob:: {}", e.getMessage());
            }
        }

        FileOutputFormat.setOutputPath(job, newLinkDb);
        job.setOutputFormatClass(MapFileOutputFormat.class);
        conf.setBoolean("mapreduce.output.fileoutputformat.compress", true);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Inlinks.class);
        return job;
    }

    /**
     * Installs the output of {@code job} as the {@code current} link database.
     *
     * <p>An existing {@code current} directory is first moved to {@code old}, the job output is
     * renamed to {@code current}, {@code old} is deleted, and the lock file is removed.
     *
     * <p>{@code FileSystem.rename} signals failure through its boolean result, so both renames
     * are checked. If the new database cannot be moved into place, the previous one is restored
     * from {@code old} (when there was one) and an {@link IOException} is thrown; the backup is
     * never deleted unless the new database is in place. The lock file is left untouched when an
     * exception is thrown.
     *
     * @param job    finished job whose output path holds the new link database
     * @param linkDb link database directory
     * @throws IOException if a file-system operation fails or a rename is refused
     */
    public static void install(Job job, Path linkDb) throws IOException {
        Configuration conf = job.getConfiguration();
        Path newLinkDb = FileOutputFormat.getOutputPath(job);
        FileSystem fs = linkDb.getFileSystem(conf);
        Path old = new Path(linkDb, "old");
        Path current = new Path(linkDb, CURRENT_NAME);

        boolean hadCurrent = fs.exists(current);
        if (hadCurrent) {
            if (fs.exists(old)) {
                fs.delete(old, true);
            }
            if (!fs.rename(current, old)) {
                throw new IOException("LinkDb: failed to move " + current + " to " + old);
            }
        }

        fs.mkdirs(linkDb);
        if (!fs.rename(newLinkDb, current)) {
            // Put the previous database back so the linkdb is not left without "current".
            if (hadCurrent && !fs.rename(old, current)) {
                LOG.error("LinkDb: could not restore {} from backup {}", current, old);
            }
            throw new IOException("LinkDb: failed to install " + newLinkDb + " as " + current);
        }

        if (fs.exists(old)) {
            fs.delete(old, true);
        }
        LockUtil.removeLockFile(fs, new Path(linkDb, LOCK_NAME));
    }

    /**
     * Command-line entry point: runs this tool through {@code ToolRunner} with a configuration
     * from {@code NutchConfiguration.create()} and exits the JVM with the tool's return code.
     *
     * @param args command-line arguments, see {@link #run(String[])}
     * @throws Exception if the tool run throws
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = NutchConfiguration.create();
        LinkDb tool = new LinkDb();
        int exitCode = ToolRunner.run(conf, tool, args);
        System.exit(exitCode);
    }

    /**
     * Parses the command line and runs the inversion.
     *
     * <p>The first argument is the linkdb path. Remaining arguments are either
     * {@code -dir <segmentsDir>} (all entries listed under that directory are used), one of the
     * flags {@code -force}, {@code -noNormalize}, {@code -noFilter}, or a segment path.
     *
     * @param args command-line arguments
     * @return {@code 0} on success; {@code -1} on a usage error or when the inversion fails
     * @throws Exception if listing a {@code -dir} directory fails
     */
    public int run(String[] args) throws Exception {
        if (args.length < 2) {
            System.err.println("Usage: LinkDb <linkdb> (-dir <segmentsDir> | <seg1> <seg2> ...) [-force] [-noNormalize] [-noFilter]");
            System.err.println("\tlinkdb\toutput LinkDb to create or update");
            System.err.println("\t-dir segmentsDir\tparent directory of several segments, OR");
            System.err.println("\tseg1 seg2 ...\t list of segment directories");
            System.err.println("\t-force\tforce update even if LinkDb appears to be locked (CAUTION advised)");
            System.err.println("\t-noNormalize\tdon't normalize link URLs");
            System.err.println("\t-noFilter\tdon't apply URLFilters to link URLs");
            return -1;
        }
        Path db = new Path(args[0]);
        List<Path> segs = new ArrayList<>();
        boolean filter = true;
        boolean normalize = true;
        boolean force = false;
        for (int i = 1; i < args.length; ++i) {
            String arg = args[i];
            if ("-dir".equals(arg)) {
                // "-dir" as the last argument would otherwise index past the end of args.
                if (i + 1 >= args.length) {
                    System.err.println("LinkDb: -dir requires a segments directory argument");
                    return -1;
                }
                Path segDir = new Path(args[++i]);
                FileSystem fs = segDir.getFileSystem(this.getConf());
                FileStatus[] paths = fs.listStatus(segDir, HadoopFSUtil.getPassDirectoriesFilter(fs));
                segs.addAll(Arrays.asList(HadoopFSUtil.getPaths(paths)));
            } else if ("-noNormalize".equalsIgnoreCase(arg)) {
                normalize = false;
            } else if ("-noFilter".equalsIgnoreCase(arg)) {
                filter = false;
            } else if ("-force".equalsIgnoreCase(arg)) {
                force = true;
            } else {
                // Anything that is not a recognized option is taken as a segment path.
                segs.add(new Path(arg));
            }
        }
        try {
            this.invert(db, segs.toArray(new Path[0]), normalize, filter, force);
            return 0;
        } catch (Exception e) {
            LOG.error("LinkDb: {}", StringUtils.stringifyException(e));
            return -1;
        }
    }

    /**
     * Runs the inversion from a key/value argument map.
     *
     * <p>Recognized keys:
     * <ul>
     *   <li>{@code linkdb} - linkdb path ({@code Path} or anything whose {@code toString()} is a
     *       path); defaults to {@code crawlId + "/linkdb"}</li>
     *   <li>{@code noNormalize}, {@code noFilter}, {@code force} - flags, enabled by presence</li>
     *   <li>{@code segment_dir} - directory whose listed entries are used as segments</li>
     *   <li>{@code segment} - a single {@code Path} or path string, or an {@code Iterable} of
     *       them; consulted only when {@code segment_dir} is absent</li>
     * </ul>
     * When neither segment key is given, the most recently modified entry of the local directory
     * {@code crawlId + "/segments"} is used.
     *
     * @param args    argument map
     * @param crawlId crawl identifier used to derive default paths
     * @return a map whose {@code "result"} entry is {@code "0"} on success and {@code "-1"} when
     *         the inversion fails
     * @throws Exception if segments cannot be listed, or if the default segments directory is
     *                   missing or empty
     */
    @Override
    public Map<String, Object> run(Map<String, Object> args, String crawlId) throws Exception {
        Map<String, Object> results = new HashMap<>();
        Path linkdb = args.containsKey("linkdb")
                ? asPath(args.get("linkdb"))
                : new Path(crawlId + "/linkdb");
        List<Path> segs = new ArrayList<>();
        boolean normalize = !args.containsKey("noNormalize");
        boolean filter = !args.containsKey("noFilter");
        boolean force = args.containsKey("force");

        if (args.containsKey("segment_dir")) {
            Path segmentsDir = asPath(args.get("segment_dir"));
            FileSystem fs = segmentsDir.getFileSystem(this.getConf());
            FileStatus[] paths = fs.listStatus(segmentsDir, HadoopFSUtil.getPassDirectoriesFilter(fs));
            segs.addAll(Arrays.asList(HadoopFSUtil.getPaths(paths)));
        } else if (args.containsKey("segment")) {
            Object segments = args.get("segment");
            if (segments instanceof Path) {
                segs.add((Path) segments);
            } else if (segments instanceof Iterable) {
                // Any collection is accepted, not just ArrayList; elements are converted one by
                // one instead of relying on an unchecked cast of the whole list.
                for (Object segment : (Iterable<?>) segments) {
                    segs.add(asPath(segment));
                }
            } else if (segments instanceof CharSequence) {
                segs.add(new Path(segments.toString()));
            } else {
                LOG.warn("LinkDb: ignoring unsupported 'segment' argument: {}", segments);
            }
        } else {
            String segmentDir = crawlId + "/segments";
            File[] candidates = new File(segmentDir).listFiles();
            // listFiles() yields null when the directory is missing or unreadable.
            if (candidates == null || candidates.length == 0) {
                throw new IOException("LinkDb: no segments found in " + segmentDir);
            }
            // Select the newest entry with a linear scan; a full sort is unnecessary.
            File newest = candidates[0];
            for (File candidate : candidates) {
                if (candidate.lastModified() > newest.lastModified()) {
                    newest = candidate;
                }
            }
            segs.add(new Path(newest.getPath()));
        }

        try {
            this.invert(linkdb, segs.toArray(new Path[0]), normalize, filter, force);
            results.put("result", Integer.toString(0));
        } catch (Exception e) {
            LOG.error("LinkDb: {}", StringUtils.stringifyException(e));
            results.put("result", Integer.toString(-1));
        }
        return results;
    }

    /** Returns {@code value} itself if it is a {@code Path}, else a path built from its string form. */
    private static Path asPath(Object value) {
        return value instanceof Path ? (Path) value : new Path(value.toString());
    }

    /**
     * Mapper that turns the outlinks of one parsed page into single-entry {@code Inlinks}
     * records keyed by the link target.
     *
     * <p>Depending on configuration, internal or external links are skipped, source and target
     * URLs are normalized and filtered, and anchor text is cut to a maximum length.
     */
    public static class LinkDbMapper
    extends Mapper<Text, ParseData, Text, Inlinks> {
        /** Anchors longer than this are truncated ({@code linkdb.max.anchor.length}, default 100). */
        private int maxAnchorLength;
        /** Skip links whose target host equals the source host. */
        private boolean ignoreInternalLinks;
        /** Skip links whose target host differs from the source host. */
        private boolean ignoreExternalLinks;
        /** Set only when {@code linkdb.url.filters} is true; otherwise stays null. */
        private URLFilters urlFilters;
        /** Set only when {@code linkdb.url.normalizer} is true; otherwise stays null. */
        private URLNormalizers urlNormalizers;

        /**
         * Reads the mapper settings from the job configuration.
         *
         * @param context task context providing the configuration
         */
        public void setup(Mapper.Context context) {
            Configuration conf = context.getConfiguration();
            maxAnchorLength = conf.getInt("linkdb.max.anchor.length", 100);
            ignoreInternalLinks = conf.getBoolean(LinkDb.IGNORE_INTERNAL_LINKS, true);
            ignoreExternalLinks = conf.getBoolean(LinkDb.IGNORE_EXTERNAL_LINKS, false);
            if (conf.getBoolean("linkdb.url.filters", false)) {
                urlFilters = new URLFilters(conf);
            }
            if (conf.getBoolean("linkdb.url.normalizer", false)) {
                urlNormalizers = new URLNormalizers(conf, "linkdb");
            }
        }

        /**
         * Emits one {@code (toUrl, Inlinks{fromUrl, anchor})} pair per accepted outlink.
         *
         * @param key       URL of the page the outlinks were found on
         * @param parseData parse result holding the outlinks
         * @param context   task context the pairs are written to
         * @throws IOException          if writing to the context fails
         * @throws InterruptedException if writing to the context is interrupted
         */
        public void map(Text key, ParseData parseData, Mapper.Context context) throws IOException, InterruptedException {
            String sourceUrl = key.toString();
            // The source host is taken from the URL as stored, before normalization.
            String fromHost = LinkDb.getHost(sourceUrl);
            String fromUrl = normalizeAndFilter(sourceUrl);
            if (fromUrl == null) {
                return;
            }

            Outlink[] outlinks = parseData.getOutlinks();
            // One Inlinks instance is reused for every emitted record.
            Inlinks inlinks = new Inlinks();
            for (Outlink outlink : outlinks) {
                String target = outlink.getToUrl();
                if (isIgnored(target, fromHost)) {
                    continue;
                }
                String toUrl = normalizeAndFilter(target);
                if (toUrl == null) {
                    continue;
                }
                inlinks.clear();
                String anchor = outlink.getAnchor();
                if (anchor.length() > maxAnchorLength) {
                    anchor = anchor.substring(0, maxAnchorLength);
                }
                inlinks.add(new Inlink(fromUrl, anchor));
                context.write(new Text(toUrl), inlinks);
            }
        }

        /**
         * Decides whether a link must be skipped because of the internal/external settings.
         * The internal-links setting takes precedence; a target without a resolvable host is
         * skipped whenever either setting applies.
         */
        private boolean isIgnored(String toUrl, String fromHost) {
            if (ignoreInternalLinks) {
                String toHost = LinkDb.getHost(toUrl);
                return toHost == null || toHost.equals(fromHost);
            }
            if (ignoreExternalLinks) {
                String toHost = LinkDb.getHost(toUrl);
                return toHost == null || !toHost.equals(fromHost);
            }
            return false;
        }

        /**
         * Applies the configured normalizers and then the configured filters to {@code url}.
         *
         * @return the resulting URL, or {@code null} if it was rejected or a step threw
         */
        private String normalizeAndFilter(String url) {
            String result = url;
            if (urlNormalizers != null) {
                try {
                    result = urlNormalizers.normalize(result, "linkdb");
                } catch (Exception e) {
                    LOG.warn("Skipping {} :", result, e);
                    result = null;
                }
            }
            if (result != null && urlFilters != null) {
                try {
                    result = urlFilters.filter(result);
                } catch (Exception e) {
                    LOG.warn("Skipping {} :", result, e);
                    result = null;
                }
            }
            return result;
        }
    }
}

