/*
 * Decompiled with CFR 0.152.
 */
package org.apache.nutch.tools;

import com.fasterxml.jackson.dataformat.cbor.CBORFactory;
import com.fasterxml.jackson.dataformat.cbor.CBORGenerator;
import com.ibm.icu.text.SimpleDateFormat;
import java.io.BufferedOutputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.lang.invoke.MethodHandles;
import java.net.MalformedURLException;
import java.net.URL;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.regex.Pattern;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.compress.archivers.ArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.Inlink;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.crawl.LinkDbReader;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.tools.CommonCrawlConfig;
import org.apache.nutch.tools.CommonCrawlFormat;
import org.apache.nutch.tools.CommonCrawlFormatFactory;
import org.apache.nutch.util.DumpFileUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchTool;
import org.apache.tika.Tika;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Tool that walks the {@code content/part-NNNNN/data} sequence files below a
 * segment directory, converts every record through a {@link CommonCrawlFormat}
 * and, unless WARC output was requested, writes the result as CBOR either to
 * individual files or to a single {@code .tar.gz} archive.
 *
 * <p>The no-argument constructor exists so the class can be handed to
 * {@link ToolRunner}; {@link #dump} dereferences the {@link CommonCrawlConfig},
 * so it must be called on an instance created with
 * {@link #CommonCrawlDataDumper(CommonCrawlConfig)} (both {@code run} methods do
 * that).
 *
 * <p>Instances keep the currently open archive streams in fields and are
 * therefore not safe for concurrent {@code dump} calls.
 */
public class CommonCrawlDataDumper extends NutchTool implements Tool {

    private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

    /** Maximum number of distinct inlink URLs attached to one dumped record. */
    private static final int MAX_INLINKS = 5000;

    /** Layout of the {@code Date} metadata value used to derive epoch timestamps. */
    private static final String HTTP_DATE_FORMAT = "EEE, d MMM yyyy HH:mm:ss z";

    /**
     * Matches the data file of a content part. Hadoop {@code Path} strings use
     * {@code '/'} on every platform, so the separator is hard-coded instead of
     * being taken from {@code File.separator} (a backslash would be read as a
     * regex escape on Windows).
     */
    private static final Pattern CONTENT_PART = Pattern.compile(".*/content/part-[0-9]{5}/data");

    /** Bytes 0xd9 0xd9 0xf7: the self-describing CBOR tag (55799), RFC 7049 section 2.4.5. */
    private static final byte[] CBOR_MAGIC_HEADER = {(byte) 0xd9, (byte) 0xd9, (byte) 0xf7};

    private CommonCrawlConfig config = null;

    // Archive stream chain (file -> buffer -> gzip -> tar); only set while a tar.gz dump is running.
    private FileOutputStream fileOutput = null;
    private BufferedOutputStream bufOutput = null;
    private GzipCompressorOutputStream gzipOutput = null;
    private TarArchiveOutputStream tarOutput = null;

    /** Full output paths already added to the archive; used to skip duplicates. */
    private LinkedHashSet<String> fileList = null;

    /**
     * Command-line entry point; delegates to {@link #run(String[])} through
     * {@link ToolRunner} and exits with its return code.
     *
     * @param args command-line arguments
     * @throws Exception if the tool runner fails
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = NutchConfiguration.create();
        int res = ToolRunner.run(conf, new CommonCrawlDataDumper(), args);
        System.exit(res);
    }

    /**
     * Creates a dumper that formats records according to {@code config}.
     *
     * @param config output options (key prefix, reverse key, WARC size, ...)
     */
    public CommonCrawlDataDumper(CommonCrawlConfig config) {
        this.config = config;
    }

    /** Creates a dumper without configuration; intended for {@link ToolRunner} only. */
    public CommonCrawlDataDumper() {
    }

    /**
     * Dumps every record found in the content parts below {@code segmentRootDir}.
     *
     * <p>For each record the MIME type is detected with Tika and counted, the
     * record is rendered through the "WARC" or "JACKSON" {@link CommonCrawlFormat},
     * and - when {@code warc} is false and the record passes the MIME filter - the
     * rendered text is CBOR-encoded and written to {@code outputDir} (one file per
     * record, or one tar entry per record when {@code gzip} is set).
     *
     * <p>A failure while reading one segment part is logged and the part is
     * skipped. An {@link IOException} while rendering a record is treated as
     * fatal: it is logged and the method returns. In every case the archive
     * streams, the linkdb reader and the file system handle are released.
     *
     * @param outputDir      directory receiving the dumped files / archive
     * @param segmentRootDir segment, or directory of segments, to scan recursively
     * @param linkdb         linkdb directory used to attach inlinks, or {@code null}
     * @param gzip           write all records into one {@code .tar.gz} instead of single files
     * @param mimeTypes      MIME types to dump, or {@code null} to dump everything
     * @param epochFilename  derive output names from the reversed URL and the record's epoch time
     * @param extension      extension forced on every output file; empty to derive it from the URL
     * @param warc           render with the WARC format; no CBOR files are written by this method
     * @throws Exception if the segment directory cannot be listed or an output stream cannot be created
     */
    public void dump(File outputDir, File segmentRootDir, File linkdb, boolean gzip, String[] mimeTypes, boolean epochFilename, String extension, boolean warc) throws Exception {
        if (gzip) {
            LOG.info("Gzipping CBOR data has been skipped");
        }
        HashMap<String, Integer> typeCounts = new HashMap<String, Integer>();
        HashMap<String, Integer> filteredCounts = new HashMap<String, Integer>();
        Configuration nutchConfig = NutchConfiguration.create();
        Path segmentRootPath = new Path(segmentRootDir.toString());
        FileSystem fs = segmentRootPath.getFileSystem(nutchConfig);
        LinkDbReader linkDbReader = null;
        boolean archive = gzip && !warc;
        try {
            ArrayList<Path> parts = listContentParts(fs, segmentRootPath);
            if (linkdb != null) {
                linkDbReader = new LinkDbReader(nutchConfig, new Path(linkdb.toString()));
            }
            if (parts.isEmpty()) {
                LOG.error("No segment directories found in {} ", segmentRootDir.getAbsolutePath());
                // NOTE(review): terminates the JVM from library code; kept because
                // command-line callers may rely on exit status 1.
                System.exit(1);
            }
            LOG.info("Found {} segment parts", parts.size());
            if (archive) {
                this.fileList = new LinkedHashSet<String>();
                this.constructNewStream(outputDir);
            }

            // Loop invariants: the MIME filter as a set, one Tika facade, the format name.
            LinkedHashSet<String> wantedTypes = mimeTypes == null ? null : new LinkedHashSet<String>(Arrays.asList(mimeTypes));
            Tika tika = new Tika();
            String formatName = warc ? "WARC" : "JACKSON";
            boolean forcedExtension = extension != null && !extension.isEmpty();

            for (Path segmentPart : parts) {
                LOG.info("Processing segment Part : [ {} ]", segmentPart);
                // try-with-resources: the reader is closed on normal completion, on a
                // skipped part and on the fatal-error return below.
                try (SequenceFile.Reader reader = new SequenceFile.Reader(nutchConfig, SequenceFile.Reader.file(segmentPart))) {
                    Writable key = (Writable) reader.getKeyClass().getConstructor().newInstance();
                    while (reader.next(key)) {
                        Content content = new Content();
                        reader.getCurrentValue((Writable) content);
                        Metadata metadata = content.getMetadata();
                        String url = key.toString();
                        String baseName = FilenameUtils.getBaseName(url);
                        String extensionName = FilenameUtils.getExtension(url);
                        if (forcedExtension) {
                            extensionName = extension;
                        } else if (extensionName == null || extensionName.isEmpty()) {
                            extensionName = "html";
                        }

                        String timestamp = null;
                        String reverseKey = null;
                        if (epochFilename || this.config.getReverseKey()) {
                            try {
                                long epoch = new SimpleDateFormat(HTTP_DATE_FORMAT).parse(this.getDate(metadata.get("Date"))).getTime();
                                timestamp = String.valueOf(epoch);
                            } catch (ParseException pe) {
                                LOG.warn(pe.getMessage());
                            }
                            reverseKey = CommonCrawlDataDumper.reverseUrl(url);
                            if (reverseKey == null) {
                                // reverseUrl already logged the parse failure; skip only this
                                // record instead of failing the rest of the part with an NPE.
                                LOG.warn("Skipping record with unparsable URL: {}", url);
                                continue;
                            }
                            this.config.setReverseKeyValue(reverseKey.replace("/", "_") + "_" + DigestUtils.sha1Hex(url) + "_" + timestamp);
                        }

                        String outputFullPath = null;
                        String outputRelativePath = null;
                        String filename = null;
                        if (!warc) {
                            if (epochFilename) {
                                outputFullPath = DumpFileUtil.createFileNameFromUrl(outputDir.getAbsolutePath(), reverseKey, url, timestamp, extensionName, !gzip);
                                // Directory part of the full path, without truncating its last character.
                                outputRelativePath = parentDirectory(outputFullPath);
                                filename = metadata.get("date") + "." + extensionName;
                            } else {
                                String md5Ofurl = DumpFileUtil.getUrlMD5(url);
                                String fullDir = DumpFileUtil.createTwoLevelsDirectory(outputDir.getAbsolutePath(), md5Ofurl, !gzip);
                                filename = DumpFileUtil.createFileName(md5Ofurl, baseName, extensionName);
                                outputFullPath = String.format("%s/%s", fullDir, filename);
                                String[] fullPathLevels = fullDir.split(Pattern.quote(File.separator));
                                String firstLevelDirName = fullPathLevels[fullPathLevels.length - 2];
                                String secondLevelDirName = fullPathLevels[fullPathLevels.length - 1];
                                // NOTE(review): the two level names are joined without a separator;
                                // left as-is because existing archives use this layout - confirm intent.
                                outputRelativePath = firstLevelDirName + secondLevelDirName;
                            }
                        }

                        boolean selected = mimeTypes == null;
                        String jsonData = "";
                        try {
                            String mimeType = tika.detect(content.getContent());
                            ArrayList<String> inUrls = collectInlinks(linkDbReader, (Text) key);
                            try (CommonCrawlFormat format = CommonCrawlFormatFactory.getCommonCrawlFormat(formatName, nutchConfig, this.config)) {
                                if (inUrls != null) {
                                    format.setInLinks(inUrls);
                                }
                                jsonData = format.getJsonData(url, content, metadata);
                            }
                            this.collectStats(typeCounts, mimeType);
                            if (mimeType != null && wantedTypes != null && wantedTypes.contains(mimeType)) {
                                this.collectStats(filteredCounts, mimeType);
                                selected = true;
                            }
                        } catch (IOException ioe) {
                            LOG.error("Fatal error in creating JSON data: " + ioe.getMessage());
                            return; // cleanup happens in the finally block below
                        }

                        if (warc || !selected) {
                            continue;
                        }
                        byte[] byteData = this.serializeCBORData(jsonData);
                        if (byteData == null) {
                            // Encoding failure was already logged; nothing to write for this record.
                            LOG.warn("Skipping: [" + outputFullPath + "]: no CBOR data produced");
                            continue;
                        }
                        if (gzip) {
                            this.writeArchiveEntry(outputFullPath, outputRelativePath + File.separator + filename, byteData);
                        } else {
                            writePlainFile(outputFullPath, byteData);
                        }
                    }
                } catch (Exception e) {
                    LOG.warn("SKIPPED: {} Because : {}", segmentPart, e.getMessage(), e);
                }
            }

            if (archive) {
                this.closeStream();
            }
            if (!typeCounts.isEmpty()) {
                LOG.info("CommonsCrawlDataDumper File Stats: " + DumpFileUtil.displayFileTypes(typeCounts, filteredCounts));
            }
        } finally {
            if (archive) {
                this.closeStream(); // no-op when already closed above
            }
            closeQuietly(linkDbReader, "linkdb reader");
            // Closed once, after all parts: closing inside the loop invalidated the
            // handle for the remaining parts and for the linkdb reader.
            closeQuietly(fs, "file system");
        }
    }

    /** Recursively lists the {@code content/part-NNNNN/data} files below {@code root}. */
    private static ArrayList<Path> listContentParts(FileSystem fs, Path root) throws IOException {
        ArrayList<Path> parts = new ArrayList<Path>();
        RemoteIterator<LocatedFileStatus> files = fs.listFiles(root, true);
        while (files.hasNext()) {
            LocatedFileStatus status = files.next();
            if (!status.isFile()) {
                continue;
            }
            Path candidate = status.getPath();
            if (CONTENT_PART.matcher(candidate.toString()).matches()) {
                parts.add(candidate);
            }
        }
        return parts;
    }

    /**
     * Returns at most {@link #MAX_INLINKS} distinct source URLs linking to {@code url},
     * in iteration order, or {@code null} when no linkdb is used or it has no entry.
     */
    private static ArrayList<String> collectInlinks(LinkDbReader linkDbReader, Text url) throws IOException {
        if (linkDbReader == null) {
            return null;
        }
        Inlinks inlinks = linkDbReader.getInlinks(url);
        if (inlinks == null) {
            return null;
        }
        LinkedHashSet<String> unique = new LinkedHashSet<String>();
        Iterator<Inlink> iterator = inlinks.iterator();
        while (unique.size() < MAX_INLINKS && iterator.hasNext()) {
            unique.add(iterator.next().getFromUrl());
        }
        return new ArrayList<String>(unique);
    }

    /** Returns {@code path} up to (excluding) its last separator, or "" if it has none. */
    private static String parentDirectory(String path) {
        int cut = path.lastIndexOf(File.separator);
        return cut < 0 ? "" : path.substring(0, cut);
    }

    /** Writes {@code data} to a new file at {@code outputFullPath}; existing files are left untouched. */
    private static void writePlainFile(String outputFullPath, byte[] data) throws IOException {
        File outputFile = new File(outputFullPath);
        if (outputFile.exists()) {
            LOG.info("Skipping writing: [" + outputFullPath + "]: file already exists");
            return;
        }
        LOG.info("Writing: [" + outputFullPath + "]");
        try (OutputStream out = new FileOutputStream(outputFile)) {
            out.write(data);
        }
    }

    /** Adds {@code data} to the open tar archive under {@code entryName}, once per output path. */
    private void writeArchiveEntry(String outputFullPath, String entryName, byte[] data) throws IOException {
        if (!this.fileList.add(outputFullPath)) {
            LOG.info("Skipping compressing: [" + outputFullPath + "]: file already exists");
            return;
        }
        LOG.info("Compressing: [" + outputFullPath + "]");
        TarArchiveEntry tarEntry = new TarArchiveEntry(entryName);
        tarEntry.setSize(data.length);
        this.tarOutput.putArchiveEntry(tarEntry);
        this.tarOutput.write(data);
        this.tarOutput.closeArchiveEntry();
    }

    /** Closes {@code resource} if present, logging instead of propagating any failure. */
    private static void closeQuietly(AutoCloseable resource, String description) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (Exception e) {
            LOG.warn("Error in closing {}: {}", description, e.getMessage());
        }
    }

    /**
     * Finishes the tar archive and closes the whole stream chain, outermost first.
     * Every stream is attempted even if an earlier one fails, and the fields are
     * cleared afterwards so that calling this method again is a no-op.
     */
    private void closeStream() {
        if (this.tarOutput != null) {
            try {
                this.tarOutput.finish();
            } catch (IOException ioe) {
                LOG.warn("Error in closing stream: " + ioe.getMessage());
            }
        }
        OutputStream[] outermostFirst = {this.tarOutput, this.gzipOutput, this.bufOutput, this.fileOutput};
        for (OutputStream stream : outermostFirst) {
            if (stream == null) {
                continue;
            }
            try {
                stream.close();
            } catch (IOException ioe) {
                LOG.warn("Error in closing stream: " + ioe.getMessage());
            }
        }
        this.tarOutput = null;
        this.gzipOutput = null;
        this.bufOutput = null;
        this.fileOutput = null;
    }

    /**
     * Opens a new {@code yyyyMMddHHmm.tar.gz} archive in {@code outputDir}.
     * The hour uses the 24-hour field so that archives created twelve hours apart
     * do not get the same name. If any stream in the chain cannot be created, the
     * ones already opened are closed before the exception is rethrown.
     *
     * @throws IOException if the archive file or one of its wrappers cannot be created
     */
    private void constructNewStream(File outputDir) throws IOException {
        String archiveName = new SimpleDateFormat("yyyyMMddHHmm'.tar.gz'").format(new Date());
        LOG.info("Creating a new gzip archive: " + archiveName);
        try {
            this.fileOutput = new FileOutputStream(new File(outputDir, archiveName));
            this.bufOutput = new BufferedOutputStream(this.fileOutput);
            this.gzipOutput = new GzipCompressorOutputStream(this.bufOutput);
            this.tarOutput = new TarArchiveOutputStream(this.gzipOutput);
            this.tarOutput.setLongFileMode(TarArchiveOutputStream.LONGFILE_GNU);
        } catch (IOException | RuntimeException e) {
            this.closeStream();
            throw e;
        }
    }

    /** Writes the three-byte self-describing CBOR tag in front of the payload. */
    private void writeMagicHeader(CBORGenerator generator) throws IOException {
        generator.writeBytes(CBOR_MAGIC_HEADER, 0, CBOR_MAGIC_HEADER.length);
    }

    /**
     * Encodes {@code jsonData} as a CBOR text string preceded by the magic header.
     *
     * @return the encoded bytes, or {@code null} if encoding (or closing the
     *         generator) failed; the failure is logged
     */
    private byte[] serializeCBORData(String jsonData) {
        ByteArrayOutputStream stream = new ByteArrayOutputStream();
        try (CBORGenerator generator = new CBORFactory().createGenerator((OutputStream) stream)) {
            this.writeMagicHeader(generator);
            generator.writeString(jsonData);
            generator.flush();
            return stream.toByteArray();
        } catch (Exception e) {
            LOG.warn("CBOR encoding failed: " + e.getMessage());
            return null;
        }
    }

    /** Increments the counter kept for {@code mimeType} in {@code typeCounts}. */
    private void collectStats(Map<String, Integer> typeCounts, String mimeType) {
        Integer seen = typeCounts.get(mimeType);
        typeCounts.put(mimeType, seen == null ? 1 : seen + 1);
    }

    /** Returns {@code timestamp}, or the current time in {@link #HTTP_DATE_FORMAT} when it is null or empty. */
    private String getDate(String timestamp) {
        if (timestamp != null && !timestamp.isEmpty()) {
            return timestamp;
        }
        return new SimpleDateFormat(HTTP_DATE_FORMAT).format(new Date());
    }

    /**
     * Reverses the dot-separated labels of the URL's host and joins them with
     * {@code '/'}, e.g. {@code http://www.example.com/a} gives {@code com/example/www}.
     *
     * @param urlString absolute URL
     * @return the reversed host, an empty string when the host has no labels,
     *         or {@code null} (after logging) when the URL cannot be parsed
     */
    public static String reverseUrl(String urlString) {
        try {
            String[] hostPart = new URL(urlString).getHost().replace('.', '/').split("/");
            if (hostPart.length == 0) {
                // Host consisted of dots only; split() dropped every (empty) label.
                return "";
            }
            StringBuilder sb = new StringBuilder(hostPart[hostPart.length - 1]);
            for (int i = hostPart.length - 2; i >= 0; --i) {
                sb.append('/').append(hostPart[i]);
            }
            return sb.toString();
        } catch (MalformedURLException e) {
            LOG.error("Failed to parse URL: {}", urlString);
            return null;
        }
    }

    /**
     * Parses the command line, builds a {@link CommonCrawlConfig} and runs {@link #dump}.
     *
     * @param args command-line arguments; {@code -outputDir} and {@code -segment} are mandatory
     * @return 0 on success or when only the usage text was printed, -1 on any failure
     */
    @Override
    @SuppressWarnings("static-access")
    public int run(String[] args) throws Exception {
        Option helpOpt = new Option("h", "help", false, "show this help message.");
        Option outputOpt = OptionBuilder.withArgName("outputDir").hasArg()
            .withDescription("output directory (which will be created) to host the CBOR data.")
            .create("outputDir");
        Option warcOpt = new Option("warc", "export to a WARC file");
        Option segOpt = OptionBuilder.withArgName("segment").hasArgs()
            .withDescription("the segment or directory containing segments to use")
            .create("segment");
        Option mimeOpt = OptionBuilder.isRequired(false).withArgName("mimetype").hasArgs()
            .withDescription("an optional list of mimetypes to dump, excluding all others. Defaults to all.")
            .create("mimetype");
        Option gzipOpt = OptionBuilder.withArgName("gzip").hasArg(false)
            .withDescription("an optional flag indicating whether to additionally gzip the data.")
            .create("gzip");
        Option keyPrefixOpt = OptionBuilder.withArgName("keyPrefix").hasArg(true)
            .withDescription("an optional prefix for key in the output format.")
            .create("keyPrefix");
        Option simpleDateFormatOpt = OptionBuilder.withArgName("SimpleDateFormat").hasArg(false)
            .withDescription("an optional format for timestamp in GMT epoch milliseconds.")
            .create("SimpleDateFormat");
        Option epochFilenameOpt = OptionBuilder.withArgName("epochFilename").hasArg(false)
            .withDescription("an optional format for output filename.")
            .create("epochFilename");
        Option jsonArrayOpt = OptionBuilder.withArgName("jsonArray").hasArg(false)
            .withDescription("an optional format for JSON output.")
            .create("jsonArray");
        Option reverseKeyOpt = OptionBuilder.withArgName("reverseKey").hasArg(false)
            .withDescription("an optional format for key value in JSON output.")
            .create("reverseKey");
        Option extensionOpt = OptionBuilder.withArgName("extension").hasArg(true)
            .withDescription("an optional file extension for output documents.")
            .create("extension");
        Option sizeOpt = OptionBuilder.withArgName("warcSize").hasArg(true).withType(Number.class)
            .withDescription("an optional file size in bytes for the WARC file(s)")
            .create("warcSize");
        Option linkDbOpt = OptionBuilder.withArgName("linkdb").hasArg(true)
            .withDescription("an optional linkdb parameter to include inlinks in dump files")
            .isRequired(false)
            .create("linkdb");

        Options options = new Options();
        options.addOption(helpOpt);
        options.addOption(outputOpt);
        options.addOption(segOpt);
        options.addOption(warcOpt);
        options.addOption(mimeOpt);
        options.addOption(gzipOpt);
        options.addOption(keyPrefixOpt);
        options.addOption(simpleDateFormatOpt);
        options.addOption(epochFilenameOpt);
        options.addOption(jsonArrayOpt);
        options.addOption(reverseKeyOpt);
        options.addOption(extensionOpt);
        options.addOption(sizeOpt);
        options.addOption(linkDbOpt);

        GnuParser parser = new GnuParser();
        try {
            CommandLine line = parser.parse(options, args);
            if (line.hasOption("help") || !line.hasOption("outputDir") || !line.hasOption("segment")) {
                HelpFormatter formatter = new HelpFormatter();
                formatter.printHelp(CommonCrawlDataDumper.class.getName(), options, true);
                return 0;
            }
            File outputDir = new File(line.getOptionValue("outputDir"));
            File segmentRootDir = new File(line.getOptionValue("segment"));
            String[] mimeTypes = line.getOptionValues("mimetype");
            boolean gzip = line.hasOption("gzip");
            boolean epochFilename = line.hasOption("epochFilename");
            String keyPrefix = line.getOptionValue("keyPrefix", "");
            boolean simpleDateFormat = line.hasOption("SimpleDateFormat");
            boolean jsonArray = line.hasOption("jsonArray");
            boolean reverseKey = line.hasOption("reverseKey");
            String extension = line.getOptionValue("extension", "");
            boolean warc = line.hasOption("warc");

            // A Number-typed option may be parsed to Long or Double depending on the
            // text given, so go through Number instead of casting to Long.
            Object parsedWarcSize = line.getParsedOptionValue("warcSize");
            long warcSize = parsedWarcSize instanceof Number ? ((Number) parsedWarcSize).longValue() : 0L;

            String linkdbPath = line.getOptionValue("linkdb");
            File linkdb = linkdbPath == null ? null : new File(linkdbPath);

            CommonCrawlConfig config = new CommonCrawlConfig();
            config.setKeyPrefix(keyPrefix);
            config.setSimpleDateFormat(simpleDateFormat);
            config.setJsonArray(jsonArray);
            config.setReverseKey(reverseKey);
            config.setCompressed(gzip);
            config.setWarcSize(warcSize);
            config.setOutputDir(line.getOptionValue("outputDir"));
            if (!outputDir.exists()) {
                LOG.warn("Output directory: [" + outputDir.getAbsolutePath() + "]: does not exist, creating it.");
                if (!outputDir.mkdirs()) {
                    throw new Exception("Unable to create: [" + outputDir.getAbsolutePath() + "]");
                }
            }
            CommonCrawlDataDumper dumper = new CommonCrawlDataDumper(config);
            dumper.dump(outputDir, segmentRootDir, linkdb, gzip, mimeTypes, epochFilename, extension, warc);
        } catch (Exception e) {
            LOG.error(CommonCrawlDataDumper.class.getName() + ": " + StringUtils.stringifyException(e));
            // Kept in addition to the log entry so the failure stays visible on the
            // console even when logging only goes to a file.
            e.printStackTrace();
            return -1;
        }
        return 0;
    }

    /**
     * Runs a dump from a map of arguments. Recognised keys: {@code outputDir} and
     * {@code segment_dir} (required), {@code mimetypes} (list of strings),
     * {@code keyPrefix}, {@code extension}, {@code warcSize} (number) and the
     * boolean flags {@code gzip}, {@code epochFilename}, {@code simpleDateFormat},
     * {@code jsonArray}, {@code reverseKey}, {@code warc}. No linkdb is used.
     *
     * @param args    argument map as described above
     * @param crawlId not used by this tool
     * @return always {@code null}
     * @throws IllegalArgumentException if a required argument is missing
     * @throws Exception if the output directory cannot be created or the dump fails
     */
    @Override
    public Map<String, Object> run(Map<String, Object> args, String crawlId) throws Exception {
        String outputDirName = requiredArg(args, "outputDir");
        File outputDir = new File(outputDirName);
        File segmentRootDir = new File(requiredArg(args, "segment_dir"));

        String[] mimeTypes = null;
        Object rawMimeTypes = args.get("mimetypes");
        if (rawMimeTypes != null) {
            ArrayList<?> mimeTypesList = (ArrayList<?>) rawMimeTypes;
            mimeTypes = new String[mimeTypesList.size()];
            int i = 0;
            for (Object mimeType : mimeTypesList) {
                mimeTypes[i++] = (String) mimeType;
            }
        }

        String keyPrefix = stringArg(args, "keyPrefix", "");
        String extension = stringArg(args, "extension", "");
        boolean gzip = booleanArg(args, "gzip");
        boolean epochFilename = booleanArg(args, "epochFilename");
        boolean simpleDateFormat = booleanArg(args, "simpleDateFormat");
        boolean jsonArray = booleanArg(args, "jsonArray");
        boolean reverseKey = booleanArg(args, "reverseKey");
        boolean warc = booleanArg(args, "warc");

        // Accept any numeric type (callers may pass Integer as well as Long) or a numeric string.
        Object rawWarcSize = args.get("warcSize");
        long warcSize = 0L;
        if (rawWarcSize instanceof Number) {
            warcSize = ((Number) rawWarcSize).longValue();
        } else if (rawWarcSize != null) {
            warcSize = Long.parseLong(rawWarcSize.toString().trim());
        }

        CommonCrawlConfig config = new CommonCrawlConfig();
        config.setKeyPrefix(keyPrefix);
        config.setSimpleDateFormat(simpleDateFormat);
        config.setJsonArray(jsonArray);
        config.setReverseKey(reverseKey);
        config.setCompressed(gzip);
        config.setWarcSize(warcSize);
        config.setOutputDir(outputDirName);
        if (!outputDir.exists() && !outputDir.mkdirs()) {
            throw new Exception("Unable to create: [" + outputDir.getAbsolutePath() + "]");
        }
        CommonCrawlDataDumper dumper = new CommonCrawlDataDumper(config);
        dumper.dump(outputDir, segmentRootDir, null, gzip, mimeTypes, epochFilename, extension, warc);
        return null;
    }

    /** Returns the string value of a mandatory argument or fails with a descriptive message. */
    private static String requiredArg(Map<String, Object> args, String name) {
        Object value = args.get(name);
        if (value == null) {
            throw new IllegalArgumentException("Missing required argument: " + name);
        }
        return value.toString();
    }

    /** Returns the string value of an optional argument, or {@code fallback} when absent or null. */
    private static String stringArg(Map<String, Object> args, String name, String fallback) {
        Object value = args.get(name);
        return value == null ? fallback : value.toString();
    }

    /** Returns an optional flag; absent or null means false, non-Boolean values are parsed from their text. */
    private static boolean booleanArg(Map<String, Object> args, String name) {
        Object value = args.get(name);
        if (value instanceof Boolean) {
            return (Boolean) value;
        }
        return value != null && Boolean.parseBoolean(value.toString());
    }
}

