/*
 * Decompiled with CFR 0.152.
 */
package org.apache.nutch.tools;

import com.google.common.base.Strings;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FilterOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.lang.invoke.MethodHandles;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.DumpFileUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.TableUtil;
import org.apache.tika.Tika;
import org.codehaus.jackson.map.ObjectMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class FileDumper {
    private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

    /*
     * WARNING - Removed try catching itself - possible behaviour change.
     */
    public void dump(File outputDir, File segmentRootDir, String[] mimeTypes, boolean flatDir, boolean mimeTypeStats, boolean reverseURLDump) throws Exception {
        if (mimeTypes == null) {
            LOG.info("Accepting all mimetypes.");
        }
        HashMap<String, Integer> typeCounts = new HashMap<String, Integer>();
        HashMap<String, Integer> filteredCounts = new HashMap<String, Integer>();
        Configuration conf = NutchConfiguration.create();
        int fileCount = 0;
        File[] segmentDirs = segmentRootDir.listFiles(file -> file.canRead() && file.isDirectory());
        if (segmentDirs == null) {
            LOG.error("No segment directories found in [" + segmentRootDir.getAbsolutePath() + "]");
            return;
        }
        for (File segment : segmentDirs) {
            LOG.info("Processing segment: [" + segment.getAbsolutePath() + "]");
            FilterOutputStream doutputStream = null;
            HashMap<String, String> filenameToUrl = new HashMap<String, String>();
            File segmentDir = new File(segment.getAbsolutePath(), "content");
            File[] partDirs = segmentDir.listFiles(file -> file.canRead() && file.isDirectory());
            if (partDirs == null) {
                LOG.warn("Skipping Corrupt Segment: [{}]", (Object)segment.getAbsolutePath());
                continue;
            }
            for (File partDir : partDirs) {
                try (FileSystem fs = FileSystem.get((Configuration)conf);){
                    String segmentPath = partDir + "/data";
                    Path file2 = new Path(segmentPath);
                    if (!new File(file2.toString()).exists()) {
                        LOG.warn("Skipping segment: [" + segmentPath + "]: no data directory present");
                        continue;
                    }
                    SequenceFile.Reader reader = new SequenceFile.Reader(conf, new SequenceFile.Reader.Option[]{SequenceFile.Reader.file((Path)file2)});
                    Writable key = (Writable)reader.getKeyClass().getConstructor(new Class[0]).newInstance(new Object[0]);
                    Content content = null;
                    while (reader.next(key)) {
                        String outputFullPath;
                        content = new Content();
                        reader.getCurrentValue((Writable)content);
                        String url = key.toString();
                        String baseName = FilenameUtils.getBaseName((String)url);
                        String extension = FilenameUtils.getExtension((String)url);
                        if (extension == null || extension != null && extension.equals("")) {
                            extension = "html";
                        }
                        ByteArrayInputStream bas = null;
                        Boolean filter = false;
                        try {
                            bas = new ByteArrayInputStream(content.getContent());
                            String mimeType = new Tika().detect(content.getContent());
                            this.collectStats(typeCounts, mimeType);
                            if (mimeType != null && (mimeTypes == null || Arrays.asList(mimeTypes).contains(mimeType))) {
                                this.collectStats(filteredCounts, mimeType);
                                filter = true;
                            }
                        }
                        catch (Exception e) {
                            e.printStackTrace();
                            LOG.warn("Tika is unable to detect type for: [" + url + "]");
                        }
                        finally {
                            if (bas != null) {
                                try {
                                    bas.close();
                                }
                                catch (Exception e) {}
                            }
                        }
                        if (!filter.booleanValue() || mimeTypeStats) continue;
                        String md5Ofurl = DumpFileUtil.getUrlMD5(url);
                        String fullDir = outputDir.getAbsolutePath();
                        if (!flatDir && !reverseURLDump) {
                            fullDir = DumpFileUtil.createTwoLevelsDirectory(fullDir, md5Ofurl);
                        }
                        if (Strings.isNullOrEmpty((String)fullDir)) continue;
                        if (reverseURLDump) {
                            String[] reversedURL = TableUtil.reverseUrl(url).split(":");
                            reversedURL[0] = reversedURL[0].replace('.', '/');
                            String reversedURLPath = reversedURL[0] + "/" + DigestUtils.sha256Hex((String)url).toUpperCase();
                            outputFullPath = String.format("%s/%s", fullDir, reversedURLPath);
                            String[] splitPath = outputFullPath.split("/");
                            File fullOutputDir = new File(StringUtils.join((Object[])Arrays.copyOf(splitPath, splitPath.length - 1), (String)"/"));
                            if (!fullOutputDir.exists()) {
                                if (!fullOutputDir.mkdirs()) {
                                    // empty if block
                                }
                                throw new Exception("Unable to create: [" + fullOutputDir.getAbsolutePath() + "]");
                            }
                        } else {
                            outputFullPath = String.format("%s/%s", fullDir, DumpFileUtil.createFileName(md5Ofurl, baseName, extension));
                        }
                        filenameToUrl.put(outputFullPath, url);
                        File outputFile = new File(outputFullPath);
                        if (!outputFile.exists()) {
                            LOG.info("Writing: [" + outputFullPath + "]");
                            FileOutputStream output = null;
                            try {
                                output = new FileOutputStream(outputFile);
                                IOUtils.write((byte[])content.getContent(), (OutputStream)output);
                            }
                            catch (Exception e) {
                                LOG.warn("Write Error: [" + outputFullPath + "]");
                                e.printStackTrace();
                            }
                            finally {
                                if (output != null) {
                                    output.flush();
                                    try {
                                        output.close();
                                    }
                                    catch (Exception exception) {}
                                }
                            }
                            ++fileCount;
                            continue;
                        }
                        LOG.info("Skipping writing: [" + outputFullPath + "]: file already exists");
                    }
                    reader.close();
                }
                finally {
                    if (doutputStream != null) {
                        try {
                            doutputStream.close();
                        }
                        catch (Exception exception) {}
                    }
                }
            }
            String filenameToUrlFilePath = String.format("%s/%s_filenameToUrl.json", outputDir.getAbsolutePath(), segment.getName());
            new ObjectMapper().writeValue(new File(filenameToUrlFilePath), filenameToUrl);
        }
        LOG.info("Dumper File Stats: " + DumpFileUtil.displayFileTypes(typeCounts, filteredCounts));
        if (mimeTypeStats) {
            System.out.println("Dumper File Stats: " + DumpFileUtil.displayFileTypes(typeCounts, filteredCounts));
        }
    }

    public static void main(String[] args) throws Exception {
        Option helpOpt = new Option("h", "help", false, "show this help message");
        OptionBuilder.withArgName((String)"outputDir");
        OptionBuilder.hasArg();
        OptionBuilder.withDescription((String)"output directory (which will be created) to host the raw data");
        Option outputOpt = OptionBuilder.create((String)"outputDir");
        OptionBuilder.withArgName((String)"segment");
        OptionBuilder.hasArgs();
        OptionBuilder.withDescription((String)"the segment(s) to use");
        Option segOpt = OptionBuilder.create((String)"segment");
        OptionBuilder.withArgName((String)"mimetype");
        OptionBuilder.hasArgs();
        OptionBuilder.withDescription((String)"an optional list of mimetypes to dump, excluding all others. Defaults to all.");
        Option mimeOpt = OptionBuilder.create((String)"mimetype");
        OptionBuilder.withArgName((String)"mimeStats");
        OptionBuilder.withDescription((String)"only display mimetype stats for the segment(s) instead of dumping file.");
        Option mimeStat = OptionBuilder.create((String)"mimeStats");
        OptionBuilder.withArgName((String)"flatdir");
        OptionBuilder.withDescription((String)"optionally specify that the output directory should only contain files.");
        Option dirStructureOpt = OptionBuilder.create((String)"flatdir");
        OptionBuilder.withArgName((String)"reverseUrlDirs");
        OptionBuilder.withDescription((String)"optionally specify to use reverse URL folders for output structure.");
        Option reverseURLOutput = OptionBuilder.create((String)"reverseUrlDirs");
        Options options = new Options();
        options.addOption(helpOpt);
        options.addOption(outputOpt);
        options.addOption(segOpt);
        options.addOption(mimeOpt);
        options.addOption(mimeStat);
        options.addOption(dirStructureOpt);
        options.addOption(reverseURLOutput);
        GnuParser parser = new GnuParser();
        try {
            CommandLine line = parser.parse(options, args);
            if (line.hasOption("help") || !line.hasOption("outputDir") || !line.hasOption("segment")) {
                HelpFormatter formatter = new HelpFormatter();
                formatter.printHelp("FileDumper", options, true);
                return;
            }
            File outputDir = new File(line.getOptionValue("outputDir"));
            File segmentRootDir = new File(line.getOptionValue("segment"));
            String[] mimeTypes = line.getOptionValues("mimetype");
            boolean flatDir = line.hasOption("flatdir");
            boolean shouldDisplayStats = false;
            if (line.hasOption("mimeStats")) {
                shouldDisplayStats = true;
            }
            boolean reverseURLDump = false;
            if (line.hasOption("reverseUrlDirs")) {
                reverseURLDump = true;
            }
            if (!outputDir.exists()) {
                LOG.warn("Output directory: [" + outputDir.getAbsolutePath() + "]: does not exist, creating it.");
                if (!shouldDisplayStats && !outputDir.mkdirs()) {
                    throw new Exception("Unable to create: [" + outputDir.getAbsolutePath() + "]");
                }
            }
            FileDumper dumper = new FileDumper();
            dumper.dump(outputDir, segmentRootDir, mimeTypes, flatDir, shouldDisplayStats, reverseURLDump);
        }
        catch (Exception e) {
            LOG.error("FileDumper: " + org.apache.hadoop.util.StringUtils.stringifyException((Throwable)e));
            e.printStackTrace();
            return;
        }
    }

    private void collectStats(Map<String, Integer> typeCounts, String mimeType) {
        typeCounts.put(mimeType, typeCounts.containsKey(mimeType) ? typeCounts.get(mimeType) + 1 : 1);
    }
}

