/*
 * Decompiled with CFR 0.152.
 */
package org.apache.nutch.tools.warc;

import com.google.gson.Gson;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.martinkl.warc.WARCRecord;
import com.martinkl.warc.WARCWritable;
import com.martinkl.warc.mapreduce.WARCOutputFormat;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.net.URI;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import java.util.UUID;
import java.util.concurrent.TimeUnit;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang3.time.StopWatch;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseSegment;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.tools.WARCUtils;
import org.apache.nutch.util.HadoopFSUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class WARCExporter
extends Configured
implements Tool {
    private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
    private static final String ONLY_SUCCESSFUL_RESPONSES = "warc.exporter.only.successful.responses";
    private static final String CRLF = "\r\n";
    private static final byte[] CRLF_BYTES = new byte[]{13, 10};

    /**
     * Creates an exporter without a Hadoop configuration; equivalent to
     * calling {@link #WARCExporter(Configuration)} with {@code null}.
     */
    public WARCExporter() {
        this(null);
    }

    /**
     * Creates an exporter and hands the given configuration to the
     * {@link Configured} superclass constructor.
     *
     * @param conf configuration passed through to {@code Configured}
     */
    public WARCExporter(Configuration conf) {
        super(conf);
    }

    /**
     * Configures and runs the MapReduce job that exports the given segments as
     * WARC records.
     *
     * <p>For every segment the {@code content} and {@code crawl_fetch}
     * sub-directories are always added as input; {@code parse_data} and
     * {@code parse_text} are added only when requested.
     *
     * @param output path of the job output directory
     * @param segments segment directories to export
     * @param onlySuccessfulResponses stored in the job configuration under
     *        {@code warc.exporter.only.successful.responses} for the reducer
     * @param includeParseData whether to read {@code parse_data} as well
     * @param includeParseText whether to read {@code parse_text} as well
     * @return {@code 0} when the job completed successfully, {@code -1} when
     *         running the job raised an {@link IOException},
     *         {@link ClassNotFoundException} or {@link InterruptedException}
     * @throws IOException if the job cannot be created or configured
     * @throws RuntimeException if the job ran to completion but reported failure
     */
    public int generateWARC(String output, List<Path> segments, boolean onlySuccessfulResponses, boolean includeParseData, boolean includeParseText) throws IOException {
        StopWatch stopWatch = new StopWatch();
        stopWatch.start();
        LOG.info("WARCExporter: starting");
        Job job = Job.getInstance(this.getConf(), "Nutch WARCExporter: " + output);
        job.getConfiguration().setBoolean(ONLY_SUCCESSFUL_RESPONSES, onlySuccessfulResponses);
        for (Path segment : segments) {
            LOG.info("warc-exporter: adding segment: {}", segment);
            FileInputFormat.addInputPath(job, new Path(segment, "content"));
            FileInputFormat.addInputPath(job, new Path(segment, "crawl_fetch"));
            if (includeParseData) {
                FileInputFormat.addInputPath(job, new Path(segment, "parse_data"));
            }
            if (includeParseText) {
                FileInputFormat.addInputPath(job, new Path(segment, "parse_text"));
            }
        }
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setJarByClass(WARCMapReduce.class);
        job.setMapperClass(WARCMapReduce.WARCMapper.class);
        job.setReducerClass(WARCMapReduce.WARCReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NutchWritable.class);
        FileOutputFormat.setOutputPath(job, new Path(output));
        job.setOutputFormatClass(WARCOutputFormat.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(WARCWritable.class);
        try {
            boolean success = job.waitForCompletion(true);
            if (!success) {
                String message = NutchJob.getJobFailureLogMessage("WARCExporter", job);
                LOG.error(message);
                // Not covered by the catch clauses below, so this propagates to the caller.
                throw new RuntimeException(message);
            }
            LOG.info(job.getCounters().toString());
            stopWatch.stop();
            LOG.info("WARCExporter: finished, elapsed: {} ms", stopWatch.getTime(TimeUnit.MILLISECONDS));
        }
        catch (InterruptedException e) {
            // Restore the interrupt flag so callers further up can still observe it.
            Thread.currentThread().interrupt();
            LOG.error("WARCExporter job failed: {}", e.getMessage(), e);
            return -1;
        }
        catch (IOException | ClassNotFoundException e) {
            // Pass the exception as last argument so the stack trace is logged too.
            LOG.error("WARCExporter job failed: {}", e.getMessage(), e);
            return -1;
        }
        return 0;
    }

    /**
     * Command-line entry point: parses the arguments and starts the export.
     *
     * <p>{@code args[0]} is the output path. Every following argument is either
     * one of the flags {@code -onlySuccessfulResponses},
     * {@code -includeParseData}, {@code -includeParseText}, the option
     * {@code -dir <segments>} (adds every sub-directory of the given directory
     * as a segment), or a single segment path.
     *
     * @param args command-line arguments as described above
     * @return the result of {@link #generateWARC}, or {@code -1} when the
     *         arguments are unusable (too few arguments, {@code -dir} without a
     *         value, or no segment resolved)
     * @throws Exception if listing the segments directory or running the
     *         export fails
     */
    @Override
    public int run(String[] args) throws Exception {
        final String usage = "Usage: WARCExporter <output> (<segment> ... | -dir <segments>) [-onlySuccessfulResponses] [-includeParseData] [-includeParseText]";
        if (args.length < 2) {
            System.err.println(usage);
            return -1;
        }
        boolean onlySuccessfulResponses = false;
        boolean includeParseData = false;
        boolean includeParseText = false;
        List<Path> segments = new ArrayList<>();
        for (int i = 1; i < args.length; ++i) {
            String arg = args[i];
            if ("-onlySuccessfulResponses".equals(arg)) {
                onlySuccessfulResponses = true;
            } else if ("-includeParseData".equals(arg)) {
                includeParseData = true;
            } else if ("-includeParseText".equals(arg)) {
                includeParseText = true;
            } else if ("-dir".equals(arg)) {
                // -dir consumes the next argument; fail cleanly when it is missing
                // instead of running off the end of the array.
                if (i + 1 >= args.length) {
                    System.err.println("Missing <segments> directory after -dir");
                    System.err.println(usage);
                    return -1;
                }
                Path dir = new Path(args[++i]);
                FileSystem fs = dir.getFileSystem(this.getConf());
                FileStatus[] fstats = fs.listStatus(dir, HadoopFSUtil.getPassDirectoriesFilter(fs));
                for (Path p : HadoopFSUtil.getPaths(fstats)) {
                    segments.add(p);
                }
            } else {
                segments.add(new Path(arg));
            }
        }
        if (segments.isEmpty()) {
            // Without any segment the job would have no input paths at all.
            System.err.println("No segments to export");
            System.err.println(usage);
            return -1;
        }
        return this.generateWARC(args[0], segments, onlySuccessfulResponses, includeParseData, includeParseText);
    }

    /**
     * Runs the exporter through {@link ToolRunner} with a configuration from
     * {@link NutchConfiguration#create()} and exits the JVM with the tool's
     * return code.
     *
     * @param args command-line arguments forwarded to {@link #run(String[])}
     * @throws Exception if the tool run throws
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = NutchConfiguration.create();
        Tool exporter = new WARCExporter();
        System.exit(ToolRunner.run(conf, exporter, args));
    }

    public static class WARCMapReduce {

        public static class WARCReducer
        extends Reducer<Text, NutchWritable, NullWritable, WARCWritable> {
            Gson gson = new Gson();

            public void reduce(Text key, Iterable<NutchWritable> values, Reducer.Context context) throws IOException, InterruptedException {
                String status;
                boolean onlySuccessfulResponses = context.getConfiguration().getBoolean(WARCExporter.ONLY_SUCCESSFUL_RESPONSES, false);
                ParseData parseData = null;
                ParseText parseText = null;
                Content content = null;
                CrawlDatum cd = null;
                SimpleDateFormat warcdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'", Locale.ENGLISH);
                for (NutchWritable val : values) {
                    Writable value = val.get();
                    if (value instanceof Content) {
                        content = (Content)value;
                        continue;
                    }
                    if (value instanceof CrawlDatum) {
                        cd = (CrawlDatum)value;
                        continue;
                    }
                    if (value instanceof ParseData) {
                        parseData = (ParseData)value;
                        continue;
                    }
                    if (!(value instanceof ParseText)) continue;
                    parseText = (ParseText)value;
                }
                if (content == null) {
                    LOG.info("Missing content for {}", (Object)key);
                    context.getCounter("WARCExporter", "missing content").increment(1L);
                    return;
                }
                if (cd == null) {
                    LOG.info("Missing fetch datum for {}", (Object)key);
                    context.getCounter("WARCExporter", "missing metadata").increment(1L);
                    return;
                }
                if (onlySuccessfulResponses && cd.getStatus() != 33 && cd.getStatus() != 38) {
                    context.getCounter("WARCExporter", "omitted empty response").increment(1L);
                    return;
                }
                Object headersVerbatim = content.getMetadata().get("_response.headers_");
                headersVerbatim = WARCUtils.fixHttpHeaders((String)headersVerbatim, content.getContent().length);
                byte[] httpheaders = new byte[]{};
                if (StringUtils.isNotBlank((String)headersVerbatim)) {
                    if (!((String)headersVerbatim).endsWith("\r\n\r\n")) {
                        headersVerbatim = (String)headersVerbatim + "\r\n\r\n";
                    }
                    httpheaders = ((String)headersVerbatim).getBytes();
                }
                String mainId = UUID.randomUUID().toString();
                StringBuilder buffer = new StringBuilder();
                buffer.append("WARC/1.0");
                buffer.append(WARCExporter.CRLF);
                buffer.append("WARC-Record-ID").append(": ").append("<urn:uuid:").append(mainId).append(">").append(WARCExporter.CRLF);
                int contentLength = 0;
                if (content != null) {
                    contentLength = content.getContent().length;
                }
                buffer.append("Content-Length").append(": ").append(Integer.toString(contentLength += httpheaders.length)).append(WARCExporter.CRLF);
                Date fetchedDate = new Date(cd.getFetchTime());
                buffer.append("WARC-Date").append(": ").append(warcdf.format(fetchedDate)).append(WARCExporter.CRLF);
                String warcTypeValue = "resource";
                if (StringUtils.isNotBlank((String)headersVerbatim)) {
                    warcTypeValue = "response";
                }
                buffer.append("WARC-Type").append(": ").append(warcTypeValue).append(WARCExporter.CRLF);
                String IP = content.getMetadata().get("_ip_");
                if (StringUtils.isNotBlank((String)IP)) {
                    buffer.append("WARC-IP-Address").append(": ").append("IP").append(WARCExporter.CRLF);
                }
                if ((status = CrawlDatum.getStatusName(cd.getStatus())).equalsIgnoreCase("STATUS_FETCH_SUCCESS") && ParseSegment.isTruncated(content)) {
                    buffer.append("WARC-Truncated").append(": ").append("unspecified").append(WARCExporter.CRLF);
                }
                try {
                    String normalised = key.toString().replaceAll(" ", "%20");
                    URI uri = URI.create(normalised);
                    buffer.append("WARC-Target-URI").append(": ").append(uri.toASCIIString()).append(WARCExporter.CRLF);
                }
                catch (Exception e) {
                    LOG.error("Invalid URI {} ", (Object)key);
                    context.getCounter("WARCExporter", "invalid URI").increment(1L);
                    return;
                }
                if (warcTypeValue.equals("response")) {
                    buffer.append("Content-Type: application/http; msgtype=response").append(WARCExporter.CRLF);
                }
                ByteArrayOutputStream bos = new ByteArrayOutputStream();
                bos.write(buffer.toString().getBytes("UTF-8"));
                bos.write(CRLF_BYTES);
                bos.write(httpheaders);
                if (content.getContent() != null) {
                    bos.write(content.getContent());
                }
                bos.write(CRLF_BYTES);
                bos.write(CRLF_BYTES);
                try {
                    DataInputStream in = new DataInputStream(new ByteArrayInputStream(bos.toByteArray()));
                    WARCRecord record = new WARCRecord((DataInput)in);
                    context.write((Object)NullWritable.get(), (Object)new WARCWritable(record));
                    context.getCounter("WARCExporter", "records generated").increment(1L);
                }
                catch (IOException | IllegalStateException exception) {
                    LOG.error("Exception when generating WARC resource record for {} : {}", (Object)key, (Object)exception.getMessage());
                    context.getCounter("WARCExporter", "exception").increment(1L);
                }
                if (parseData != null) {
                    buffer = new StringBuilder();
                    JsonObject jsonObject = new JsonObject();
                    jsonObject.add("contentMeta", (JsonElement)this.metadataToJson(parseData.getContentMeta()));
                    jsonObject.add("parseMeta", (JsonElement)this.metadataToJson(parseData.getParseMeta()));
                    StringBuilder payload = new StringBuilder();
                    payload.append(this.gson.toJson((JsonElement)jsonObject));
                    payload.append(WARCExporter.CRLF);
                    buffer.append("WARC/1.0");
                    buffer.append(WARCExporter.CRLF);
                    buffer.append("WARC-Record-ID").append(": ").append("<urn:uuid:").append(UUID.randomUUID().toString()).append(">").append(WARCExporter.CRLF);
                    buffer.append("WARC-Refers-To").append(": ").append("<urn:uuid:").append(mainId).append(">").append(WARCExporter.CRLF);
                    buffer.append("WARC-Date").append(": ").append(warcdf.format(fetchedDate)).append(WARCExporter.CRLF);
                    buffer.append("WARC-Type").append(": ").append("metadata").append(WARCExporter.CRLF);
                    buffer.append("Content-Type").append(": ").append("application/json").append(WARCExporter.CRLF);
                    contentLength = payload.toString().getBytes("UTF-8").length;
                    buffer.append("Content-Length").append(": ").append(Integer.toString(contentLength)).append(WARCExporter.CRLF);
                    try {
                        String normalised = key.toString().replaceAll(" ", "%20");
                        URI uri = URI.create(normalised);
                        buffer.append("WARC-Target-URI").append(": ").append(uri.toASCIIString()).append(WARCExporter.CRLF);
                    }
                    catch (Exception e) {
                        LOG.error("Invalid URI {} ", (Object)key);
                        context.getCounter("WARCExporter", "invalid URI").increment(1L);
                        return;
                    }
                    bos = new ByteArrayOutputStream();
                    bos.write(buffer.toString().getBytes("UTF-8"));
                    bos.write(CRLF_BYTES);
                    bos.write(payload.toString().getBytes("UTF-8"));
                    bos.write(CRLF_BYTES);
                    bos.write(CRLF_BYTES);
                    try {
                        DataInputStream in = new DataInputStream(new ByteArrayInputStream(bos.toByteArray()));
                        WARCRecord record = new WARCRecord((DataInput)in);
                        context.write((Object)NullWritable.get(), (Object)new WARCWritable(record));
                        context.getCounter("WARCExporter", "records generated").increment(1L);
                    }
                    catch (IOException | IllegalStateException exception) {
                        LOG.error("Exception when generating WARC metadata record for {} : {}", new Object[]{key, exception.getMessage(), exception});
                        context.getCounter("WARCExporter", "exception").increment(1L);
                    }
                }
                if (parseText != null) {
                    buffer = new StringBuilder();
                    StringBuilder payload = new StringBuilder();
                    payload.append(parseText);
                    payload.append(WARCExporter.CRLF);
                    buffer.append("WARC/1.0");
                    buffer.append(WARCExporter.CRLF);
                    buffer.append("WARC-Record-ID").append(": ").append("<urn:uuid:").append(UUID.randomUUID().toString()).append(">").append(WARCExporter.CRLF);
                    buffer.append("WARC-Refers-To").append(": ").append("<urn:uuid:").append(mainId).append(">").append(WARCExporter.CRLF);
                    buffer.append("WARC-Date").append(": ").append(warcdf.format(fetchedDate)).append(WARCExporter.CRLF);
                    buffer.append("WARC-Type").append(": ").append("conversion").append(WARCExporter.CRLF);
                    buffer.append("Content-Type").append(": ").append("text/plain").append(WARCExporter.CRLF);
                    contentLength = payload.toString().getBytes("UTF-8").length;
                    buffer.append("Content-Length").append(": ").append(Integer.toString(contentLength)).append(WARCExporter.CRLF);
                    try {
                        String normalised = key.toString().replaceAll(" ", "%20");
                        URI uri = URI.create(normalised);
                        buffer.append("WARC-Target-URI").append(": ").append(uri.toASCIIString()).append(WARCExporter.CRLF);
                    }
                    catch (Exception e) {
                        LOG.error("Invalid URI {} ", (Object)key);
                        context.getCounter("WARCExporter", "invalid URI").increment(1L);
                        return;
                    }
                    bos = new ByteArrayOutputStream();
                    bos.write(buffer.toString().getBytes("UTF-8"));
                    bos.write(CRLF_BYTES);
                    bos.write(payload.toString().getBytes("UTF-8"));
                    bos.write(CRLF_BYTES);
                    bos.write(CRLF_BYTES);
                    try {
                        DataInputStream in = new DataInputStream(new ByteArrayInputStream(bos.toByteArray()));
                        WARCRecord record = new WARCRecord((DataInput)in);
                        context.write((Object)NullWritable.get(), (Object)new WARCWritable(record));
                        context.getCounter("WARCExporter", "records generated").increment(1L);
                    }
                    catch (IOException | IllegalStateException exception) {
                        LOG.error("Exception when generating WARC metadata record for {} : {}", new Object[]{key, exception.getMessage(), exception});
                        context.getCounter("WARCExporter", "exception").increment(1L);
                    }
                }
            }

            protected JsonObject metadataToJson(Metadata meta) {
                JsonObject obj = new JsonObject();
                for (String key : meta.names()) {
                    if (meta.isMultiValued(key)) {
                        obj.add(key, this.gson.toJsonTree((Object)meta.getValues(key)));
                        continue;
                    }
                    obj.addProperty(key, meta.get(key));
                }
                return obj;
            }
        }

        public static class WARCMapper
        extends Mapper<Text, Writable, Text, NutchWritable> {
            public void map(Text key, Writable value, Mapper.Context context) throws IOException, InterruptedException {
                context.write((Object)key, (Object)new NutchWritable(value));
            }
        }
    }
}

