/*
 * Decompiled with CFR 0.152.
 */
package org.apache.nutch.tools;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FilterReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.lang.invoke.MethodHandles;
import java.util.Random;
import java.util.Vector;
import java.util.regex.Pattern;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.MD5Hash;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.xerces.util.XMLChar;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;

/**
 * Command-line tool that SAX-parses an RDF/XML directory dump and prints the
 * {@code about} URL of every selected {@code ExternalPage} element to standard output.
 *
 * <p>Pages can be filtered by topic (the {@code r:id} of the most recent {@code Topic}
 * element), by an "adult" topic prefix, and by a hash-based subset selection.
 */
public class DmozParser {
    private static final Random RANDOM = new Random();
    private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

    /**
     * Number of pages printed so far by this instance. It is never reset, so it
     * accumulates across repeated calls to {@link #parseDmozFile}.
     */
    long pages = 0L;

    /**
     * Parses the given file and prints the URL of each selected page to standard output.
     *
     * <p>Any exception raised while opening or parsing the file is logged and terminates
     * the JVM with a non-zero exit status.
     *
     * @param dmozFile     the XML file to read (decoded as UTF-8)
     * @param subsetDenom  a page is kept only when its skewed hash is divisible by this
     *                     value; {@code 1} keeps every page. Must not be {@code 0}.
     * @param includeAdult when {@code false}, pages whose topic starts with
     *                     {@code "Top/Adult"} are skipped
     * @param skew         value XOR-ed into each URL hash; {@code 0} selects a random skew
     * @param topicPattern if non-null, only pages whose topic fully matches are kept
     * @throws IllegalArgumentException if {@code subsetDenom} is {@code 0}
     * @throws IOException declared for callers; I/O failures during parsing are logged
     *                     and end the JVM instead of propagating
     * @throws SAXException if the parser cannot be configured
     * @throws ParserConfigurationException if no suitable SAX parser can be created
     */
    public void parseDmozFile(File dmozFile, int subsetDenom, boolean includeAdult, int skew, Pattern topicPattern) throws IOException, SAXException, ParserConfigurationException {
        if (subsetDenom == 0) {
            // Would otherwise surface as an ArithmeticException on the first candidate page.
            throw new IllegalArgumentException("subsetDenom must not be 0");
        }

        // Harden the parser against external entities / DOCTYPE declarations (XXE).
        SAXParserFactory parserFactory = SAXParserFactory.newInstance();
        parserFactory.setFeature("http://xml.org/sax/features/external-general-entities", false);
        parserFactory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
        SAXParser parser = parserFactory.newSAXParser();
        XMLReader reader = parser.getXMLReader();
        reader.setFeature("http://xml.org/sax/features/external-general-entities", false);

        RDFProcessor rp = new RDFProcessor(reader, subsetDenom, includeAdult, skew, topicPattern);
        reader.setContentHandler(rp);
        reader.setErrorHandler(rp);
        LOG.info("skew = {}", rp.hashSkew);

        // The BufferedReader layer matters: XMLCharFilter.read() relies on mark/reset.
        try (XMLCharFilter in = new XMLCharFilter(new BufferedReader(new InputStreamReader(new BufferedInputStream(new FileInputStream(dmozFile)), "UTF-8")))) {
            reader.parse(new InputSource(in));
        } catch (Exception e) {
            // Keep the original message text but attach the throwable so the stack trace is logged.
            LOG.error(e.toString(), e);
            // Non-zero status: this is a failure and must not look like success to callers.
            System.exit(1);
        }
    }

    /**
     * Appends every line of {@code topicFile} (read as UTF-8) to {@code topics}.
     *
     * <p>Any exception is logged and terminates the JVM with a non-zero exit status.
     *
     * @param topicFile path of a text file containing one topic per line
     * @param topics    list that receives the lines, in file order
     * @throws IOException declared for callers; failures are logged and end the JVM
     */
    private static void addTopicsFromFile(String topicFile, Vector<String> topics) throws IOException {
        try (BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(topicFile), "UTF-8"))) {
            String line;
            while ((line = in.readLine()) != null) {
                topics.addElement(line);
            }
        } catch (Exception e) {
            LOG.error(e.toString(), e);
            System.exit(1);
        }
    }

    /**
     * Returns the value following the option at {@code optionIndex}.
     *
     * @throws IllegalArgumentException if the option is the last argument
     */
    private static String optionValue(String[] argv, int optionIndex) {
        int valueIndex = optionIndex + 1;
        if (valueIndex >= argv.length) {
            throw new IllegalArgumentException("Missing value for option " + argv[optionIndex]);
        }
        return argv[valueIndex];
    }

    /**
     * Command-line entry point. The first argument is the file to parse; the remaining
     * arguments are the options listed in the usage message. Unrecognised arguments
     * are ignored.
     *
     * @param argv command-line arguments
     * @throws IllegalArgumentException if an option that needs a value is given without one
     * @throws Exception if configuration, file-system setup or parsing fails
     */
    public static void main(String[] argv) throws Exception {
        if (argv.length < 1) {
            System.err.println("Usage: DmozParser <dmoz_file> [-subset <subsetDenominator>] [-includeAdultMaterial] [-skew skew] [-topicFile <topic list file>] [-topic <topic> [-topic <topic> [...]]]");
            return;
        }
        int subsetDenom = 1;
        int skew = 0;
        String dmozFile = argv[0];
        boolean includeAdult = false;
        Pattern topicPattern = null;
        Vector<String> topics = new Vector<String>();
        Configuration conf = NutchConfiguration.create();
        try (FileSystem fs = FileSystem.get(conf)) {
            for (int i = 1; i < argv.length; ++i) {
                String arg = argv[i];
                if ("-includeAdultMaterial".equals(arg)) {
                    includeAdult = true;
                } else if ("-subset".equals(arg)) {
                    subsetDenom = Integer.parseInt(optionValue(argv, i));
                    ++i;
                } else if ("-topic".equals(arg)) {
                    topics.addElement(optionValue(argv, i));
                    ++i;
                } else if ("-topicFile".equals(arg)) {
                    addTopicsFromFile(optionValue(argv, i), topics);
                    ++i;
                } else if ("-skew".equals(arg)) {
                    skew = Integer.parseInt(optionValue(argv, i));
                    ++i;
                }
            }

            DmozParser parser = new DmozParser();
            if (!topics.isEmpty()) {
                // Topics are inserted verbatim (not quoted), so each one acts as a
                // regular-expression prefix alternative: ^(t1|t2|...).*
                StringBuilder regExp = new StringBuilder("^(");
                for (int j = 0; j < topics.size(); ++j) {
                    if (j > 0) {
                        regExp.append('|');
                    }
                    regExp.append(topics.get(j));
                }
                regExp.append(").*");
                LOG.info("Topic selection pattern = {}", regExp);
                topicPattern = Pattern.compile(regExp.toString());
            }
            parser.parseDmozFile(new File(dmozFile), subsetDenom, includeAdult, skew, topicPattern);
        }
    }

    /**
     * SAX handler that tracks the current topic, decides which {@code ExternalPage}
     * elements are selected, and prints the URL of each selected page when its
     * element closes. Title and description text is buffered while a page is
     * selected and discarded when the page ends.
     */
    private class RDFProcessor
    extends DefaultHandler {
        /** URL of the selected page currently being read, or null when none is selected. */
        String curURL = null;
        /** {@code r:id} of the most recent {@code Topic} element; null until one is seen. */
        String curSection = null;
        boolean titlePending = false;
        boolean descPending = false;
        Pattern topicPattern = null;
        StringBuilder title = new StringBuilder();
        StringBuilder desc = new StringBuilder();
        XMLReader reader;
        int subsetDenom;
        int hashSkew;
        boolean includeAdult;
        Locator location;

        /**
         * @param reader        the XML reader this handler is attached to
         * @param subsetDenom   divisor used for hash-based subset selection
         * @param includeAdult  whether topics starting with {@code "Top/Adult"} are kept
         * @param skew          hash skew; {@code 0} means "pick a random one"
         * @param topicPattern  optional pattern the current topic must fully match
         */
        public RDFProcessor(XMLReader reader, int subsetDenom, boolean includeAdult, int skew, Pattern topicPattern) throws IOException {
            this.reader = reader;
            this.subsetDenom = subsetDenom;
            this.includeAdult = includeAdult;
            this.topicPattern = topicPattern;
            this.hashSkew = skew != 0 ? skew : RANDOM.nextInt();
        }

        /**
         * Records the current topic, selects or rejects an {@code ExternalPage}, and
         * flags the start of title/description text for a selected page.
         */
        @Override
        public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException {
            if ("Topic".equals(qName)) {
                this.curSection = atts.getValue("r:id");
            } else if ("ExternalPage".equals(qName)) {
                // curSection is null when no Topic (or one without r:id) precedes this page.
                String section = this.curSection;
                if (!this.includeAdult && section != null && section.startsWith("Top/Adult")) {
                    return;
                }
                // With a topic filter active, a page with no known topic cannot match.
                if (this.topicPattern != null && (section == null || !this.topicPattern.matcher(section).matches())) {
                    return;
                }
                String url = atts.getValue("about");
                if (url == null) {
                    // No URL to hash or print; skip the entry instead of failing.
                    return;
                }
                // Only divisibility is tested, so the sign of the operand (including the
                // Math.abs(Integer.MIN_VALUE) corner case) does not affect the outcome.
                int hashValue = Math.abs(MD5Hash.digest(url).hashCode() ^ this.hashSkew);
                if (hashValue % this.subsetDenom != 0) {
                    return;
                }
                this.curURL = url;
            } else if (this.curURL != null && "d:Title".equals(qName)) {
                this.titlePending = true;
            } else if (this.curURL != null && "d:Description".equals(qName)) {
                this.descPending = true;
            }
        }

        /** Buffers character data while inside a selected page's title or description. */
        @Override
        public void characters(char[] ch, int start, int length) {
            if (this.titlePending) {
                this.title.append(ch, start, length);
            } else if (this.descPending) {
                this.desc.append(ch, start, length);
            }
        }

        /**
         * On the end of a selected {@code ExternalPage}, prints its URL, counts it and
         * resets the per-page state; on the end of a title/description, stops buffering.
         */
        @Override
        public void endElement(String namespaceURI, String localName, String qName) throws SAXException {
            if (this.curURL == null) {
                return;
            }
            if ("ExternalPage".equals(qName)) {
                System.out.println(this.curURL);
                ++DmozParser.this.pages;
                this.title.setLength(0);
                this.desc.setLength(0);
                this.curURL = null;
            } else if ("d:Title".equals(qName)) {
                this.titlePending = false;
            } else if ("d:Description".equals(qName)) {
                this.descPending = false;
            }
        }

        @Override
        public void startDocument() {
            LOG.info("Begin parse");
        }

        @Override
        public void endDocument() {
            LOG.info("Completed parse.  Found {} pages.", DmozParser.this.pages);
        }

        @Override
        public void setDocumentLocator(Locator locator) {
            this.location = locator;
        }

        /** Logs a recoverable parse error; parsing continues. */
        @Override
        public void error(SAXParseException spe) {
            LOG.error("Error: {}: {}", spe.toString(), spe.getMessage());
        }

        /** Logs a parser warning; parsing continues. */
        @Override
        public void warning(SAXParseException spe) {
            LOG.warn("Warning: {}: {}", spe.toString(), spe.getMessage());
        }
    }

    /**
     * Reader filter that replaces characters the XML parser would reject with
     * {@code 'X'}. It also replaces a {@code '<'} that directly follows U+FFFD unless
     * that {@code '<'} starts a closing tag ({@code "</"}).
     * NOTE(review): U+FFFD is presumably what the UTF-8 decoder emits for malformed
     * input, which may have swallowed real markup - confirm against the data.
     */
    private static class XMLCharFilter
    extends FilterReader {
        /** Character substituted for anything filtered out. */
        private static final char SUBSTITUTE = 'X';
        /** U+FFFD (decimal 65533). */
        private static final char REPLACEMENT_CHAR = '\uFFFD';

        /** True when the previously read character was U+FFFD. */
        private boolean lastBad = false;

        public XMLCharFilter(Reader reader) {
            super(reader);
        }

        /**
         * Reads one character, applying the filter. Uses mark/reset on the wrapped
         * reader to peek at the character after a suspicious {@code '<'}.
         *
         * @return the (possibly substituted) character, or -1 at end of stream
         */
        @Override
        public int read() throws IOException {
            int c = this.in.read();
            int value = c;
            if (c != -1 && !XMLChar.isValid(c)) {
                value = SUBSTITUTE;
            } else if (this.lastBad && c == '<') {
                // Peek one character ahead without consuming it.
                this.in.mark(1);
                if (this.in.read() != '/') {
                    value = SUBSTITUTE;
                }
                this.in.reset();
            }
            this.lastBad = c == REPLACEMENT_CHAR;
            return value;
        }

        /**
         * Bulk read that filters the characters in place. A {@code '<'} in the last
         * slot of the chunk is never substituted because the following character is
         * not available for look-ahead.
         *
         * @return the number of characters read, or -1 at end of stream
         */
        @Override
        public int read(char[] cbuf, int off, int len) throws IOException {
            int n = this.in.read(cbuf, off, len);
            for (int i = 0; i < n; ++i) {
                int idx = off + i;
                char c = cbuf[idx];
                char value = c;
                if (!XMLChar.isValid(c)) {
                    value = SUBSTITUTE;
                } else if (this.lastBad && c == '<' && i != n - 1 && cbuf[idx + 1] != '/') {
                    value = SUBSTITUTE;
                }
                this.lastBad = c == REPLACEMENT_CHAR;
                cbuf[idx] = value;
            }
            return n;
        }
    }
}

