/*
 * Decompiled with CFR 0.152.
 */
package org.apache.nutch.protocol.http.api;

import crawlercommons.robots.BaseRobotRules;
import java.lang.invoke.MethodHandles;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.RobotRulesParser;
import org.apache.nutch.protocol.http.api.HttpBase;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class HttpRobotRulesParser
extends RobotRulesParser {
    private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
    protected boolean allowForbidden = false;
    protected boolean deferVisits503 = false;

    HttpRobotRulesParser() {
    }

    public HttpRobotRulesParser(Configuration conf) {
        this.setConf(conf);
    }

    public void setConf(Configuration conf) {
        super.setConf(conf);
        this.allowForbidden = conf.getBoolean("http.robots.403.allow", true);
        this.deferVisits503 = conf.getBoolean("http.robots.503.defer.visits", true);
    }

    protected static String getCacheKey(URL url) {
        String protocol = url.getProtocol().toLowerCase();
        String host = url.getHost().toLowerCase();
        int port = url.getPort();
        if (port == -1) {
            port = url.getDefaultPort();
        }
        String cacheKey = protocol + ":" + host + ":" + port;
        return cacheKey;
    }

    public BaseRobotRules getRobotRulesSet(Protocol http, URL url, List<Content> robotsTxtContent) {
        String cacheKey;
        BaseRobotRules robotRules;
        if (LOG.isTraceEnabled() && this.isAllowListed(url)) {
            LOG.trace("Ignoring robots.txt (host is allowlisted) for URL: {}", (Object)url);
        }
        if ((robotRules = (BaseRobotRules)CACHE.get(cacheKey = HttpRobotRulesParser.getCacheKey(url))) != null) {
            return robotRules;
        }
        if (LOG.isTraceEnabled()) {
            LOG.trace("Robots.txt cache miss {}", (Object)url);
        }
        boolean cacheRule = true;
        HashSet<String> redirectCacheKeys = new HashSet<String>();
        if (this.isAllowListed(url)) {
            robotRules = EMPTY_RULES;
            LOG.info("Allowlisted host found for: {}", (Object)url);
            LOG.info("Ignoring robots.txt for all URLs from allowlisted host: {}", (Object)url.getHost());
        } else {
            URL robotsUrl = null;
            URL robotsUrlRedir = null;
            try {
                robotsUrl = new URL(url, "/robots.txt");
                int numRedirects = 0;
                robotsUrlRedir = robotsUrl;
                Response response = ((HttpBase)http).getResponse(robotsUrl, new CrawlDatum(), true);
                int code = response.getCode();
                if (robotsTxtContent != null) {
                    this.addRobotsContent(robotsTxtContent, robotsUrl, response);
                }
                while (this.isRedirect(code) && numRedirects < this.maxNumRedirects) {
                    ++numRedirects;
                    String redirectionLocation = response.getHeader("Location");
                    if (StringUtils.isNotBlank((String)redirectionLocation)) {
                        LOG.debug("Following robots.txt redirect: {} -> {}", (Object)robotsUrlRedir, (Object)redirectionLocation);
                        try {
                            robotsUrlRedir = new URL(robotsUrlRedir, redirectionLocation);
                        }
                        catch (MalformedURLException e) {
                            LOG.info("Failed to resolve redirect location for robots.txt: {} -> {} ({})", new Object[]{robotsUrlRedir, redirectionLocation, e.getMessage()});
                            break;
                        }
                        response = ((HttpBase)http).getResponse(robotsUrlRedir, new CrawlDatum(), true);
                        code = response.getCode();
                        if (robotsTxtContent != null) {
                            this.addRobotsContent(robotsTxtContent, robotsUrlRedir, response);
                        }
                    } else {
                        LOG.info("No HTTP redirect Location header for robots.txt: {} (status code: {})", (Object)robotsUrlRedir, (Object)code);
                        break;
                    }
                    if ("/robots.txt".equals(robotsUrlRedir.getFile())) {
                        String redirectCacheKey = HttpRobotRulesParser.getCacheKey(robotsUrlRedir);
                        robotRules = (BaseRobotRules)CACHE.get(redirectCacheKey);
                        LOG.debug("Found cached robots.txt rules for {} (redirected to {}) under target key {}", new Object[]{url, robotsUrlRedir, redirectCacheKey});
                        if (robotRules != null) {
                            CACHE.put(cacheKey, robotRules);
                            return robotRules;
                        }
                        redirectCacheKeys.add(redirectCacheKey);
                    }
                    if (numRedirects != this.maxNumRedirects || !this.isRedirect(code)) continue;
                    LOG.info("Reached maximum number of robots.txt redirects for {} (assuming no robots.txt, allow all)", (Object)url);
                }
                LOG.debug("Fetched robots.txt for {} with status code {}", (Object)url, (Object)code);
                if (code == 200) {
                    robotRules = this.parseRules(url.toString(), response.getContent(), response.getHeader("Content-Type"), this.agentNames);
                } else if (code == 403 && !this.allowForbidden) {
                    robotRules = FORBID_ALL_RULES;
                } else if (code >= 500 || code == 429) {
                    cacheRule = false;
                    robotRules = this.deferVisits503 ? DEFER_VISIT_RULES : EMPTY_RULES;
                } else {
                    robotRules = EMPTY_RULES;
                }
            }
            catch (Throwable t) {
                if (robotsUrl == null || robotsUrlRedir == null) {
                    LOG.info("Couldn't get robots.txt for {}", (Object)url, (Object)t);
                } else if (robotsUrl.equals(robotsUrlRedir)) {
                    LOG.info("Couldn't get robots.txt for {} ({}): {}", new Object[]{url, robotsUrl, t});
                } else {
                    LOG.info("Couldn't get redirected robots.txt for {} (redirected to {}): {}", new Object[]{url, robotsUrlRedir, t});
                }
                cacheRule = false;
                robotRules = EMPTY_RULES;
            }
        }
        if (cacheRule) {
            CACHE.put(cacheKey, robotRules);
            for (String redirectCacheKey : redirectCacheKeys) {
                CACHE.put(redirectCacheKey, robotRules);
            }
        }
        return robotRules;
    }

    private boolean isRedirect(int code) {
        return code == 301 || code == 302 || code == 303 || code == 307 || code == 308;
    }

    protected void addRobotsContent(List<Content> robotsTxtContent, URL robotsUrl, Response robotsResponse) {
        byte[] robotsBytes = robotsResponse.getContent();
        if (robotsBytes == null) {
            robotsBytes = new byte[]{};
        }
        Content content = new Content(robotsUrl.toString(), robotsUrl.toString(), robotsBytes, robotsResponse.getHeader("Content-Type"), robotsResponse.getHeaders(), this.getConf());
        robotsTxtContent.add(content);
    }
}

