/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.nutch.parse.tika;

import java.net.URL;
import java.util.Locale;

import org.apache.nutch.parse.HTMLMetaTags;
import org.w3c.dom.*;

/**
 * Class for parsing META Directives from DOM trees. This class handles
 * specifically Robots META directives (all, none, nofollow, noindex), finding
 * BASE HREF tags, and HTTP-EQUIV no-cache instructions. All meta directives are
 * stored in a HTMLMetaTags instance.
 */
public class HTMLMetaProcessor {

  /**
   * Utility class with indicators for the robots directives "noindex" and
   * "nofollow", and HTTP-EQUIV/no-cache
   */

  /**
   * Sets the indicators in <code>robotsMeta</code> to appropriate values, based
   * on any META tags found under the given <code>node</code>.
   */
  public static final void getMetaTags(HTMLMetaTags metaTags, Node node,
      URL currURL) {

    metaTags.reset();
    getMetaTagsHelper(metaTags, node, currURL);
  }

  private static final void getMetaTagsHelper(HTMLMetaTags metaTags, Node node,
      URL currURL) {

    if (node.getNodeType() == Node.ELEMENT_NODE) {

      if ("body".equalsIgnoreCase(node.getNodeName())) {
        // META tags should not be under body
        return;
      }

      if ("meta".equalsIgnoreCase(node.getNodeName())) {
        NamedNodeMap attrs = node.getAttributes();
        Node nameNode = null;
        Node equivNode = null;
        Node contentNode = null;
        // Retrieves name, http-equiv and content attribues
        for (int i = 0; i < attrs.getLength(); i++) {
          Node attr = attrs.item(i);
          String attrName = attr.getNodeName().toLowerCase(Locale.ROOT);
          if (attrName.equals("name")) {
            nameNode = attr;
          } else if (attrName.equals("http-equiv")) {
            equivNode = attr;
          } else if (attrName.equals("content")) {
            contentNode = attr;
          }
        }

        if (nameNode != null) {
          if (contentNode != null) {
            String name = nameNode.getNodeValue().toLowerCase(Locale.ROOT);
            metaTags.getGeneralTags().add(name, contentNode.getNodeValue());
            if ("robots".equals(name)) {

              if (contentNode != null) {
                String directives = contentNode.getNodeValue().toLowerCase(Locale.ROOT);
                int index = directives.indexOf("none");

                if (index >= 0) {
                  metaTags.setNoIndex();
                  metaTags.setNoFollow();
                }

                index = directives.indexOf("all");
                if (index >= 0) {
                  // do nothing...
                }

                index = directives.indexOf("noindex");
                if (index >= 0) {
                  metaTags.setNoIndex();
                }

                index = directives.indexOf("nofollow");
                if (index >= 0) {
                  metaTags.setNoFollow();
                }

                index = directives.indexOf("noarchive");
                if (index >= 0) {
                  metaTags.setNoCache();
                }
              }

            } // end if (name == robots)
          }
        }

        if (equivNode != null) {
          if (contentNode != null) {
            String name = equivNode.getNodeValue().toLowerCase(Locale.ROOT);
            String content = contentNode.getNodeValue();
            metaTags.getHttpEquivTags().setProperty(name, content);
            if ("pragma".equals(name)) {
              content = content.toLowerCase(Locale.ROOT);
              int index = content.indexOf("no-cache");
              if (index >= 0)
                metaTags.setNoCache();
            } else if ("refresh".equals(name)) {
              int idx = content.indexOf(';');
              String time = null;
              if (idx == -1) { // just the refresh time
                time = content;
              } else
                time = content.substring(0, idx);
              try {
                metaTags.setRefreshTime(Integer.parseInt(time));
                // skip this if we couldn't parse the time
                metaTags.setRefresh(true);
              } catch (Exception e) {
                ;
              }
              URL refreshUrl = null;
              if (metaTags.getRefresh() && idx != -1) { // set the URL
                idx = content.toLowerCase(Locale.ROOT).indexOf("url=");
                if (idx == -1) { // assume a mis-formatted entry with just the
                                 // url
                  idx = content.indexOf(';') + 1;
                } else
                  idx += 4;
                if (idx != -1) {
                  String url = content.substring(idx);
                  try {
                    refreshUrl = new URL(url);
                  } catch (Exception e) {
                    // XXX according to the spec, this has to be an absolute
                    // XXX url. However, many websites use relative URLs and
                    // XXX expect browsers to handle that.
                    // XXX Unfortunately, in some cases this may create a
                    // XXX infinitely recursive paths (a crawler trap)...
                    // if (!url.startsWith("/")) url = "/" + url;
                    try {
                      refreshUrl = new URL(currURL, url);
                    } catch (Exception e1) {
                      refreshUrl = null;
                    }
                  }
                }
              }
              if (metaTags.getRefresh()) {
                if (refreshUrl == null) {
                  // apparently only refresh time was present. set the URL
                  // to the same URL.
                  refreshUrl = currURL;
                }
                metaTags.setRefreshHref(refreshUrl);
              }
            }
          }
        }

      } else if ("base".equalsIgnoreCase(node.getNodeName())) {
        NamedNodeMap attrs = node.getAttributes();
        Node hrefNode = attrs.getNamedItem("href");

        if (hrefNode != null) {
          String urlString = hrefNode.getNodeValue();

          URL url = null;
          try {
            if (currURL == null)
              url = new URL(urlString);
            else
              url = new URL(currURL, urlString);
          } catch (Exception e) {
            ;
          }

          if (url != null)
            metaTags.setBaseHref(url);
        }

      }

    }

    NodeList children = node.getChildNodes();
    if (children != null) {
      int len = children.getLength();
      for (int i = 0; i < len; i++) {
        getMetaTagsHelper(metaTags, children.item(i), currURL);
      }
    }
  }

}
