View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.parser.html;
18  
19  import java.io.IOException;
20  import java.io.InputStream;
21  import java.util.HashMap;
22  import java.util.HashSet;
23  import java.util.Map;
24  import java.util.Set;
25  
26  import org.apache.commons.io.input.CloseShieldInputStream;
27  import org.apache.tika.exception.TikaException;
28  import org.apache.tika.metadata.Metadata;
29  import org.apache.tika.parser.Parser;
30  import org.apache.tika.sax.TeeContentHandler;
31  import org.apache.tika.sax.TextContentHandler;
32  import org.apache.tika.sax.WriteOutContentHandler;
33  import org.apache.tika.sax.XHTMLContentHandler;
34  import org.apache.tika.sax.xpath.Matcher;
35  import org.apache.tika.sax.xpath.MatchingContentHandler;
36  import org.apache.tika.sax.xpath.XPathParser;
37  import org.apache.tika.utils.Utils;
38  import org.cyberneko.html.parsers.SAXParser;
39  import org.xml.sax.Attributes;
40  import org.xml.sax.ContentHandler;
41  import org.xml.sax.InputSource;
42  import org.xml.sax.SAXException;
43  
44  /**
45   * HTML parser. Uses CyberNeko to turn the input document to HTML SAX events,
46   * and post-processes the events to produce XHTML and metadata expected by
47   * Tika clients.
48   */
49  public class HtmlParser implements Parser {
50  
51      /**
52       * Set of safe mappings from incoming HTML elements to outgoing
53       * XHTML elements. Ensures that the output is valid XHTML 1.0 Strict.
54       */
55      private static final Map<String, String> SAFE_ELEMENTS =
56          new HashMap<String, String>();
57  
58      /**
59       * Set of HTML elements whose content will be discarded.
60       */
61      private static final Set<String> DISCARD_ELEMENTS = new HashSet<String>();
62  
63      static {
64          // Based on http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd
65          SAFE_ELEMENTS.put("P", "p");
66          SAFE_ELEMENTS.put("H1", "h1");
67          SAFE_ELEMENTS.put("H2", "h2");
68          SAFE_ELEMENTS.put("H3", "h3");
69          SAFE_ELEMENTS.put("H4", "h4");
70          SAFE_ELEMENTS.put("H5", "h5");
71          SAFE_ELEMENTS.put("H6", "h6");
72          SAFE_ELEMENTS.put("UL", "ul");
73          SAFE_ELEMENTS.put("OL", "ol");
74          SAFE_ELEMENTS.put("LI", "li");
75          SAFE_ELEMENTS.put("DL", "dl");
76          SAFE_ELEMENTS.put("DT", "dt");
77          SAFE_ELEMENTS.put("DD", "dd");
78          SAFE_ELEMENTS.put("PRE", "pre");
79          SAFE_ELEMENTS.put("BLOCKQUOTE", "blockquote");
80          SAFE_ELEMENTS.put("TABLE", "p"); // TODO colspan/rowspan issues
81  
82          DISCARD_ELEMENTS.add("STYLE");
83          DISCARD_ELEMENTS.add("SCRIPT");
84      }
85  
86      public void parse(
87              InputStream stream, ContentHandler handler, Metadata metadata)
88              throws IOException, SAXException, TikaException {
89          // Protect the stream from being closed by CyberNeko
90          stream = new CloseShieldInputStream(stream);
91  
92          // Prepare the HTML content handler that generates proper
93          // XHTML events to records relevant document metadata
94          XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
95          XPathParser xpath = new XPathParser(null, "");
96          Matcher body = xpath.parse("/HTML/BODY//node()");
97          Matcher title = xpath.parse("/HTML/HEAD/TITLE//node()");
98          handler = new TeeContentHandler(
99                  new MatchingContentHandler(getBodyHandler(xhtml), body),
100                 new MatchingContentHandler(getTitleHandler(metadata), title));
101 
102         // Parse the HTML document
103         xhtml.startDocument();
104         SAXParser parser = new SAXParser();
105         parser.setContentHandler(new XHTMLDowngradeHandler(handler));
106         parser.parse(new InputSource(Utils.getUTF8Reader(stream, metadata)));
107         xhtml.endDocument();
108     }
109 
110     private ContentHandler getTitleHandler(final Metadata metadata) {
111         return new WriteOutContentHandler() {
112             @Override
113             public void endElement(String u, String l, String n) {
114                 metadata.set(Metadata.TITLE, toString());
115             }
116         };
117     }
118 
119     private ContentHandler getBodyHandler(final XHTMLContentHandler xhtml) {
120         return new TextContentHandler(xhtml) {
121 
122             private int discardLevel = 0;
123 
124             @Override
125             public void startElement(
126                     String uri, String local, String name, Attributes atts)
127                     throws SAXException {
128                 if (discardLevel != 0) {
129                     discardLevel++;
130                 } else if (DISCARD_ELEMENTS.contains(name)) {
131                     discardLevel = 1;
132                 } else if (SAFE_ELEMENTS.containsKey(name)) {
133                     xhtml.startElement(SAFE_ELEMENTS.get(name));
134                 } else if ("A".equals(name)) {
135                     String href = atts.getValue("href");
136                     if (href == null) {
137                         href = "";
138                     }
139                     xhtml.startElement("a", "href", href);
140                 }
141             }
142 
143             @Override
144             public void endElement(
145                     String uri, String local, String name) throws SAXException {
146                 if (discardLevel != 0) {
147                     discardLevel--;
148                 } else if (SAFE_ELEMENTS.containsKey(name)) {
149                     xhtml.endElement(SAFE_ELEMENTS.get(name));
150                 } else if ("A".equals(name)) {
151                     xhtml.endElement("a");
152                 }
153             }
154 
155             @Override
156             public void characters(char[] ch, int start, int length)
157                     throws SAXException {
158                 if (discardLevel == 0) {
159                     super.characters(ch, start, length);
160                 }
161             }
162 
163             @Override
164             public void ignorableWhitespace(char[] ch, int start, int length)
165                     throws SAXException {
166                 if (discardLevel == 0) {
167                     super.ignorableWhitespace(ch, start, length);
168                 }
169             }
170 
171         };
172     }
173 
174 }