1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.tika.parser.html;
18
19 import java.io.IOException;
20 import java.io.InputStream;
21 import java.util.HashMap;
22 import java.util.HashSet;
23 import java.util.Map;
24 import java.util.Set;
25
26 import org.apache.commons.io.input.CloseShieldInputStream;
27 import org.apache.tika.exception.TikaException;
28 import org.apache.tika.metadata.Metadata;
29 import org.apache.tika.parser.Parser;
30 import org.apache.tika.sax.TeeContentHandler;
31 import org.apache.tika.sax.TextContentHandler;
32 import org.apache.tika.sax.WriteOutContentHandler;
33 import org.apache.tika.sax.XHTMLContentHandler;
34 import org.apache.tika.sax.xpath.Matcher;
35 import org.apache.tika.sax.xpath.MatchingContentHandler;
36 import org.apache.tika.sax.xpath.XPathParser;
37 import org.apache.tika.utils.Utils;
38 import org.cyberneko.html.parsers.SAXParser;
39 import org.xml.sax.Attributes;
40 import org.xml.sax.ContentHandler;
41 import org.xml.sax.InputSource;
42 import org.xml.sax.SAXException;
43
44
45
46
47
48
49 public class HtmlParser implements Parser {
50
51
52
53
54
55 private static final Map<String, String> SAFE_ELEMENTS =
56 new HashMap<String, String>();
57
58
59
60
61 private static final Set<String> DISCARD_ELEMENTS = new HashSet<String>();
62
63 static {
64
65 SAFE_ELEMENTS.put("P", "p");
66 SAFE_ELEMENTS.put("H1", "h1");
67 SAFE_ELEMENTS.put("H2", "h2");
68 SAFE_ELEMENTS.put("H3", "h3");
69 SAFE_ELEMENTS.put("H4", "h4");
70 SAFE_ELEMENTS.put("H5", "h5");
71 SAFE_ELEMENTS.put("H6", "h6");
72 SAFE_ELEMENTS.put("UL", "ul");
73 SAFE_ELEMENTS.put("OL", "ol");
74 SAFE_ELEMENTS.put("LI", "li");
75 SAFE_ELEMENTS.put("DL", "dl");
76 SAFE_ELEMENTS.put("DT", "dt");
77 SAFE_ELEMENTS.put("DD", "dd");
78 SAFE_ELEMENTS.put("PRE", "pre");
79 SAFE_ELEMENTS.put("BLOCKQUOTE", "blockquote");
80 SAFE_ELEMENTS.put("TABLE", "p");
81
82 DISCARD_ELEMENTS.add("STYLE");
83 DISCARD_ELEMENTS.add("SCRIPT");
84 }
85
86 public void parse(
87 InputStream stream, ContentHandler handler, Metadata metadata)
88 throws IOException, SAXException, TikaException {
89
90 stream = new CloseShieldInputStream(stream);
91
92
93
94 XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
95 XPathParser xpath = new XPathParser(null, "");
96 Matcher body = xpath.parse("/HTML/BODY//node()");
97 Matcher title = xpath.parse("/HTML/HEAD/TITLE//node()");
98 handler = new TeeContentHandler(
99 new MatchingContentHandler(getBodyHandler(xhtml), body),
100 new MatchingContentHandler(getTitleHandler(metadata), title));
101
102
103 xhtml.startDocument();
104 SAXParser parser = new SAXParser();
105 parser.setContentHandler(new XHTMLDowngradeHandler(handler));
106 parser.parse(new InputSource(Utils.getUTF8Reader(stream, metadata)));
107 xhtml.endDocument();
108 }
109
110 private ContentHandler getTitleHandler(final Metadata metadata) {
111 return new WriteOutContentHandler() {
112 @Override
113 public void endElement(String u, String l, String n) {
114 metadata.set(Metadata.TITLE, toString());
115 }
116 };
117 }
118
119 private ContentHandler getBodyHandler(final XHTMLContentHandler xhtml) {
120 return new TextContentHandler(xhtml) {
121
122 private int discardLevel = 0;
123
124 @Override
125 public void startElement(
126 String uri, String local, String name, Attributes atts)
127 throws SAXException {
128 if (discardLevel != 0) {
129 discardLevel++;
130 } else if (DISCARD_ELEMENTS.contains(name)) {
131 discardLevel = 1;
132 } else if (SAFE_ELEMENTS.containsKey(name)) {
133 xhtml.startElement(SAFE_ELEMENTS.get(name));
134 } else if ("A".equals(name)) {
135 String href = atts.getValue("href");
136 if (href == null) {
137 href = "";
138 }
139 xhtml.startElement("a", "href", href);
140 }
141 }
142
143 @Override
144 public void endElement(
145 String uri, String local, String name) throws SAXException {
146 if (discardLevel != 0) {
147 discardLevel--;
148 } else if (SAFE_ELEMENTS.containsKey(name)) {
149 xhtml.endElement(SAFE_ELEMENTS.get(name));
150 } else if ("A".equals(name)) {
151 xhtml.endElement("a");
152 }
153 }
154
155 @Override
156 public void characters(char[] ch, int start, int length)
157 throws SAXException {
158 if (discardLevel == 0) {
159 super.characters(ch, start, length);
160 }
161 }
162
163 @Override
164 public void ignorableWhitespace(char[] ch, int start, int length)
165 throws SAXException {
166 if (discardLevel == 0) {
167 super.ignorableWhitespace(ch, start, length);
168 }
169 }
170
171 };
172 }
173
174 }