1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.tika.mime;
18
19
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.w3c.dom.Attr;
import org.w3c.dom.Node;
import org.w3c.dom.Element;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
import org.w3c.dom.NamedNodeMap;
import org.xml.sax.InputSource;

import java.io.IOException;
import java.io.InputStream;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91 final class MimeTypesReader {
92
93
94 private Log logger = null;
95
96 private final MimeTypes types;
97
98 MimeTypesReader(MimeTypes types) {
99 this(types, null);
100 }
101
102 MimeTypesReader(MimeTypes types, Log logger) {
103 this.types = types;
104 if (logger == null) {
105 this.logger = LogFactory.getLog(this.getClass());
106 } else {
107 this.logger = logger;
108 }
109 }
110
111 void read(String filepath) {
112 read(MimeTypesReader.class.getClassLoader().getResourceAsStream(filepath));
113 }
114
115 void read(InputStream stream) {
116 try {
117 DocumentBuilderFactory factory = DocumentBuilderFactory
118 .newInstance();
119 DocumentBuilder builder = factory.newDocumentBuilder();
120 Document document = builder.parse(new InputSource(stream));
121 read(document);
122 } catch (Exception e) {
123 if (logger.isWarnEnabled()) {
124 logger.warn(e.toString() + " while loading mime-types");
125 }
126 }
127 }
128
129 void read(Document document) {
130 Element element = document.getDocumentElement();
131 if (element != null && element.getTagName().equals("mime-info")) {
132 NodeList nodes = element.getChildNodes();
133 for (int i = 0; i < nodes.getLength(); i++) {
134 Node node = nodes.item(i);
135 if (node.getNodeType() == Node.ELEMENT_NODE) {
136 Element child = (Element) node;
137 if (child.getTagName().equals("mime-type")) {
138 readMimeType(child);
139 }
140 }
141 }
142 } else {
143 logger.warn("Not a <mime-info/> configuration document");
144 }
145 }
146
147
148 private void readMimeType(Element element) {
149 String name = element.getAttribute("type");
150 try {
151 MimeType type = types.forName(name);
152
153 NodeList nodes = element.getChildNodes();
154 for (int i = 0; i < nodes.getLength(); i++) {
155 Node node = nodes.item(i);
156 if (node.getNodeType() == Node.ELEMENT_NODE) {
157 Element nodeElement = (Element) node;
158 if (nodeElement.getTagName().equals("_comment")) {
159 type.setDescription(
160 nodeElement.getFirstChild().getNodeValue());
161 } else if (nodeElement.getTagName().equals("glob")) {
162 types.addPattern(type, nodeElement.getAttribute("pattern"));
163 } else if (nodeElement.getTagName().equals("magic")) {
164 readMagic(nodeElement, type);
165 } else if (nodeElement.getTagName().equals("alias")) {
166 String alias = nodeElement.getAttribute("type");
167 try {
168 type.addAlias(alias);
169 } catch (MimeTypeException e) {
170 logger.warn("Invalid media type alias: " + alias, e);
171 }
172 } else if (nodeElement.getTagName().equals("root-XML")) {
173 readRootXML(nodeElement, type);
174 } else if (nodeElement.getTagName().equals("sub-class-of")) {
175 String parent = nodeElement.getAttribute("type");
176 try {
177 type.setSuperType(types.forName(parent));
178 } catch (MimeTypeException e) {
179 logger.warn("Invalid parent type: " + parent, e);
180 }
181 }
182 }
183 }
184
185 types.add(type);
186 } catch (MimeTypeException e) {
187 logger.warn("Invalid media type configuration entry: " + name, e);
188 }
189 }
190
191
192 private void readMagic(Element element, MimeType mimeType) {
193 Magic magic = null;
194 try {
195 magic = new Magic(Integer
196 .parseInt(element.getAttribute("priority")));
197 } catch (Exception e) {
198 magic = new Magic();
199 }
200 magic.setType(mimeType);
201 magic.setClause(readMatches(element));
202 mimeType.addMagic(magic);
203 }
204
205 private Clause readMatches(Element element) {
206 Clause sub = null;
207 Clause prev = Clause.FALSE;
208 Clause clause = null;
209 NodeList nodes = element.getChildNodes();
210 for (int i = 0; i < nodes.getLength(); i++) {
211 Node node = nodes.item(i);
212 if (node.getNodeType() == Node.ELEMENT_NODE) {
213 Element nodeElement = (Element) node;
214 if (nodeElement.getTagName().equals("match")) {
215 sub = readMatches(nodeElement);
216 try {
217 if (sub != null) {
218 clause = new MagicClause(Operator.AND,
219 readMatch(nodeElement), sub);
220 } else {
221 clause = readMatch(nodeElement);
222 }
223 clause = new MagicClause(Operator.OR, prev, clause);
224 prev = clause;
225 } catch (MimeTypeException mte) {
226 logger.warn(mte + " while reading magic-match ["
227 + nodeElement + "], Ignoring!");
228 }
229 }
230 }
231 }
232 return clause;
233 }
234
235
236 private MagicMatch readMatch(Element element) throws MimeTypeException {
237
238 String offset = null;
239 String value = null;
240 String mask = null;
241 String type = null;
242
243 NamedNodeMap attrs = element.getAttributes();
244 for (int i = 0; i < attrs.getLength(); i++) {
245 Attr attr = (Attr) attrs.item(i);
246 if (attr.getName().equals("offset")) {
247 offset = attr.getValue();
248 } else if (attr.getName().equals("type")) {
249 type = attr.getValue();
250 } else if (attr.getName().equals("value")) {
251 value = attr.getValue();
252 } else if (attr.getName().equals("mask")) {
253 mask = attr.getValue();
254 }
255 }
256
257 String[] offsets = offset.split(":");
258 int offStart = 0;
259 int offEnd = 0;
260 try {
261 offStart = Integer.parseInt(offsets[0]);
262 } catch (Exception e) {
263
264 }
265 try {
266 offEnd = Integer.parseInt(offsets[1]);
267 } catch (Exception e) {
268
269 }
270 offEnd = Math.max(offStart, offEnd);
271
272 return new MagicMatch(offStart, offEnd, type, mask, value);
273 }
274
275
276 private void readRootXML(Element element, MimeType mimeType) {
277 mimeType.addRootXML(element.getAttribute("namespaceURI"), element
278 .getAttribute("localName"));
279 }
280
281 }