View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.mime;
18  
19  // Commons Logging imports
20  import org.apache.commons.logging.Log;
21  import org.apache.commons.logging.LogFactory;
22  
23  // DOM imports
24  import org.w3c.dom.Attr;
25  import org.w3c.dom.Node;
26  import org.w3c.dom.Element;
27  import org.w3c.dom.Document;
28  import org.w3c.dom.NodeList;
29  import org.w3c.dom.NamedNodeMap;
30  import org.xml.sax.InputSource;
31  
32  // JDK imports
33  import java.io.InputStream;
34  import javax.xml.parsers.DocumentBuilder;
35  import javax.xml.parsers.DocumentBuilderFactory;
36  
37  /**
38   * A reader for XML files compliant with the freedesktop MIME-info DTD.
39   * 
40   * <pre>
41   *  &lt;!DOCTYPE mime-info [
42   *    &lt;!ELEMENT mime-info (mime-type)+&gt;
43   *    &lt;!ATTLIST mime-info xmlns CDATA #FIXED &quot;http://www.freedesktop.org/standards/shared-mime-info&quot;&gt;
44   * 
45   *    &lt;!ELEMENT mime-type (comment|acronym|expanded-acronym|glob|magic|root-XML|alias|sub-class-of)*&gt;
46   *    &lt;!ATTLIST mime-type type CDATA #REQUIRED&gt;
47   * 
48   *    &lt;!-- a comment describing a document with the respective MIME type. Example: &quot;WMV video&quot; --&gt;
49   *    &lt;!ELEMENT comment (#PCDATA)&gt;
50   *    &lt;!ATTLIST comment xml:lang CDATA #IMPLIED&gt;
51   * 
52   *    &lt;!-- a comment describing the respective unexpanded MIME type acronym. Example: &quot;WMV&quot; --&gt;
53   *    &lt;!ELEMENT acronym (#PCDATA)&gt;
54   *    &lt;!ATTLIST acronym xml:lang CDATA #IMPLIED&gt;
55   * 
56   *    &lt;!-- a comment describing the respective expanded MIME type acronym. Example: &quot;Windows Media Video&quot; --&gt;
57   *    &lt;!ELEMENT expanded-acronym (#PCDATA)&gt;
58   *    &lt;!ATTLIST expanded-acronym xml:lang CDATA #IMPLIED&gt;
59   * 
60   *    &lt;!ELEMENT glob EMPTY&gt;
61   *    &lt;!ATTLIST glob pattern CDATA #REQUIRED&gt;
62   * 
63   *    &lt;!ELEMENT magic (match)+&gt;
64   *    &lt;!ATTLIST magic priority CDATA #IMPLIED&gt;
65   * 
66   *    &lt;!ELEMENT match (match)*&gt;
67   *    &lt;!ATTLIST match offset CDATA #REQUIRED&gt;
68   *    &lt;!ATTLIST match type (string|big16|big32|little16|little32|host16|host32|byte) #REQUIRED&gt;
69   *    &lt;!ATTLIST match value CDATA #REQUIRED&gt;
70   *    &lt;!ATTLIST match mask CDATA #IMPLIED&gt;
71   * 
72   *    &lt;!ELEMENT root-XML EMPTY&gt;
73   *    &lt;!ATTLIST root-XML
74   *          namespaceURI CDATA #REQUIRED
75   *          localName CDATA #REQUIRED&gt;
76   * 
77   *    &lt;!ELEMENT alias EMPTY&gt;
78   *    &lt;!ATTLIST alias
79   *          type CDATA #REQUIRED&gt;
80   * 
81   *   &lt;!ELEMENT sub-class-of EMPTY&gt;
82   *   &lt;!ATTLIST sub-class-of
83   *         type CDATA #REQUIRED&gt;
84   *  ]&gt;
85   * </pre>
86   * 
87   * 
88   * @see <a href="http://freedesktop.org/wiki/Standards_2fshared_2dmime_2dinfo_2dspec">freedesktop.org shared MIME-info specification</a>
89   * 
90   */
91  final class MimeTypesReader {
92  
93      /** The logger to use */
94      private Log logger = null;
95  
96      private final MimeTypes types;
97  
98      MimeTypesReader(MimeTypes types) {
99          this(types, null);
100     }
101 
102     MimeTypesReader(MimeTypes types, Log logger) {
103         this.types = types;
104         if (logger == null) {
105             this.logger = LogFactory.getLog(this.getClass());
106         } else {
107             this.logger = logger;
108         }
109     }
110 
111     void read(String filepath) {
112         read(MimeTypesReader.class.getClassLoader().getResourceAsStream(filepath));
113     }
114 
115     void read(InputStream stream) {
116         try {
117             DocumentBuilderFactory factory = DocumentBuilderFactory
118                     .newInstance();
119             DocumentBuilder builder = factory.newDocumentBuilder();
120             Document document = builder.parse(new InputSource(stream));
121             read(document);
122         } catch (Exception e) {
123             if (logger.isWarnEnabled()) {
124                 logger.warn(e.toString() + " while loading mime-types");
125             }
126         }
127     }
128 
129     void read(Document document) {
130         Element element = document.getDocumentElement();
131         if (element != null && element.getTagName().equals("mime-info")) {
132             NodeList nodes = element.getChildNodes();
133             for (int i = 0; i < nodes.getLength(); i++) {
134                 Node node = nodes.item(i);
135                 if (node.getNodeType() == Node.ELEMENT_NODE) {
136                     Element child = (Element) node;
137                     if (child.getTagName().equals("mime-type")) {
138                         readMimeType(child);
139                     }
140                 }
141             }
142         } else {
143             logger.warn("Not a <mime-info/> configuration document");
144         }
145     }
146 
147     /** Read Element named mime-type. */
148     private void readMimeType(Element element) {
149         String name = element.getAttribute("type");
150         try {
151             MimeType type = types.forName(name);
152 
153             NodeList nodes = element.getChildNodes();
154             for (int i = 0; i < nodes.getLength(); i++) {
155                 Node node = nodes.item(i);
156                 if (node.getNodeType() == Node.ELEMENT_NODE) {
157                     Element nodeElement = (Element) node;
158                     if (nodeElement.getTagName().equals("_comment")) {
159                         type.setDescription(
160                                 nodeElement.getFirstChild().getNodeValue());
161                     } else if (nodeElement.getTagName().equals("glob")) {
162                         types.addPattern(type, nodeElement.getAttribute("pattern"));
163                     } else if (nodeElement.getTagName().equals("magic")) {
164                         readMagic(nodeElement, type);
165                     } else if (nodeElement.getTagName().equals("alias")) {
166                         String alias = nodeElement.getAttribute("type");
167                         try {
168                             type.addAlias(alias);
169                         } catch (MimeTypeException e) {
170                             logger.warn("Invalid media type alias: " + alias, e);
171                         }
172                     } else if (nodeElement.getTagName().equals("root-XML")) {
173                         readRootXML(nodeElement, type);
174                     } else if (nodeElement.getTagName().equals("sub-class-of")) {
175                         String parent = nodeElement.getAttribute("type");
176                         try {
177                             type.setSuperType(types.forName(parent));
178                         } catch (MimeTypeException e) {
179                             logger.warn("Invalid parent type: " + parent, e);
180                         }
181                     }
182                 }
183             }
184 
185             types.add(type);
186         } catch (MimeTypeException e) {
187             logger.warn("Invalid media type configuration entry: " + name, e);
188         }
189     }
190 
191     /** Read Element named magic. */
192     private void readMagic(Element element, MimeType mimeType) {
193         Magic magic = null;
194         try {
195             magic = new Magic(Integer
196                     .parseInt(element.getAttribute("priority")));
197         } catch (Exception e) {
198             magic = new Magic();
199         }
200         magic.setType(mimeType);
201         magic.setClause(readMatches(element));
202         mimeType.addMagic(magic);
203     }
204 
205     private Clause readMatches(Element element) {
206         Clause sub = null;
207         Clause prev = Clause.FALSE;
208         Clause clause = null;
209         NodeList nodes = element.getChildNodes();
210         for (int i = 0; i < nodes.getLength(); i++) {
211             Node node = nodes.item(i);
212             if (node.getNodeType() == Node.ELEMENT_NODE) {
213                 Element nodeElement = (Element) node;
214                 if (nodeElement.getTagName().equals("match")) {
215                     sub = readMatches(nodeElement);
216                     try {
217                         if (sub != null) {
218                             clause = new MagicClause(Operator.AND,
219                                     readMatch(nodeElement), sub);
220                         } else {
221                             clause = readMatch(nodeElement);
222                         }
223                         clause = new MagicClause(Operator.OR, prev, clause);
224                         prev = clause;
225                     } catch (MimeTypeException mte) {
226                         logger.warn(mte + " while reading magic-match ["
227                                 + nodeElement + "], Ignoring!");
228                     }
229                 }
230             }
231         }
232         return clause;
233     }
234 
235     /** Read Element named match. */
236     private MagicMatch readMatch(Element element) throws MimeTypeException {
237 
238         String offset = null;
239         String value = null;
240         String mask = null;
241         String type = null;
242 
243         NamedNodeMap attrs = element.getAttributes();
244         for (int i = 0; i < attrs.getLength(); i++) {
245             Attr attr = (Attr) attrs.item(i);
246             if (attr.getName().equals("offset")) {
247                 offset = attr.getValue();
248             } else if (attr.getName().equals("type")) {
249                 type = attr.getValue();
250             } else if (attr.getName().equals("value")) {
251                 value = attr.getValue();
252             } else if (attr.getName().equals("mask")) {
253                 mask = attr.getValue();
254             }
255         }
256         // Parse OffSet
257         String[] offsets = offset.split(":");
258         int offStart = 0;
259         int offEnd = 0;
260         try {
261             offStart = Integer.parseInt(offsets[0]);
262         } catch (Exception e) {
263             // WARN log + avoid loading
264         }
265         try {
266             offEnd = Integer.parseInt(offsets[1]);
267         } catch (Exception e) {
268             // WARN log
269         }
270         offEnd = Math.max(offStart, offEnd);
271 
272         return new MagicMatch(offStart, offEnd, type, mask, value);
273     }
274 
275     /** Read Element named root-XML. */
276     private void readRootXML(Element element, MimeType mimeType) {
277         mimeType.addRootXML(element.getAttribute("namespaceURI"), element
278                 .getAttribute("localName"));
279     }
280 
281 }