View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.config;
18  
19  import java.io.File;
20  import java.io.IOException;
21  import java.io.InputStream;
22  import java.net.URL;
23  import java.util.HashMap;
24  import java.util.Map;
25  
26  import javax.xml.parsers.DocumentBuilder;
27  import javax.xml.parsers.DocumentBuilderFactory;
28  import javax.xml.parsers.ParserConfigurationException;
29  
30  import org.apache.tika.exception.TikaException;
31  import org.apache.tika.mime.MimeTypes;
32  import org.apache.tika.mime.MimeTypesFactory;
33  import org.apache.tika.parser.Parser;
34  import org.w3c.dom.Document;
35  import org.w3c.dom.Element;
36  import org.w3c.dom.Node;
37  import org.w3c.dom.NodeList;
38  import org.xml.sax.SAXException;
39  
40  /**
41   * Parse xml config file.
42   */
43  public class TikaConfig {
44  
45      public static final String DEFAULT_CONFIG_LOCATION = 
46          "/org/apache/tika/tika-config.xml";
47  
48      private final Map<String, Parser> parsers = new HashMap<String, Parser>();
49      
50      private static MimeTypes mimeTypes;
51  
52      public TikaConfig(String file)
53              throws TikaException, IOException, SAXException {
54          this(new File(file));
55      }
56  
57      public TikaConfig(File file)
58              throws TikaException, IOException, SAXException {
59          this(getBuilder().parse(file));
60      }
61  
62      public TikaConfig(URL url)
63              throws TikaException, IOException, SAXException {
64          this(getBuilder().parse(url.toString()));
65      }
66  
67      public TikaConfig(InputStream stream)
68              throws TikaException, IOException, SAXException {
69          this(getBuilder().parse(stream));
70      }
71  
72      public TikaConfig(Document document) throws TikaException, IOException {
73          this(document.getDocumentElement());
74      }
75  
76      public TikaConfig(Element element) throws TikaException, IOException {
77          Element mtr = getChild(element, "mimeTypeRepository");
78          if (mtr != null) {
79              mimeTypes = MimeTypesFactory.create(mtr.getAttribute("resource"));
80          }
81  
82          NodeList nodes = element.getElementsByTagName("parser");
83          for (int i = 0; i < nodes.getLength(); i++) {
84              Element node = (Element) nodes.item(i);
85              String name = node.getAttribute("class");
86              try {
87                  Parser parser = (Parser) Class.forName(name).newInstance();
88                  NodeList mimes = node.getElementsByTagName("mime");
89                  for (int j = 0; j < mimes.getLength(); j++) {
90                      Element mime = (Element) mimes.item(j);
91                      parsers.put(mime.getTextContent().trim(), parser);
92                  }
93              } catch (Exception e) {
94                  throw new TikaException(
95                          "Invalid parser configuration: " + name, e);
96              }
97          }
98      }
99  
100     /**
101      * Returns the parser instance configured for the given MIME type.
102      * Returns <code>null</code> if the given MIME type is unknown.
103      *
104      * @param mimeType MIME type
105      * @return configured Parser instance, or <code>null</code>
106      */
107     public Parser getParser(String mimeType) {
108         return parsers.get(mimeType);
109     }
110 
111     public Map<String, Parser> getParsers() {
112         return parsers;
113     }
114 
115     public MimeTypes getMimeRepository(){
116         return mimeTypes;
117     }
118 
119     /**
120      * Provides a default configuration (TikaConfig).  Currently creates a
121      * new instance each time it's called; we may be able to have it
122      * return a shared instance once it is completely immutable.
123      *
124      * @return default configuration
125      * @throws TikaException if the default configuration is not available
126      */
127     public static TikaConfig getDefaultConfig() throws TikaException {
128         try {
129             InputStream stream =
130                 TikaConfig.class.getResourceAsStream(DEFAULT_CONFIG_LOCATION);
131             return new TikaConfig(stream);
132         } catch (IOException e) {
133             throw new TikaException("Unable to read default configuration", e);
134         } catch (SAXException e) {
135             throw new TikaException("Unable to parse default configuration", e);
136         }
137     }
138 
139     private static DocumentBuilder getBuilder() throws TikaException {
140         try {
141             return DocumentBuilderFactory.newInstance().newDocumentBuilder();
142         } catch (ParserConfigurationException e) {
143             throw new TikaException("XML parser not available", e);
144         }
145     }
146 
147     private static Element getChild(Element element, String name) {
148         Node child = element.getFirstChild();
149         while (child != null) {
150             if (child.getNodeType() == Node.ELEMENT_NODE
151                     && name.equals(child.getNodeName())) {
152                 return (Element) child;
153             }
154             child = child.getNextSibling();
155         }
156         return null;
157     }
158 
159 }