1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.tika.config;
18
19 import java.io.File;
20 import java.io.IOException;
21 import java.io.InputStream;
22 import java.net.URL;
23 import java.util.HashMap;
24 import java.util.Map;
25
26 import javax.xml.parsers.DocumentBuilder;
27 import javax.xml.parsers.DocumentBuilderFactory;
28 import javax.xml.parsers.ParserConfigurationException;
29
30 import org.apache.tika.exception.TikaException;
31 import org.apache.tika.mime.MimeTypes;
32 import org.apache.tika.mime.MimeTypesFactory;
33 import org.apache.tika.parser.Parser;
34 import org.w3c.dom.Document;
35 import org.w3c.dom.Element;
36 import org.w3c.dom.Node;
37 import org.w3c.dom.NodeList;
38 import org.xml.sax.SAXException;
39
40
41
42
43 public class TikaConfig {
44
45 public static final String DEFAULT_CONFIG_LOCATION =
46 "/org/apache/tika/tika-config.xml";
47
48 private final Map<String, Parser> parsers = new HashMap<String, Parser>();
49
50 private static MimeTypes mimeTypes;
51
52 public TikaConfig(String file)
53 throws TikaException, IOException, SAXException {
54 this(new File(file));
55 }
56
57 public TikaConfig(File file)
58 throws TikaException, IOException, SAXException {
59 this(getBuilder().parse(file));
60 }
61
62 public TikaConfig(URL url)
63 throws TikaException, IOException, SAXException {
64 this(getBuilder().parse(url.toString()));
65 }
66
67 public TikaConfig(InputStream stream)
68 throws TikaException, IOException, SAXException {
69 this(getBuilder().parse(stream));
70 }
71
72 public TikaConfig(Document document) throws TikaException, IOException {
73 this(document.getDocumentElement());
74 }
75
76 public TikaConfig(Element element) throws TikaException, IOException {
77 Element mtr = getChild(element, "mimeTypeRepository");
78 if (mtr != null) {
79 mimeTypes = MimeTypesFactory.create(mtr.getAttribute("resource"));
80 }
81
82 NodeList nodes = element.getElementsByTagName("parser");
83 for (int i = 0; i < nodes.getLength(); i++) {
84 Element node = (Element) nodes.item(i);
85 String name = node.getAttribute("class");
86 try {
87 Parser parser = (Parser) Class.forName(name).newInstance();
88 NodeList mimes = node.getElementsByTagName("mime");
89 for (int j = 0; j < mimes.getLength(); j++) {
90 Element mime = (Element) mimes.item(j);
91 parsers.put(mime.getTextContent().trim(), parser);
92 }
93 } catch (Exception e) {
94 throw new TikaException(
95 "Invalid parser configuration: " + name, e);
96 }
97 }
98 }
99
100
101
102
103
104
105
106
107 public Parser getParser(String mimeType) {
108 return parsers.get(mimeType);
109 }
110
111 public Map<String, Parser> getParsers() {
112 return parsers;
113 }
114
115 public MimeTypes getMimeRepository(){
116 return mimeTypes;
117 }
118
119
120
121
122
123
124
125
126
127 public static TikaConfig getDefaultConfig() throws TikaException {
128 try {
129 InputStream stream =
130 TikaConfig.class.getResourceAsStream(DEFAULT_CONFIG_LOCATION);
131 return new TikaConfig(stream);
132 } catch (IOException e) {
133 throw new TikaException("Unable to read default configuration", e);
134 } catch (SAXException e) {
135 throw new TikaException("Unable to parse default configuration", e);
136 }
137 }
138
139 private static DocumentBuilder getBuilder() throws TikaException {
140 try {
141 return DocumentBuilderFactory.newInstance().newDocumentBuilder();
142 } catch (ParserConfigurationException e) {
143 throw new TikaException("XML parser not available", e);
144 }
145 }
146
147 private static Element getChild(Element element, String name) {
148 Node child = element.getFirstChild();
149 while (child != null) {
150 if (child.getNodeType() == Node.ELEMENT_NODE
151 && name.equals(child.getNodeName())) {
152 return (Element) child;
153 }
154 child = child.getNextSibling();
155 }
156 return null;
157 }
158
159 }