1 /**
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.tika.utils;
18
19 //JDK imports
20 import java.io.BufferedInputStream;
21 import java.io.File;
22 import java.io.FileInputStream;
23 import java.io.FileNotFoundException;
24 import java.io.IOException;
25 import java.io.InputStream;
26 import java.net.URL;
27 import java.util.ArrayList;
28 import java.util.List;
29
30 import org.apache.tika.config.TikaConfig;
31 import org.apache.tika.exception.TikaException;
32 import org.apache.tika.metadata.Metadata;
33 import org.apache.tika.metadata.TikaMimeKeys;
34 import org.apache.tika.parser.Parser;
35 import org.apache.tika.sax.BodyContentHandler;
36 import org.xml.sax.ContentHandler;
37 import org.xml.sax.SAXException;
38
39 /**
40 * Contains utility methods for parsing documents. Intended to provide simple
41 * entry points into the Tika framework.
42 */
43 public class ParseUtils implements TikaMimeKeys {
44
45 /**
46 * Returns a parser that can handle the specified MIME type, and is set to
47 * receive input from a stream opened from the specified URL. NB: Close the
48 * input stream when it is no longer needed!
49 *
50 * @param config
51 * @param mimeType
52 * the document's MIME type
53 * @return a parser appropriate to this MIME type
54 * @throws TikaException
55 */
56 public static Parser getParser(String mimeType, TikaConfig config)
57 throws TikaException {
58 return config.getParser(mimeType);
59 }
60
61 /**
62 * Returns a parser that can handle the specified MIME type, and is set to
63 * receive input from a stream opened from the specified URL. The MIME type
64 * is determined automatically. NB: Close the input stream when it is no
65 * longer needed!
66 *
67 * @param documentUrl
68 * URL pointing to the document to parse
69 * @param config
70 * @return a parser appropriate to this MIME type and ready to read input
71 * from the specified document
72 * @throws TikaException
73 */
74 public static Parser getParser(URL documentUrl, TikaConfig config)
75 throws TikaException {
76 String mimetype = config.getMimeRepository().getMimeType(documentUrl)
77 .getName();
78 return getParser(mimetype, config);
79 }
80
81 /**
82 * Returns a parser that can handle the specified MIME type, and is set to
83 * receive input from a stream opened from the specified URL. NB: Close the
84 * input stream when it is no longer needed!
85 *
86 * @param documentFile
87 * File object pointing to the document to parse
88 * @param config
89 * @return a parser appropriate to this MIME type and ready to read input
90 * from the specified document
91 * @throws TikaException
92 */
93 public static Parser getParser(File documentFile, TikaConfig config)
94 throws TikaException {
95 String mimetype = config.getMimeRepository().getMimeType(documentFile)
96 .getName();
97 return getParser(mimetype, config);
98 }
99
100 /**
101 * Returns a list of parsers from zip InputStream
102 *
103 * @param zip
104 * InputStream
105 * @param config
106 * @return a list of parsers from zip file
107 * @throws TikaException
108 */
109 private static List<Parser> getParsersFromZip(InputStream zipIs,
110 TikaConfig config) throws TikaException {
111 List<Parser> parsers = new ArrayList<Parser>();
112 List<File> zipFiles = Utils.unzip(zipIs);
113 for (int i = 0; i < zipFiles.size(); i++) {
114 File zipEntry = zipFiles.get(i);
115 parsers.add(getParser(zipEntry, config));
116 }
117 return parsers;
118 }
119
120 /**
121 * Returns a list of parsers from zip File
122 *
123 * @param zip
124 * File
125 * @param config
126 * @return a list of parsers from zip file
127 * @throws TikaException
128 * @throws FileNotFoundException
129 */
130 public static List<Parser> getParsersFromZip(File zip, TikaConfig config)
131 throws TikaException, FileNotFoundException {
132 String zipMimeType = config.getMimeRepository().getMimeType(zip)
133 .getName();
134 if (!zipMimeType.equalsIgnoreCase("application/zip")) {
135 throw new TikaException("The file you are using is note a zip file");
136 }
137 return getParsersFromZip(new FileInputStream(zip), config);
138 }
139
140 /**
141 * Returns a list of parsers from URL
142 *
143 * @param URL
144 * @param config
145 * @return a list of parsers from zip file
146 * @throws TikaException
147 * @throws IOException
148 */
149 public static List<Parser> getParsersFromZip(URL zip, TikaConfig config)
150 throws TikaException, IOException {
151 String zipMimeType = config.getMimeRepository().getMimeType(zip)
152 .getName();
153 if (!zipMimeType.equalsIgnoreCase("application/zip")) {
154 throw new TikaException("The file you are using is note a zip file");
155 }
156 return getParsersFromZip(zip.openStream(), config);
157 }
158
159 /**
160 * Gets the string content of a document read from an input stream.
161 *
162 * @param stream the stream from which to read document data
163 * @param config
164 * @param mimeType MIME type of the data
165 * @return the string content parsed from the document
166 */
167 public static String getStringContent(
168 InputStream stream, TikaConfig config, String mimeType)
169 throws TikaException, IOException {
170 try {
171 Parser parser = config.getParser(mimeType);
172 ContentHandler handler = new BodyContentHandler();
173 parser.parse(stream, handler, new Metadata());
174 return handler.toString();
175 } catch (SAXException e) {
176 throw new TikaException("Unexpected SAX error", e);
177 }
178 }
179
180 /**
181 * Gets the string content of a document read from an input stream.
182 *
183 * @param documentUrl
184 * URL pointing to the document to parse
185 * @param config
186 * @return the string content parsed from the document
187 */
188 public static String getStringContent(URL documentUrl, TikaConfig config)
189 throws TikaException, IOException {
190 String mime = config.getMimeRepository().getMimeType(documentUrl)
191 .getName();
192 return getStringContent(documentUrl, config, mime);
193 }
194
195 /**
196 * Gets the string content of a document read from an input stream.
197 *
198 * @param documentUrl
199 * URL pointing to the document to parse
200 * @param config
201 * @param mimeType
202 * MIME type of the data
203 * @return the string content parsed from the document
204 */
205 public static String getStringContent(
206 URL documentUrl, TikaConfig config, String mimeType)
207 throws TikaException, IOException {
208 InputStream stream = documentUrl.openStream();
209 try {
210 return getStringContent(stream, config, mimeType);
211 } finally {
212 stream.close();
213 }
214 }
215
216 /**
217 * Gets the string content of a document read from an input stream.
218 *
219 * @param documentFile
220 * File object pointing to the document to parse
221 * @param config
222 * @param mimeType
223 * MIME type of the data
224 * @return the string content parsed from the document
225 */
226 public static String getStringContent(
227 File documentFile, TikaConfig config, String mimeType)
228 throws TikaException, IOException {
229 InputStream stream = new BufferedInputStream(new FileInputStream(
230 documentFile));
231 try {
232 return getStringContent(stream, config, mimeType);
233 } finally {
234 stream.close();
235 }
236 }
237
238 /**
239 * Gets the string content of a document read from an input stream.
240 *
241 * @param documentFile
242 * File object pointing to the document to parse
243 * @param config
244 * @return the string content parsed from the document
245 */
246 public static String getStringContent(File documentFile, TikaConfig config)
247 throws TikaException, IOException {
248 String mime =
249 config.getMimeRepository().getMimeType(documentFile).getName();
250 return getStringContent(documentFile, config, mime);
251 }
252
253 }