View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.utils;
18  
19  //JDK imports
20  import java.io.BufferedInputStream;
21  import java.io.File;
22  import java.io.FileInputStream;
23  import java.io.FileNotFoundException;
24  import java.io.IOException;
25  import java.io.InputStream;
26  import java.net.URL;
27  import java.util.ArrayList;
28  import java.util.List;
29  
30  import org.apache.tika.config.TikaConfig;
31  import org.apache.tika.exception.TikaException;
32  import org.apache.tika.metadata.Metadata;
33  import org.apache.tika.metadata.TikaMimeKeys;
34  import org.apache.tika.parser.Parser;
35  import org.apache.tika.sax.BodyContentHandler;
36  import org.xml.sax.ContentHandler;
37  import org.xml.sax.SAXException;
38  
39  /**
40   * Contains utility methods for parsing documents. Intended to provide simple
41   * entry points into the Tika framework.
42   */
43  public class ParseUtils implements TikaMimeKeys {
44  
45      /**
46       * Returns a parser that can handle the specified MIME type, and is set to
47       * receive input from a stream opened from the specified URL. NB: Close the
48       * input stream when it is no longer needed!
49       * 
50       * @param config
51       * @param mimeType
52       *            the document's MIME type
53       * @return a parser appropriate to this MIME type
54       * @throws TikaException
55       */
56      public static Parser getParser(String mimeType, TikaConfig config)
57              throws TikaException {
58          return config.getParser(mimeType);
59      }
60  
61      /**
62       * Returns a parser that can handle the specified MIME type, and is set to
63       * receive input from a stream opened from the specified URL. The MIME type
64       * is determined automatically. NB: Close the input stream when it is no
65       * longer needed!
66       * 
67       * @param documentUrl
68       *            URL pointing to the document to parse
69       * @param config
70       * @return a parser appropriate to this MIME type and ready to read input
71       *         from the specified document
72       * @throws TikaException
73       */
74      public static Parser getParser(URL documentUrl, TikaConfig config)
75              throws TikaException {
76          String mimetype = config.getMimeRepository().getMimeType(documentUrl)
77          .getName();
78          return getParser(mimetype, config);
79      }
80  
81      /**
82       * Returns a parser that can handle the specified MIME type, and is set to
83       * receive input from a stream opened from the specified URL. NB: Close the
84       * input stream when it is no longer needed!
85       * 
86       * @param documentFile
87       *            File object pointing to the document to parse
88       * @param config
89       * @return a parser appropriate to this MIME type and ready to read input
90       *         from the specified document
91       * @throws TikaException
92       */
93      public static Parser getParser(File documentFile, TikaConfig config)
94              throws TikaException {
95          String mimetype = config.getMimeRepository().getMimeType(documentFile)
96          .getName();
97          return getParser(mimetype, config);
98      }
99  
100     /**
101      * Returns a list of parsers from zip InputStream
102      * 
103      * @param zip
104      *            InputStream
105      * @param config
106      * @return a list of parsers from zip file
107      * @throws TikaException
108      */
109     private static List<Parser> getParsersFromZip(InputStream zipIs,
110             TikaConfig config) throws TikaException {
111         List<Parser> parsers = new ArrayList<Parser>();
112         List<File> zipFiles = Utils.unzip(zipIs);
113         for (int i = 0; i < zipFiles.size(); i++) {
114             File zipEntry = zipFiles.get(i);
115             parsers.add(getParser(zipEntry, config));
116         }
117         return parsers;
118     }
119 
120     /**
121      * Returns a list of parsers from zip File
122      * 
123      * @param zip
124      *            File
125      * @param config
126      * @return a list of parsers from zip file
127      * @throws TikaException
128      * @throws FileNotFoundException
129      */
130     public static List<Parser> getParsersFromZip(File zip, TikaConfig config)
131             throws TikaException, FileNotFoundException {
132         String zipMimeType = config.getMimeRepository().getMimeType(zip)
133         .getName();
134         if (!zipMimeType.equalsIgnoreCase("application/zip")) {
135             throw new TikaException("The file you are using is note a zip file");
136         }
137         return getParsersFromZip(new FileInputStream(zip), config);
138     }
139 
140     /**
141      * Returns a list of parsers from URL
142      * 
143      * @param URL
144      * @param config
145      * @return a list of parsers from zip file
146      * @throws TikaException
147      * @throws IOException
148      */
149     public static List<Parser> getParsersFromZip(URL zip, TikaConfig config)
150             throws TikaException, IOException {
151         String zipMimeType = config.getMimeRepository().getMimeType(zip)
152         .getName();
153         if (!zipMimeType.equalsIgnoreCase("application/zip")) {
154             throw new TikaException("The file you are using is note a zip file");
155         }
156         return getParsersFromZip(zip.openStream(), config);
157     }
158 
159     /**
160      * Gets the string content of a document read from an input stream.
161      * 
162      * @param stream the stream from which to read document data
163      * @param config
164      * @param mimeType MIME type of the data
165      * @return the string content parsed from the document
166      */
167     public static String getStringContent(
168             InputStream stream, TikaConfig config, String mimeType)
169             throws TikaException, IOException {
170         try {
171             Parser parser = config.getParser(mimeType);
172             ContentHandler handler = new BodyContentHandler();
173             parser.parse(stream, handler, new Metadata());
174             return handler.toString();
175         } catch (SAXException e) {
176             throw new TikaException("Unexpected SAX error", e);
177         }
178     }
179 
180     /**
181      * Gets the string content of a document read from an input stream.
182      * 
183      * @param documentUrl
184      *            URL pointing to the document to parse
185      * @param config
186      * @return the string content parsed from the document
187      */
188     public static String getStringContent(URL documentUrl, TikaConfig config)
189             throws TikaException, IOException {
190         String mime = config.getMimeRepository().getMimeType(documentUrl)
191         .getName();
192         return getStringContent(documentUrl, config, mime);
193     }
194 
195     /**
196      * Gets the string content of a document read from an input stream.
197      * 
198      * @param documentUrl
199      *            URL pointing to the document to parse
200      * @param config
201      * @param mimeType
202      *            MIME type of the data
203      * @return the string content parsed from the document
204      */
205     public static String getStringContent(
206             URL documentUrl, TikaConfig config, String mimeType)
207             throws TikaException, IOException {
208         InputStream stream = documentUrl.openStream();
209         try {
210             return getStringContent(stream, config, mimeType);
211         } finally {
212             stream.close();
213         }
214     }
215 
216     /**
217      * Gets the string content of a document read from an input stream.
218      * 
219      * @param documentFile
220      *            File object pointing to the document to parse
221      * @param config
222      * @param mimeType
223      *            MIME type of the data
224      * @return the string content parsed from the document
225      */
226     public static String getStringContent(
227             File documentFile, TikaConfig config, String mimeType)
228             throws TikaException, IOException {
229         InputStream stream = new BufferedInputStream(new FileInputStream(
230                 documentFile));
231         try {
232             return getStringContent(stream, config, mimeType);
233         } finally {
234             stream.close();
235         }
236     }
237 
238     /**
239      * Gets the string content of a document read from an input stream.
240      * 
241      * @param documentFile
242      *            File object pointing to the document to parse
243      * @param config
244      * @return the string content parsed from the document
245      */
246     public static String getStringContent(File documentFile, TikaConfig config)
247             throws TikaException, IOException {
248         String mime =
249             config.getMimeRepository().getMimeType(documentFile).getName();
250         return getStringContent(documentFile, config, mime);
251     }
252 
253 }