View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.mime;
18  
19  // JDK imports
20  import java.io.File;
21  import java.io.IOException;
22  import java.io.InputStream;
23  import java.net.URL;
24  import java.util.Arrays;
25  import java.util.Map;
26  import java.util.HashMap;
27  import java.util.SortedSet;
28  import java.util.TreeSet;
29  
30  /**
31   * This class is a MimeType repository. It gathers a set of MimeTypes and
32   * enables retrieving a content type from its name, from a file name, or from
33   * a magic character sequence.
34   * <p>
35   * The MIME type detection methods that take an {@link InputStream} as
36   * an argument will never read more than {@link #getMinLength()} bytes
37   * from the stream. Also the given stream is never
38   * {@link InputStream#close() closed}, {@link InputStream#mark(int) marked},
39   * or {@link InputStream#reset() reset} by the methods. Thus a client can
40   * use the {@link InputStream#markSupported() mark feature} of the stream
41   * (if available) to restore the stream back to the state it was before type
42   * detection if it wants to process the stream based on the detected type.
43   */
44  public final class MimeTypes {
45  
46      /** The default <code>application/octet-stream</code> MimeType */
47      public final static String DEFAULT = "application/octet-stream";
48  
49      private final MimeType root;
50  
51      /** All the registered MimeTypes indexed on their name */
52      private final Map<String, MimeType> types = new HashMap<String, MimeType>();
53  
54      /** The patterns matcher */
55      private Patterns patterns = new Patterns();
56  
57      /** List of all registered magics */
58      private SortedSet<Magic> magics = new TreeSet<Magic>();
59  
60      /** List of all registered rootXML */
61      private SortedSet<MimeType> xmls = new TreeSet<MimeType>();
62  
63      public MimeTypes() {
64          root = new MimeType(this, DEFAULT);
65          types.put(root.getName(), root);
66      }
67  
68      /**
69       * Find the Mime Content Type of a file.
70       * 
71       * @param file
72       *            to analyze.
73       * @return the Mime Content Type of the specified file, or <code>null</code>
74       *         if none is found.
75       */
76      public MimeType getMimeType(File file) {
77          return getMimeType(file.getName());
78      }
79  
80      /**
81       * Find the Mime Content Type of a document from its URL.
82       * 
83       * @param url
84       *            of the document to analyze.
85       * @return the Mime Content Type of the specified document URL, or
86       *         <code>null</code> if none is found.
87       */
88      public MimeType getMimeType(URL url) {
89          return getMimeType(url.getPath());
90      }
91  
92      /**
93       * Find the Mime Content Type of a document from its name.
94       * 
95       * @param name
96       *            of the document to analyze.
97       * @return the Mime Content Type of the specified document name
98       */
99      public MimeType getMimeType(String name) {
100         MimeType type = patterns.matches(name);
101         if (type != null) {
102             return type;
103         }
104         type = patterns.matches(name.toLowerCase());
105         if (type != null) {
106             return type;
107         } else {
108             return root;
109         }
110     }
111 
112     /**
113      * Returns the MIME type that best matches the given first few bytes
114      * of a document stream.
115      * <p>
116      * The given byte array is expected to be at least {@link #getMinLength()}
117      * long, or shorter only if the document stream itself is shorter.
118      *
119      * @param data first few bytes of a document stream
120      * @return matching MIME type, or <code>null</code> if no match is found
121      */
122     public MimeType getMimeType(byte[] data) {
123         if (data == null) {
124             throw new IllegalArgumentException("Data is missing");
125         }
126 
127         // First, check for XML descriptions (level by level)
128         for (MimeType type : xmls) {
129             if (type.matchesXML(data)) {
130                 return type;
131             }
132         }
133 
134         // Then, check for magic bytes
135         for (Magic magic : magics) {
136             if (magic.eval(data)) {
137                 return magic.getType();
138             }
139         }
140 
141         return null;
142     }
143 
144     /**
145      * Returns the MIME type that best matches the first few bytes of the
146      * given document stream.
147      *
148      * @see #getMimeType(byte[])
149      * @param stream document stream
150      * @return matching MIME type, or <code>null</code> if no match is found
151      * @throws IOException if the stream can be read
152      */
153     public MimeType getMimeType(InputStream stream) throws IOException {
154         return getMimeType(readMagicHeader(stream));
155     }
156 
157     /**
158      * Reads the first {@link #getMinLength()} bytes from the given stream.
159      * If the stream is shorter, then the entire content of the stream is
160      * returned.
161      * <p>
162      * The given stream is never {@link InputStream#close() closed},
163      * {@link InputStream#mark(int) marked}, or
164      * {@link InputStream#reset() reset} by this method.
165      *
166      * @param stream stream to be read
167      * @return first {@link #getMinLength()} (or fewer) bytes of the stream
168      * @throws IOException if the stream can not be read
169      */
170     private byte[] readMagicHeader(InputStream stream) throws IOException {
171         if (stream == null) {
172             throw new IllegalArgumentException("InputStream is missing");
173         }
174 
175         byte[] bytes = new byte[getMinLength()];
176         int totalRead = 0;
177 
178         int lastRead = stream.read(bytes);
179         while (lastRead != -1) {
180             totalRead += lastRead;
181             if (totalRead == bytes.length) {
182                 return bytes;
183             }
184             lastRead = stream.read(bytes, totalRead, bytes.length - totalRead);
185         }
186 
187         byte[] shorter = new byte[totalRead];
188         System.arraycopy(bytes, 0, shorter, 0, totalRead);
189         return shorter;
190     }
191 
192     public String getType(String typeName, String url, byte[] data) {
193         MimeType type = getMimeType(url, data);
194 
195         if (type == null && typeName != null) {
196             try {
197                 type = forName(typeName);
198             } catch (MimeTypeException e) {
199                 // Invalid type name hint
200             }
201         }
202 
203         if (type == null) {
204             type = root;
205         }
206 
207         return type.getName();
208     }
209 
210     /**
211      * Determines the MIME type of the resource pointed to by the specified URL.
212      * Examines the file's header, and if it cannot determine the MIME type
213      * from the header, guesses the MIME type from the URL extension
214      * (e.g. "pdf).
215      *
216      * @param url
217      * @return
218      * @throws IOException
219      */
220     public String getType(URL url) throws IOException {
221         InputStream stream = url.openStream();
222         try {
223             return getType(null, url.toString(), readMagicHeader(stream));
224         } finally {
225             stream.close();
226         }
227     }
228 
229     /**
230      * Find the Mime Content Type of a document from its name and its content.
231      * The policy used to guess the Mime Content Type is:
232      * <ol>
233      * <li>Try to find the type based on the provided data.</li>
234      * <li>If a type is found, then return it, otherwise try to find the type
235      * based on the file name</li>
236      * </ol>
237      * 
238      * @param name
239      *            of the document to analyze.
240      * @param data
241      *            are the first bytes of the document's content.
242      * @return the Mime Content Type of the specified document, or
243      *         <code>null</code> if none is found.
244      * @see #getMinLength()
245      */
246     public MimeType getMimeType(String name, byte[] data) {
247         // First, try to get the mime-type from the content
248         MimeType mimeType = getMimeType(data);
249 
250         // If no mime-type found, then try to get the mime-type from
251         // the document name
252         if (mimeType == null) {
253             mimeType = getMimeType(name);
254         }
255 
256         return mimeType;
257     }
258 
259     /**
260      * Returns the MIME type that best matches the given document name and
261      * the first few bytes of the given document stream.
262      *
263      * @see #getMimeType(String, byte[])
264      * @param name document name
265      * @param stream document stream
266      * @return matching MIME type, or <code>null</code> if no match is found
267      * @throws IOException if the stream can not be read
268      */
269     public MimeType getMimeType(String name, InputStream stream)
270             throws IOException {
271         return getMimeType(name, readMagicHeader(stream));
272     }
273 
274     /**
275      * Returns the registered media type with the given name (or alias).
276      * The named media type is automatically registered (and returned) if
277      * it doesn't already exist.
278      *
279      * @param name media type name (case-insensitive)
280      * @return the registered media type with the given name or alias
281      * @throws MimeTypeException if the given media type name is invalid
282      */
283     public synchronized MimeType forName(String name)
284             throws MimeTypeException {
285         if (MimeType.isValid(name)) {
286             name = name.toLowerCase();
287             MimeType type = types.get(name);
288             if (type == null) {
289                 type = new MimeType(this, name);
290                 type.setSuperType(root);
291                 types.put(name, type);
292             }
293             return type;
294         } else {
295             throw new MimeTypeException("Invalid media type name: " + name);
296         }
297     }
298 
299     /**
300      * Adds an alias for the given media type. This method should only
301      * be called from {@link MimeType#addAlias(String)}.
302      *
303      * @param type media type
304      * @param alias media type alias (normalized to lower case)
305      * @throws MimeTypeException if the alias already exists
306      */
307     synchronized void addAlias(MimeType type, String alias)
308             throws MimeTypeException {
309         if (!types.containsKey(alias)) {
310             types.put(alias, type);
311         } else {
312             throw new MimeTypeException(
313                     "Media type alias already exists: " + alias);
314         }
315     }
316 
317     /**
318      * Adds a file name pattern for the given media type.
319      *
320      * @param type media type
321      * @param pattern file name pattern
322      * @throws MimeTypeException if the pattern conflicts with existing ones
323      */
324     public void addPattern(MimeType type, String pattern)
325             throws MimeTypeException {
326         patterns.add(pattern, type);
327     }
328 
329     /**
330      * Return the minimum length of data to provide to analyzing methods based
331      * on the document's content in order to check all the known MimeTypes.
332      * 
333      * @return the minimum length of data to provide.
334      * @see #getMimeType(byte[])
335      * @see #getMimeType(String, byte[])
336      */
337     public int getMinLength() {
338         return 1024;
339         // return minLength;
340     }
341 
342     /**
343      * Add the specified mime-type in the repository.
344      * 
345      * @param type
346      *            is the mime-type to add.
347      */
348     void add(MimeType type) {
349         // Update the magics index...
350         if (type.hasMagic()) {
351             magics.addAll(Arrays.asList(type.getMagics()));
352         }
353 
354         // Update the xml (xmlRoot) index...
355         if (type.hasRootXML()) {
356             xmls.add(type);
357         }
358     }
359 
360 }