1 /**
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.tika.mime;
18
// JDK imports
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.SortedSet;
import java.util.TreeSet;
29
30 /**
31 * This class is a MimeType repository. It gathers a set of MimeTypes and
32 * enables to retrieves a content-type from its name, from a file name, or from
33 * a magic character sequence.
34 * <p>
35 * The MIME type detection methods that take an {@link InputStream} as
36 * an argument will never reads more than {@link #getMinLength()} bytes
37 * from the stream. Also the given stream is never
38 * {@link InputStream#close() closed}, {@link InputStream#mark(int) marked},
39 * or {@link InputStream#reset() reset} by the methods. Thus a client can
40 * use the {@link InputStream#markSupported() mark feature} of the stream
41 * (if available) to restore the stream back to the state it was before type
42 * detection if it wants to process the stream based on the detected type.
43 */
44 public final class MimeTypes {
45
46 /** The default <code>application/octet-stream</code> MimeType */
47 public final static String DEFAULT = "application/octet-stream";
48
49 private final MimeType root;
50
51 /** All the registered MimeTypes indexed on their name */
52 private final Map<String, MimeType> types = new HashMap<String, MimeType>();
53
54 /** The patterns matcher */
55 private Patterns patterns = new Patterns();
56
57 /** List of all registered magics */
58 private SortedSet<Magic> magics = new TreeSet<Magic>();
59
60 /** List of all registered rootXML */
61 private SortedSet<MimeType> xmls = new TreeSet<MimeType>();
62
63 public MimeTypes() {
64 root = new MimeType(this, DEFAULT);
65 types.put(root.getName(), root);
66 }
67
68 /**
69 * Find the Mime Content Type of a file.
70 *
71 * @param file
72 * to analyze.
73 * @return the Mime Content Type of the specified file, or <code>null</code>
74 * if none is found.
75 */
76 public MimeType getMimeType(File file) {
77 return getMimeType(file.getName());
78 }
79
80 /**
81 * Find the Mime Content Type of a document from its URL.
82 *
83 * @param url
84 * of the document to analyze.
85 * @return the Mime Content Type of the specified document URL, or
86 * <code>null</code> if none is found.
87 */
88 public MimeType getMimeType(URL url) {
89 return getMimeType(url.getPath());
90 }
91
92 /**
93 * Find the Mime Content Type of a document from its name.
94 *
95 * @param name
96 * of the document to analyze.
97 * @return the Mime Content Type of the specified document name
98 */
99 public MimeType getMimeType(String name) {
100 MimeType type = patterns.matches(name);
101 if (type != null) {
102 return type;
103 }
104 type = patterns.matches(name.toLowerCase());
105 if (type != null) {
106 return type;
107 } else {
108 return root;
109 }
110 }
111
112 /**
113 * Returns the MIME type that best matches the given first few bytes
114 * of a document stream.
115 * <p>
116 * The given byte array is expected to be at least {@link #getMinLength()}
117 * long, or shorter only if the document stream itself is shorter.
118 *
119 * @param data first few bytes of a document stream
120 * @return matching MIME type, or <code>null</code> if no match is found
121 */
122 public MimeType getMimeType(byte[] data) {
123 if (data == null) {
124 throw new IllegalArgumentException("Data is missing");
125 }
126
127 // First, check for XML descriptions (level by level)
128 for (MimeType type : xmls) {
129 if (type.matchesXML(data)) {
130 return type;
131 }
132 }
133
134 // Then, check for magic bytes
135 for (Magic magic : magics) {
136 if (magic.eval(data)) {
137 return magic.getType();
138 }
139 }
140
141 return null;
142 }
143
144 /**
145 * Returns the MIME type that best matches the first few bytes of the
146 * given document stream.
147 *
148 * @see #getMimeType(byte[])
149 * @param stream document stream
150 * @return matching MIME type, or <code>null</code> if no match is found
151 * @throws IOException if the stream can be read
152 */
153 public MimeType getMimeType(InputStream stream) throws IOException {
154 return getMimeType(readMagicHeader(stream));
155 }
156
157 /**
158 * Reads the first {@link #getMinLength()} bytes from the given stream.
159 * If the stream is shorter, then the entire content of the stream is
160 * returned.
161 * <p>
162 * The given stream is never {@link InputStream#close() closed},
163 * {@link InputStream#mark(int) marked}, or
164 * {@link InputStream#reset() reset} by this method.
165 *
166 * @param stream stream to be read
167 * @return first {@link #getMinLength()} (or fewer) bytes of the stream
168 * @throws IOException if the stream can not be read
169 */
170 private byte[] readMagicHeader(InputStream stream) throws IOException {
171 if (stream == null) {
172 throw new IllegalArgumentException("InputStream is missing");
173 }
174
175 byte[] bytes = new byte[getMinLength()];
176 int totalRead = 0;
177
178 int lastRead = stream.read(bytes);
179 while (lastRead != -1) {
180 totalRead += lastRead;
181 if (totalRead == bytes.length) {
182 return bytes;
183 }
184 lastRead = stream.read(bytes, totalRead, bytes.length - totalRead);
185 }
186
187 byte[] shorter = new byte[totalRead];
188 System.arraycopy(bytes, 0, shorter, 0, totalRead);
189 return shorter;
190 }
191
192 public String getType(String typeName, String url, byte[] data) {
193 MimeType type = getMimeType(url, data);
194
195 if (type == null && typeName != null) {
196 try {
197 type = forName(typeName);
198 } catch (MimeTypeException e) {
199 // Invalid type name hint
200 }
201 }
202
203 if (type == null) {
204 type = root;
205 }
206
207 return type.getName();
208 }
209
210 /**
211 * Determines the MIME type of the resource pointed to by the specified URL.
212 * Examines the file's header, and if it cannot determine the MIME type
213 * from the header, guesses the MIME type from the URL extension
214 * (e.g. "pdf).
215 *
216 * @param url
217 * @return
218 * @throws IOException
219 */
220 public String getType(URL url) throws IOException {
221 InputStream stream = url.openStream();
222 try {
223 return getType(null, url.toString(), readMagicHeader(stream));
224 } finally {
225 stream.close();
226 }
227 }
228
229 /**
230 * Find the Mime Content Type of a document from its name and its content.
231 * The policy used to guess the Mime Content Type is:
232 * <ol>
233 * <li>Try to find the type based on the provided data.</li>
234 * <li>If a type is found, then return it, otherwise try to find the type
235 * based on the file name</li>
236 * </ol>
237 *
238 * @param name
239 * of the document to analyze.
240 * @param data
241 * are the first bytes of the document's content.
242 * @return the Mime Content Type of the specified document, or
243 * <code>null</code> if none is found.
244 * @see #getMinLength()
245 */
246 public MimeType getMimeType(String name, byte[] data) {
247 // First, try to get the mime-type from the content
248 MimeType mimeType = getMimeType(data);
249
250 // If no mime-type found, then try to get the mime-type from
251 // the document name
252 if (mimeType == null) {
253 mimeType = getMimeType(name);
254 }
255
256 return mimeType;
257 }
258
259 /**
260 * Returns the MIME type that best matches the given document name and
261 * the first few bytes of the given document stream.
262 *
263 * @see #getMimeType(String, byte[])
264 * @param name document name
265 * @param stream document stream
266 * @return matching MIME type, or <code>null</code> if no match is found
267 * @throws IOException if the stream can not be read
268 */
269 public MimeType getMimeType(String name, InputStream stream)
270 throws IOException {
271 return getMimeType(name, readMagicHeader(stream));
272 }
273
274 /**
275 * Returns the registered media type with the given name (or alias).
276 * The named media type is automatically registered (and returned) if
277 * it doesn't already exist.
278 *
279 * @param name media type name (case-insensitive)
280 * @return the registered media type with the given name or alias
281 * @throws MimeTypeException if the given media type name is invalid
282 */
283 public synchronized MimeType forName(String name)
284 throws MimeTypeException {
285 if (MimeType.isValid(name)) {
286 name = name.toLowerCase();
287 MimeType type = types.get(name);
288 if (type == null) {
289 type = new MimeType(this, name);
290 type.setSuperType(root);
291 types.put(name, type);
292 }
293 return type;
294 } else {
295 throw new MimeTypeException("Invalid media type name: " + name);
296 }
297 }
298
299 /**
300 * Adds an alias for the given media type. This method should only
301 * be called from {@link MimeType#addAlias(String)}.
302 *
303 * @param type media type
304 * @param alias media type alias (normalized to lower case)
305 * @throws MimeTypeException if the alias already exists
306 */
307 synchronized void addAlias(MimeType type, String alias)
308 throws MimeTypeException {
309 if (!types.containsKey(alias)) {
310 types.put(alias, type);
311 } else {
312 throw new MimeTypeException(
313 "Media type alias already exists: " + alias);
314 }
315 }
316
317 /**
318 * Adds a file name pattern for the given media type.
319 *
320 * @param type media type
321 * @param pattern file name pattern
322 * @throws MimeTypeException if the pattern conflicts with existing ones
323 */
324 public void addPattern(MimeType type, String pattern)
325 throws MimeTypeException {
326 patterns.add(pattern, type);
327 }
328
329 /**
330 * Return the minimum length of data to provide to analyzing methods based
331 * on the document's content in order to check all the known MimeTypes.
332 *
333 * @return the minimum length of data to provide.
334 * @see #getMimeType(byte[])
335 * @see #getMimeType(String, byte[])
336 */
337 public int getMinLength() {
338 return 1024;
339 // return minLength;
340 }
341
342 /**
343 * Add the specified mime-type in the repository.
344 *
345 * @param type
346 * is the mime-type to add.
347 */
348 void add(MimeType type) {
349 // Update the magics index...
350 if (type.hasMagic()) {
351 magics.addAll(Arrays.asList(type.getMagics()));
352 }
353
354 // Update the xml (xmlRoot) index...
355 if (type.hasRootXML()) {
356 xmls.add(type);
357 }
358 }
359
360 }