View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.utils;
18  
19  import java.io.BufferedInputStream;
20  import java.io.BufferedOutputStream;
21  import java.io.ByteArrayInputStream;
22  import java.io.ByteArrayOutputStream;
23  import java.io.File;
24  import java.io.FileOutputStream;
25  import java.io.IOException;
26  import java.io.InputStream;
27  import java.io.OutputStream;
28  import java.io.Reader;
29  import java.util.ArrayList;
30  import java.util.List;
31  import java.util.zip.ZipEntry;
32  import java.util.zip.ZipInputStream;
33  
34  import org.apache.log4j.Logger;
35  import org.apache.tika.exception.TikaException;
36  import org.apache.tika.metadata.HttpHeaders;
37  import org.apache.tika.metadata.Metadata;
38  import org.jdom.Document;
39  import org.jdom.JDOMException;
40  import org.jdom.input.SAXBuilder;
41  import org.jdom.output.Format;
42  import org.jdom.output.XMLOutputter;
43  
44  import com.ibm.icu.text.CharsetDetector;
45  import com.ibm.icu.text.CharsetMatch;
46  
47  /**
48   * Class util
49   * 
50   * 
51   */
52  
53  public class Utils {
54  
55      static Logger logger = Logger.getRootLogger();
56  
57      public static Document parse(InputStream is) {
58          org.jdom.Document xmlDoc = new org.jdom.Document();
59          try {
60              SAXBuilder builder = new SAXBuilder();
61              builder.setValidation(false);
62              xmlDoc = builder.build(is);
63          } catch (JDOMException e) {
64              logger.error(e.getMessage());
65          } catch (IOException e) {
66              logger.error(e.getMessage());
67          }
68          return xmlDoc;
69      }
70  
71      public static List unzip(InputStream is) {
72          List res = new ArrayList();
73          try {
74              ZipInputStream in = new ZipInputStream(is);
75              ZipEntry entry = null;
76              while ((entry = in.getNextEntry()) != null) {
77                  ByteArrayOutputStream stream = new ByteArrayOutputStream();
78                  byte[] buf = new byte[1024];
79                  int len;
80                  while ((len = in.read(buf)) > 0) {
81                      stream.write(buf, 0, len);
82                  }
83                  InputStream isEntry = new ByteArrayInputStream(stream
84                          .toByteArray());
85                  File file = File.createTempFile("TIKA_unzip_", "_" + entry.getName());
86                  
87                  // TODO we might want to delete the file earlier than on exit,
88                  // in case Tika is used inside a long-running app
89                  file.deleteOnExit();
90                  saveInputStreamInFile(isEntry, new BufferedOutputStream(
91                          new FileOutputStream(file)));
92                  res.add(file);
93                  isEntry.close();
94              }
95              in.close();
96          } catch (IOException e) {
97              logger.error(e.getMessage());
98          }
99          return res;
100     }
101 
102     private static void saveInputStreamInFile(InputStream in, OutputStream out)
103             throws IOException {
104         byte[] buffer = new byte[1024];
105         int len;
106 
107         while ((len = in.read(buffer)) >= 0)
108             out.write(buffer, 0, len);
109 
110         in.close();
111         out.close();
112     }
113 
114     public static void saveInXmlFile(Document doc, String file) {
115         Format f = Format.getPrettyFormat().setEncoding("UTF-8");
116 
117         XMLOutputter xop = new XMLOutputter(f);
118 
119         try {
120 
121             xop.output(doc, new FileOutputStream(file));
122 
123         }
124 
125         catch (IOException ex) {
126 
127             logger.error(ex.getMessage());
128 
129         }
130     }
131 
132     /**
133      * Try to detect encoding from inputstream and return a UTF-8
134      * Reader. A metadata hint can be submitted as part of {@link Metadata}
135      * under key {@link HttpHeaders#CONTENT_ENCODING}.
136      * 
137      * After succesfull detection, fills Metadata with detected content encoding
138      * and content language ({@link HttpHeaders#CONTENT_LANGUAGE}).
139      * 
140      * @return Reader to utf8 encoded reader.
141      */
142     public static Reader getUTF8Reader(InputStream stream, Metadata metadata) throws TikaException, IOException{
143         CharsetDetector detector = new CharsetDetector();
144     
145         // Use the declared character encoding, if available
146         String encoding = metadata.get(Metadata.CONTENT_ENCODING);
147         if (encoding != null) {
148             detector.setDeclaredEncoding(encoding);
149         }
150     
151         // CharsetDetector expects a stream to support marks
152         if (!stream.markSupported()) {
153             stream = new BufferedInputStream(stream);
154         }
155     
156         detector.setText(stream);
157     
158         CharsetMatch match = detector.detect();
159         if (match == null) {
160             throw new TikaException("Unable to detect character encoding");
161         }
162         
163         metadata.set(Metadata.CONTENT_ENCODING, match.getName());
164         String language = match.getLanguage();
165         if (language != null) {
166             metadata.set(Metadata.CONTENT_LANGUAGE, match.getLanguage());
167             metadata.set(Metadata.LANGUAGE, match.getLanguage());
168         }
169         
170         return match.getReader();
171     }
172 
173 }