1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.tika.utils;
18
19 import java.io.BufferedInputStream;
20 import java.io.BufferedOutputStream;
21 import java.io.ByteArrayInputStream;
22 import java.io.ByteArrayOutputStream;
23 import java.io.File;
24 import java.io.FileOutputStream;
25 import java.io.IOException;
26 import java.io.InputStream;
27 import java.io.OutputStream;
28 import java.io.Reader;
29 import java.util.ArrayList;
30 import java.util.List;
31 import java.util.zip.ZipEntry;
32 import java.util.zip.ZipInputStream;
33
34 import org.apache.log4j.Logger;
35 import org.apache.tika.exception.TikaException;
36 import org.apache.tika.metadata.HttpHeaders;
37 import org.apache.tika.metadata.Metadata;
38 import org.jdom.Document;
39 import org.jdom.JDOMException;
40 import org.jdom.input.SAXBuilder;
41 import org.jdom.output.Format;
42 import org.jdom.output.XMLOutputter;
43
44 import com.ibm.icu.text.CharsetDetector;
45 import com.ibm.icu.text.CharsetMatch;
46
47
48
49
50
51
52
53 public class Utils {
54
55 static Logger logger = Logger.getRootLogger();
56
57 public static Document parse(InputStream is) {
58 org.jdom.Document xmlDoc = new org.jdom.Document();
59 try {
60 SAXBuilder builder = new SAXBuilder();
61 builder.setValidation(false);
62 xmlDoc = builder.build(is);
63 } catch (JDOMException e) {
64 logger.error(e.getMessage());
65 } catch (IOException e) {
66 logger.error(e.getMessage());
67 }
68 return xmlDoc;
69 }
70
71 public static List unzip(InputStream is) {
72 List res = new ArrayList();
73 try {
74 ZipInputStream in = new ZipInputStream(is);
75 ZipEntry entry = null;
76 while ((entry = in.getNextEntry()) != null) {
77 ByteArrayOutputStream stream = new ByteArrayOutputStream();
78 byte[] buf = new byte[1024];
79 int len;
80 while ((len = in.read(buf)) > 0) {
81 stream.write(buf, 0, len);
82 }
83 InputStream isEntry = new ByteArrayInputStream(stream
84 .toByteArray());
85 File file = File.createTempFile("TIKA_unzip_", "_" + entry.getName());
86
87
88
89 file.deleteOnExit();
90 saveInputStreamInFile(isEntry, new BufferedOutputStream(
91 new FileOutputStream(file)));
92 res.add(file);
93 isEntry.close();
94 }
95 in.close();
96 } catch (IOException e) {
97 logger.error(e.getMessage());
98 }
99 return res;
100 }
101
102 private static void saveInputStreamInFile(InputStream in, OutputStream out)
103 throws IOException {
104 byte[] buffer = new byte[1024];
105 int len;
106
107 while ((len = in.read(buffer)) >= 0)
108 out.write(buffer, 0, len);
109
110 in.close();
111 out.close();
112 }
113
114 public static void saveInXmlFile(Document doc, String file) {
115 Format f = Format.getPrettyFormat().setEncoding("UTF-8");
116
117 XMLOutputter xop = new XMLOutputter(f);
118
119 try {
120
121 xop.output(doc, new FileOutputStream(file));
122
123 }
124
125 catch (IOException ex) {
126
127 logger.error(ex.getMessage());
128
129 }
130 }
131
132
133
134
135
136
137
138
139
140
141
142 public static Reader getUTF8Reader(InputStream stream, Metadata metadata) throws TikaException, IOException{
143 CharsetDetector detector = new CharsetDetector();
144
145
146 String encoding = metadata.get(Metadata.CONTENT_ENCODING);
147 if (encoding != null) {
148 detector.setDeclaredEncoding(encoding);
149 }
150
151
152 if (!stream.markSupported()) {
153 stream = new BufferedInputStream(stream);
154 }
155
156 detector.setText(stream);
157
158 CharsetMatch match = detector.detect();
159 if (match == null) {
160 throw new TikaException("Unable to detect character encoding");
161 }
162
163 metadata.set(Metadata.CONTENT_ENCODING, match.getName());
164 String language = match.getLanguage();
165 if (language != null) {
166 metadata.set(Metadata.CONTENT_LANGUAGE, match.getLanguage());
167 metadata.set(Metadata.LANGUAGE, match.getLanguage());
168 }
169
170 return match.getReader();
171 }
172
173 }