1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.tika.parser.opendocument;
19
20 import java.io.ByteArrayInputStream;
21 import java.io.ByteArrayOutputStream;
22 import java.io.IOException;
23 import java.io.InputStream;
24 import java.io.OutputStream;
25 import java.util.ArrayList;
26 import java.util.List;
27 import java.util.zip.ZipEntry;
28 import java.util.zip.ZipInputStream;
29
30 import org.apache.tika.exception.TikaException;
31 import org.apache.tika.metadata.Metadata;
32 import org.apache.tika.parser.Parser;
33 import org.apache.tika.parser.xml.XMLParser;
34 import org.apache.tika.sax.AppendableAdaptor;
35 import org.apache.tika.sax.XHTMLContentHandler;
36
37 import org.apache.log4j.Logger;
38 import org.jdom.Document;
39 import org.jdom.Element;
40 import org.jdom.JDOMException;
41 import org.jdom.Namespace;
42 import org.jdom.input.SAXBuilder;
43 import org.xml.sax.ContentHandler;
44 import org.xml.sax.SAXException;
45
46
47
48
49 public class OpenOfficeParser implements Parser {
50 static Logger logger = Logger.getRootLogger();
51
52 private final Namespace NS_DC = Namespace.getNamespace("dc",
53 "http://purl.org/dc/elements/1.1/");
54
55 public org.jdom.Document parse(InputStream is) {
56 Document xmlDoc = new org.jdom.Document();
57 org.jdom.Document xmlMeta = new org.jdom.Document();
58 try {
59 List files = unzip(is);
60 SAXBuilder builder = new SAXBuilder();
61 builder.setEntityResolver(new OpenOfficeEntityResolver());
62 builder.setValidation(false);
63
64 xmlDoc = builder.build((InputStream) files.get(0));
65 xmlMeta = builder.build((InputStream) files.get(1));
66 Element rootMeta = xmlMeta.getRootElement();
67 Element meta = null;
68 List ls = rootMeta.getChildren();
69 if (! ls.isEmpty()) {
70 meta = (Element) ls.get(0);
71 }
72 xmlDoc.getRootElement().addContent(meta.detach());
73 xmlDoc.getRootElement().addNamespaceDeclaration(NS_DC);
74 } catch (JDOMException e) {
75 logger.error(e.getMessage());
76 } catch (IOException e) {
77 logger.error(e.getMessage());
78 }
79 return xmlDoc;
80 }
81
82 public void parse(
83 InputStream stream, ContentHandler handler, Metadata metadata)
84 throws IOException, SAXException, TikaException {
85 Document xmlDoc = parse(stream);
86 XMLParser xp = new XMLParser();
87 xp.getAllDocumentNs(xmlDoc);
88 xp.extractContent(xmlDoc, Metadata.TITLE, "//dc:title", metadata);
89 xp.extractContent(xmlDoc, Metadata.SUBJECT, "//dc:subject", metadata);
90 xp.extractContent(xmlDoc, Metadata.CREATOR, "//dc:creator", metadata);
91 xp.extractContent(xmlDoc, Metadata.DESCRIPTION, "//dc:description", metadata);
92 xp.extractContent(xmlDoc, Metadata.LANGUAGE, "//dc:language", metadata);
93 xp.extractContent(xmlDoc, Metadata.KEYWORDS, "//meta:keyword", metadata);
94 xp.extractContent(xmlDoc, Metadata.DATE, "//dc:date", metadata);
95 xp.extractContent(xmlDoc, "nbTab", "//meta:document-statistic/@meta:table-count", metadata);
96 xp.extractContent(xmlDoc, "nbObject", "//meta:document-statistic/@meta:object-count", metadata);
97 xp.extractContent(xmlDoc, "nbImg", "//meta:document-statistic/@meta:image-count", metadata);
98 xp.extractContent(xmlDoc, "nbPage", "//meta:document-statistic/@meta:page-count", metadata);
99 xp.extractContent(xmlDoc, "nbPara", "//meta:document-statistic/@meta:paragraph-count", metadata);
100 xp.extractContent(xmlDoc, "nbWord", "//meta:document-statistic/@meta:word-count", metadata);
101 xp.extractContent(xmlDoc, "nbcharacter", "//meta:document-statistic/@meta:character-count", metadata);
102
103 XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
104 xhtml.startDocument();
105 xhtml.startElement("p");
106 xp.concatOccurrence(xmlDoc, "//*", " ", new AppendableAdaptor(xhtml));
107 xhtml.endElement("p");
108 xhtml.endDocument();
109 }
110
111 public List unzip(InputStream is) {
112 List res = new ArrayList();
113 try {
114 ZipInputStream in = new ZipInputStream(is);
115 ZipEntry entry = null;
116 while ((entry = in.getNextEntry()) != null) {
117 if (entry.getName().equals("meta.xml")
118 || entry.getName().equals("content.xml")) {
119 ByteArrayOutputStream stream = new ByteArrayOutputStream();
120 byte[] buf = new byte[1024];
121 int len;
122 while ((len = in.read(buf)) > 0) {
123 stream.write(buf, 0, len);
124 }
125 InputStream isEntry = new ByteArrayInputStream(stream
126 .toByteArray());
127 res.add(isEntry);
128 }
129 }
130 in.close();
131 } catch (IOException e) {
132 logger.error(e.getMessage());
133 }
134 return res;
135 }
136
137 protected void copyInputStream(InputStream in, OutputStream out)
138 throws IOException {
139 byte[] buffer = new byte[1024];
140 int len;
141
142 while ((len = in.read(buffer)) >= 0)
143 out.write(buffer, 0, len);
144
145 in.close();
146 out.close();
147 }
148
149 }