View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.tika.parser.opendocument;
19  
20  import java.io.ByteArrayInputStream;
21  import java.io.ByteArrayOutputStream;
22  import java.io.IOException;
23  import java.io.InputStream;
24  import java.io.OutputStream;
25  import java.util.ArrayList;
26  import java.util.List;
27  import java.util.zip.ZipEntry;
28  import java.util.zip.ZipInputStream;
29  
30  import org.apache.tika.exception.TikaException;
31  import org.apache.tika.metadata.Metadata;
32  import org.apache.tika.parser.Parser;
33  import org.apache.tika.parser.xml.XMLParser;
34  import org.apache.tika.sax.AppendableAdaptor;
35  import org.apache.tika.sax.XHTMLContentHandler;
36  
37  import org.apache.log4j.Logger;
38  import org.jdom.Document;
39  import org.jdom.Element;
40  import org.jdom.JDOMException;
41  import org.jdom.Namespace;
42  import org.jdom.input.SAXBuilder;
43  import org.xml.sax.ContentHandler;
44  import org.xml.sax.SAXException;
45  
46  /**
47   * OpenOffice parser
48   */
49  public class OpenOfficeParser implements Parser {
50      static Logger logger = Logger.getRootLogger();
51  
52      private final Namespace NS_DC = Namespace.getNamespace("dc",
53              "http://purl.org/dc/elements/1.1/");
54  
55      public org.jdom.Document parse(InputStream is) {
56          Document xmlDoc = new org.jdom.Document();
57          org.jdom.Document xmlMeta = new org.jdom.Document();
58          try {
59              List files = unzip(is);
60              SAXBuilder builder = new SAXBuilder();
61              builder.setEntityResolver(new OpenOfficeEntityResolver());
62              builder.setValidation(false);
63  
64              xmlDoc = builder.build((InputStream) files.get(0));
65              xmlMeta = builder.build((InputStream) files.get(1));
66              Element rootMeta = xmlMeta.getRootElement();
67              Element meta = null;
68              List ls = rootMeta.getChildren();
69              if (! ls.isEmpty()) {
70                  meta = (Element) ls.get(0);
71              }
72              xmlDoc.getRootElement().addContent(meta.detach());
73              xmlDoc.getRootElement().addNamespaceDeclaration(NS_DC);
74          } catch (JDOMException e) {
75              logger.error(e.getMessage());
76          } catch (IOException e) {
77              logger.error(e.getMessage());
78          }
79          return xmlDoc;
80      }
81  
82      public void parse(
83              InputStream stream, ContentHandler handler, Metadata metadata)
84              throws IOException, SAXException, TikaException {
85          Document xmlDoc = parse(stream);
86          XMLParser xp = new XMLParser();
87          xp.getAllDocumentNs(xmlDoc);
88          xp.extractContent(xmlDoc, Metadata.TITLE, "//dc:title", metadata);
89          xp.extractContent(xmlDoc, Metadata.SUBJECT, "//dc:subject", metadata);
90          xp.extractContent(xmlDoc, Metadata.CREATOR, "//dc:creator", metadata);
91          xp.extractContent(xmlDoc, Metadata.DESCRIPTION, "//dc:description", metadata);
92          xp.extractContent(xmlDoc, Metadata.LANGUAGE, "//dc:language", metadata);
93          xp.extractContent(xmlDoc, Metadata.KEYWORDS, "//meta:keyword", metadata);
94          xp.extractContent(xmlDoc, Metadata.DATE, "//dc:date", metadata);
95          xp.extractContent(xmlDoc, "nbTab", "//meta:document-statistic/@meta:table-count", metadata);
96          xp.extractContent(xmlDoc, "nbObject", "//meta:document-statistic/@meta:object-count", metadata);
97          xp.extractContent(xmlDoc, "nbImg", "//meta:document-statistic/@meta:image-count", metadata);
98          xp.extractContent(xmlDoc, "nbPage", "//meta:document-statistic/@meta:page-count", metadata);
99          xp.extractContent(xmlDoc, "nbPara", "//meta:document-statistic/@meta:paragraph-count", metadata);
100         xp.extractContent(xmlDoc, "nbWord", "//meta:document-statistic/@meta:word-count", metadata);
101         xp.extractContent(xmlDoc, "nbcharacter", "//meta:document-statistic/@meta:character-count", metadata);
102 
103         XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
104         xhtml.startDocument();
105         xhtml.startElement("p");
106         xp.concatOccurrence(xmlDoc, "//*", " ", new AppendableAdaptor(xhtml));
107         xhtml.endElement("p");
108         xhtml.endDocument();
109     }
110 
111     public List unzip(InputStream is) {
112         List res = new ArrayList();
113         try {
114             ZipInputStream in = new ZipInputStream(is);
115             ZipEntry entry = null;
116             while ((entry = in.getNextEntry()) != null) {
117                 if (entry.getName().equals("meta.xml")
118                         || entry.getName().equals("content.xml")) {
119                     ByteArrayOutputStream stream = new ByteArrayOutputStream();
120                     byte[] buf = new byte[1024];
121                     int len;
122                     while ((len = in.read(buf)) > 0) {
123                         stream.write(buf, 0, len);
124                     }
125                     InputStream isEntry = new ByteArrayInputStream(stream
126                             .toByteArray());
127                     res.add(isEntry);
128                 }
129             }
130             in.close();
131         } catch (IOException e) {
132             logger.error(e.getMessage());
133         }
134         return res;
135     }
136 
137     protected void copyInputStream(InputStream in, OutputStream out)
138             throws IOException {
139         byte[] buffer = new byte[1024];
140         int len;
141 
142         while ((len = in.read(buffer)) >= 0)
143             out.write(buffer, 0, len);
144 
145         in.close();
146         out.close();
147     }
148 
149 }