View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.parser.microsoft;
18  
19  // JDK imports
20  import java.io.IOException;
21  import java.io.InputStream;
22  
23  import org.apache.poi.hpsf.DocumentSummaryInformation;
24  import org.apache.poi.hpsf.HPSFException;
25  import org.apache.poi.hpsf.PropertySet;
26  import org.apache.poi.hpsf.PropertySetFactory;
27  import org.apache.poi.hpsf.SummaryInformation;
28  import org.apache.poi.poifs.filesystem.POIFSFileSystem;
29  import org.apache.tika.exception.TikaException;
30  import org.apache.tika.metadata.Metadata;
31  import org.apache.tika.parser.Parser;
32  import org.apache.tika.sax.AppendableAdaptor;
33  import org.apache.tika.sax.XHTMLContentHandler;
34  import org.xml.sax.ContentHandler;
35  import org.xml.sax.SAXException;
36  
37  /**
38   * Defines a Microsoft document content extractor.
39   */
40  public abstract class OfficeParser implements Parser {
41  
42      /**
43       * Extracts properties and text from an MS Document input stream
44       */
45      public void parse(
46              InputStream stream, ContentHandler handler, Metadata metadata)
47              throws IOException, SAXException, TikaException {
48          POIFSFileSystem filesystem = new POIFSFileSystem(stream);
49  
50          metadata.set(Metadata.CONTENT_TYPE, getContentType());
51          getMetadata(
52                  filesystem, SummaryInformation.DEFAULT_STREAM_NAME, metadata);
53          getMetadata(
54                  filesystem, DocumentSummaryInformation.DEFAULT_STREAM_NAME,
55                  metadata);
56  
57          XHTMLContentHandler xhtml =
58              new XHTMLContentHandler(handler, metadata);
59          xhtml.startDocument();
60          xhtml.startElement("p");
61          extractText(filesystem, new AppendableAdaptor(xhtml));
62          xhtml.endElement("p");
63          xhtml.endDocument();
64      }
65  
66      /**
67       * The content type of the document being parsed.
68       *
69       * @return MIME content type
70       */
71      protected abstract String getContentType();
72  
73      /**
74       * Extracts the text content from a Microsoft document input stream.
75       */
76      protected abstract void extractText(POIFSFileSystem filesystem, Appendable appendable)
77          throws IOException, TikaException;
78  
79      private void getMetadata(
80              POIFSFileSystem filesystem, String name, Metadata metadata) {
81          try {
82              InputStream stream = filesystem.createDocumentInputStream(name);
83              try {
84                  getMetadata(stream, metadata);
85              } finally {
86                  stream.close();
87              }
88          } catch (Exception e) {
89              // summary information not available, ignore
90          }
91      }
92  
93      private void getMetadata(InputStream stream, Metadata metadata)
94              throws HPSFException, IOException {
95          PropertySet set = PropertySetFactory.create(stream);
96          if (set instanceof SummaryInformation) {
97              getMetadata((SummaryInformation) set, metadata);
98          } else if (set instanceof DocumentSummaryInformation) {
99              getMetadata((DocumentSummaryInformation) set, metadata);
100         }
101     }
102 
103     private void getMetadata(
104             SummaryInformation information, Metadata metadata) {
105         if (information.getTitle() != null) {
106             metadata.set(Metadata.TITLE, information.getTitle());
107         }
108         if (information.getAuthor() != null) {
109             metadata.set(Metadata.AUTHOR, information.getAuthor());
110         }
111         if (information.getKeywords() != null) {
112             metadata.set(Metadata.KEYWORDS, information.getKeywords());
113         }
114         if (information.getSubject() != null) {
115             metadata.set(Metadata.SUBJECT, information.getSubject());
116         }
117         if (information.getLastAuthor() != null) {
118             metadata.set(Metadata.LAST_AUTHOR, information.getLastAuthor());
119         }
120         if (information.getComments() != null) {
121             metadata.set(Metadata.COMMENTS, information.getComments());
122         }
123         if (information.getTemplate() != null) {
124             metadata.set(Metadata.TEMPLATE, information.getTemplate());
125         }
126         if (information.getApplicationName() != null) {
127             metadata.set(
128                     Metadata.APPLICATION_NAME,
129                     information.getApplicationName());
130         }
131         if (information.getRevNumber() != null) {
132             metadata.set(Metadata.REVISION_NUMBER, information.getRevNumber());
133         }
134         if (information.getCreateDateTime() != null) {
135             metadata.set(
136                     "creationdate",
137                     information.getCreateDateTime().toString());
138         }
139         if (information.getCharCount() > 0) {
140             metadata.set(
141                     Metadata.CHARACTER_COUNT,
142                     Integer.toString(information.getCharCount()));
143         }
144         if (information.getEditTime() > 0) {
145             metadata.set("edittime", Long.toString(information.getEditTime()));
146         }
147         if (information.getLastSaveDateTime() != null) {
148             metadata.set(
149                     Metadata.LAST_SAVED,
150                     information.getLastSaveDateTime().toString());
151         }
152         if (information.getPageCount() > 0) {
153             metadata.set(
154                     Metadata.PAGE_COUNT,
155                     Integer.toString(information.getPageCount()));
156         }
157         if (information.getSecurity() > 0) {
158             metadata.set(
159                     "security", Integer.toString(information.getSecurity()));
160         }
161         if (information.getWordCount() > 0) {
162             metadata.set(
163                     Metadata.WORD_COUNT,
164                     Integer.toString(information.getWordCount()));
165         }
166         if (information.getLastPrinted() != null) {
167             metadata.set(
168                     Metadata.LAST_PRINTED,
169                     information.getLastPrinted().toString());
170         }
171     }
172 
173     private void getMetadata(
174             DocumentSummaryInformation information, Metadata metadata) {
175         if (information.getCompany() != null) {
176             metadata.set("company", information.getCompany());
177         }
178         if (information.getManager() != null) {
179             metadata.set("manager", information.getManager());
180         }
181     }
182 
183 }