1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.tika.parser.microsoft;
18
19
20 import java.io.IOException;
21 import java.io.InputStream;
22
23 import org.apache.poi.hpsf.DocumentSummaryInformation;
24 import org.apache.poi.hpsf.HPSFException;
25 import org.apache.poi.hpsf.PropertySet;
26 import org.apache.poi.hpsf.PropertySetFactory;
27 import org.apache.poi.hpsf.SummaryInformation;
28 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
29 import org.apache.tika.exception.TikaException;
30 import org.apache.tika.metadata.Metadata;
31 import org.apache.tika.parser.Parser;
32 import org.apache.tika.sax.AppendableAdaptor;
33 import org.apache.tika.sax.XHTMLContentHandler;
34 import org.xml.sax.ContentHandler;
35 import org.xml.sax.SAXException;
36
37
38
39
40 public abstract class OfficeParser implements Parser {
41
42
43
44
45 public void parse(
46 InputStream stream, ContentHandler handler, Metadata metadata)
47 throws IOException, SAXException, TikaException {
48 POIFSFileSystem filesystem = new POIFSFileSystem(stream);
49
50 metadata.set(Metadata.CONTENT_TYPE, getContentType());
51 getMetadata(
52 filesystem, SummaryInformation.DEFAULT_STREAM_NAME, metadata);
53 getMetadata(
54 filesystem, DocumentSummaryInformation.DEFAULT_STREAM_NAME,
55 metadata);
56
57 XHTMLContentHandler xhtml =
58 new XHTMLContentHandler(handler, metadata);
59 xhtml.startDocument();
60 xhtml.startElement("p");
61 extractText(filesystem, new AppendableAdaptor(xhtml));
62 xhtml.endElement("p");
63 xhtml.endDocument();
64 }
65
66
67
68
69
70
71 protected abstract String getContentType();
72
73
74
75
76 protected abstract void extractText(POIFSFileSystem filesystem, Appendable appendable)
77 throws IOException, TikaException;
78
79 private void getMetadata(
80 POIFSFileSystem filesystem, String name, Metadata metadata) {
81 try {
82 InputStream stream = filesystem.createDocumentInputStream(name);
83 try {
84 getMetadata(stream, metadata);
85 } finally {
86 stream.close();
87 }
88 } catch (Exception e) {
89
90 }
91 }
92
93 private void getMetadata(InputStream stream, Metadata metadata)
94 throws HPSFException, IOException {
95 PropertySet set = PropertySetFactory.create(stream);
96 if (set instanceof SummaryInformation) {
97 getMetadata((SummaryInformation) set, metadata);
98 } else if (set instanceof DocumentSummaryInformation) {
99 getMetadata((DocumentSummaryInformation) set, metadata);
100 }
101 }
102
103 private void getMetadata(
104 SummaryInformation information, Metadata metadata) {
105 if (information.getTitle() != null) {
106 metadata.set(Metadata.TITLE, information.getTitle());
107 }
108 if (information.getAuthor() != null) {
109 metadata.set(Metadata.AUTHOR, information.getAuthor());
110 }
111 if (information.getKeywords() != null) {
112 metadata.set(Metadata.KEYWORDS, information.getKeywords());
113 }
114 if (information.getSubject() != null) {
115 metadata.set(Metadata.SUBJECT, information.getSubject());
116 }
117 if (information.getLastAuthor() != null) {
118 metadata.set(Metadata.LAST_AUTHOR, information.getLastAuthor());
119 }
120 if (information.getComments() != null) {
121 metadata.set(Metadata.COMMENTS, information.getComments());
122 }
123 if (information.getTemplate() != null) {
124 metadata.set(Metadata.TEMPLATE, information.getTemplate());
125 }
126 if (information.getApplicationName() != null) {
127 metadata.set(
128 Metadata.APPLICATION_NAME,
129 information.getApplicationName());
130 }
131 if (information.getRevNumber() != null) {
132 metadata.set(Metadata.REVISION_NUMBER, information.getRevNumber());
133 }
134 if (information.getCreateDateTime() != null) {
135 metadata.set(
136 "creationdate",
137 information.getCreateDateTime().toString());
138 }
139 if (information.getCharCount() > 0) {
140 metadata.set(
141 Metadata.CHARACTER_COUNT,
142 Integer.toString(information.getCharCount()));
143 }
144 if (information.getEditTime() > 0) {
145 metadata.set("edittime", Long.toString(information.getEditTime()));
146 }
147 if (information.getLastSaveDateTime() != null) {
148 metadata.set(
149 Metadata.LAST_SAVED,
150 information.getLastSaveDateTime().toString());
151 }
152 if (information.getPageCount() > 0) {
153 metadata.set(
154 Metadata.PAGE_COUNT,
155 Integer.toString(information.getPageCount()));
156 }
157 if (information.getSecurity() > 0) {
158 metadata.set(
159 "security", Integer.toString(information.getSecurity()));
160 }
161 if (information.getWordCount() > 0) {
162 metadata.set(
163 Metadata.WORD_COUNT,
164 Integer.toString(information.getWordCount()));
165 }
166 if (information.getLastPrinted() != null) {
167 metadata.set(
168 Metadata.LAST_PRINTED,
169 information.getLastPrinted().toString());
170 }
171 }
172
173 private void getMetadata(
174 DocumentSummaryInformation information, Metadata metadata) {
175 if (information.getCompany() != null) {
176 metadata.set("company", information.getCompany());
177 }
178 if (information.getManager() != null) {
179 metadata.set("manager", information.getManager());
180 }
181 }
182
183 }