/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17 package org.apache.tika.parser.microsoft;
18
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.List;
import java.util.Vector;

import org.apache.log4j.Logger;
import org.apache.poi.hdf.extractor.Utils;
import org.apache.poi.util.LittleEndian;
import org.apache.poi.util.StringUtil;
29
30 class PowerPointExtractor {
31
32 static Logger LOG = Logger.getRootLogger();
33
34 /** Buffer holding the content of the file */
35 private final Appendable builder;
36
37 /**
38 * Constructs Listener to get content of PowerPoint file.
39 */
40 public PowerPointExtractor(Appendable builder) {
41 this.builder = builder;
42 }
43
44 /**
45 * Reads the internal PowerPoint document stream.
46 */
47 public void extract(InputStream dis) {
48 try {
49 final byte pptdata[] = new byte[dis.available()];
50 dis.read(pptdata, 0, dis.available());
51 int offset = 0;
52 long offsetPD = 0;
53
54 /*
55 * Traverse Bytearray to get CurrentUserEditAtom Call to extract the
56 * Text in all PlaceHolders to hold PPTClientTextBox objects for
57 * mapping into Slide Objects
58 */
59 Hashtable/* <Long, TextBox> */containerTextBox = new Hashtable/*
60 * <Long,
61 * TextBox>
62 */();
63 // Traverse ByteArray to identiy edit paths of ClientTextBoxes
64 long n = pptdata.length - 20;
65 for (long i = 0; i < n; i++) {
66
67 final long type = LittleEndian.getUShort(pptdata, (int) i + 2);
68 // final long size = LittleEndian.getUInt(pptdata, (int) i + 4);
69
70 if (PPTConstants.PPT_ATOM_USEREDIT == type) {
71 /*
72 * Checking the Record Header (UserEditAtom)
73 */
74 // final long lastSlideID = LittleEndian.getInt(pptdata,
75 // (int) i + 8);
76 // final long version = LittleEndian.getUInt(pptdata, (int)
77 // i + 12);
78 offset = (int) LittleEndian.getUInt(pptdata, (int) i + 16);
79 offsetPD = LittleEndian.getUInt(pptdata, (int) i + 20);
80
81 /*
82 * Call to extract ClientTextBox text in each UserEditAtom
83 */
84 containerTextBox = extractTextBoxes(containerTextBox,
85 offset, pptdata, offsetPD);
86 } else if (PPTConstants.PPT_ATOM_DRAWINGGROUP == type) {
87 // if (LOG.isTraceEnabled()) {
88 // LOG.trace("PPT_DRAWINGGROUP_ATOM ignored: " + type);
89 // }
90 } else if (PPTConstants.PPT_ATOM_TEXTBYTE == type) {
91 // if (LOG.isTraceEnabled()) {
92 // LOG.trace("PPT_TEXTBYTE_ATOM ignored: " + type);
93 // }
94 } else if (PPTConstants.PPT_ATOM_TEXTCHAR == type) {
95 // if (LOG.isTraceEnabled()) {
96 // LOG.trace("PPT_TEXTCHAR_ATOM ignored: " + type);
97 // }
98 } else {
99 // no action
100 // if (LOG.isTraceEnabled()) {
101 // LOG.trace("type not handled: " + type);
102 // }
103 }
104 }
105
106 final List/* <PPTSlide> */slides = extractSlides(offset, pptdata,
107 offsetPD);
108
109 if (slides.size() == 0) {
110 if (LOG.isInfoEnabled()) {
111 LOG.info("No slides extracted!");
112 }
113
114 } else {
115 Slide slide = (Slide) slides.get(slides.size() - 1);
116
117 for (Enumeration enumeration = containerTextBox.elements(); enumeration
118 .hasMoreElements();) {
119 final TextBox textBox = (TextBox) enumeration.nextElement();
120 slide.addContent(textBox.getContent());
121 }
122
123 /*
124 * Merging TextBox data with Slide Data Printing the text from
125 * Slides vector object.
126 */
127 List scontent;
128 for (int i = 0; i < slides.size(); i++) {
129 slide = (Slide) slides.get(i);
130 scontent = slide.getContent();
131 String contentText;
132
133 for (int j = 0; j < scontent.size(); j++) {
134 contentText = scontent.get(j).toString();
135 builder.append(contentText);
136
137 // to avoid concatinated words we add a blank additional
138 if (contentText.length() > 0
139 && !(contentText.endsWith("\r") || contentText
140 .endsWith("\n"))) {
141 builder.append(" ");
142 }
143 }
144 }
145 }
146 } catch (Throwable ex) {
147 // because of not killing complete crawling all Throwables are
148 // catched.
149
150 LOG.error("processPOIFSReaderEvent", ex);
151 }
152 }
153
154 /**
155 * Extracts the client text boxes of a slide.
156 *
157 * @param containerTextBox
158 * @param offset
159 * @param pptdata
160 * @param offsetPD
161 * @return Hashtable
162 * @see TextBox
163 */
164 protected Hashtable/* <Long, TextBox> */extractTextBoxes(
165 final Hashtable/* <Long, TextBox> */containerTextBox,
166 final int offset, final byte[] pptdata, final long offsetPD) {
167
168 // To hold temporary data
169 FilteredStringWriter outStream = new FilteredStringWriter();
170
171 TextBox textBox;
172
173 // Traversing the bytearray up to Presist directory position
174 for (int i = offset; i < offsetPD - 20; i++) {
175 try {
176 // Record info
177 // final long rinfo = LittleEndian.getUShort(pptdata, (int) i);
178 // Record Type
179 final long recordType = LittleEndian.getUShort(pptdata, i + 2);
180 // Record Size
181 final long recordSize = LittleEndian.getUInt(pptdata, i + 4);
182
183 if (recordType == PPTConstants.PPT_ATOM_DRAWINGGROUP) {
184 /*
185 * Record type is of Drawing Group
186 */
187
188 // Total number of objects
189 // final long objectCount = LittleEndian.getUInt(pptdata,
190 // (int) i +
191 // 8);
192 // currentID = Group ID+number of objects
193 long currentID = LittleEndian.getInt(pptdata, i + 12);
194 currentID = ((int) (currentID / 1024)) * 1024;
195
196 if (currentID == PPTConstants.PPT_MASTERSLIDE) {
197 // Ignore Master Slide objects
198 if (LOG.isTraceEnabled()) {
199 LOG.trace("Ignore master slide.");
200 }
201 i++;
202 continue;
203 }
204
205 // Check for the ClientTextBox GroupID existence
206 if (containerTextBox.containsKey(new Long(currentID))) {
207 // If exists get Client Textbox Group
208 textBox = (TextBox) containerTextBox.get(new Long(
209 currentID));
210 textBox.setContent("");
211
212 } else {
213 textBox = new TextBox(currentID);
214 containerTextBox.put(new Long(currentID), textBox);
215 }
216
217 /*
218 * Iterating the bytearray for TextCharAtoms and
219 * TextBytesAtom
220 */
221 if ((offsetPD - 20) != recordSize) {
222 // TODO something wrong? Probably an OLE-Object, which
223 // we ignore.
224 if (LOG.isDebugEnabled()) {
225 LOG.debug("offsetPD - 20=" + (offsetPD - 20)
226 + " recordsize=" + recordSize);
227 }
228 } else {
229 for (int startPos = i + 8; startPos < offsetPD - 20
230 && startPos < recordSize; startPos++) { // &&
231 // startPos
232 // <
233 // recordSize??
234 try {
235
236 // Record info
237 // final long nrinfo =
238 // LittleEndian.getUShort(pptdata, (int) j);
239
240 // Record Type
241 final long ntype = LittleEndian.getUShort(
242 pptdata, startPos + 2);
243
244 // Record size
245 // Note that the size doesn't include the 8 byte
246 // atom header
247 final long nsize = LittleEndian.getUInt(
248 pptdata, startPos + 4);
249
250 if (ntype == PPTConstants.PPT_ATOM_DRAWINGGROUP) {
251 /*
252 * Break the loop if next GroupID found
253 */
254 i = startPos - 1;
255 break;
256 } else if (ntype == PPTConstants.PPT_ATOM_TEXTBYTE) {
257 // TextByteAtom record
258 outStream = new FilteredStringWriter();
259 long ii = 0;
260 for (ii = startPos + 6; ii <= startPos + 6
261 + nsize; ii++) {
262 // For loop to changed to a function
263 // if ((ii + 2) >= pptdata.length)
264 // break; // FIXME
265 outStream
266 .write((char) (pptdata[(int) ii + 2]));
267 }
268
269 // Setting the identified text for Current
270 // groupID
271 textBox.setContent(textBox.getContent()
272 + outStream.toString());
273
274 } else if (ntype == PPTConstants.PPT_ATOM_TEXTCHAR) {
275 // TextCharAtom record
276
277 final String strTempContent = new String(
278 pptdata, startPos + 6,
279 (int) (nsize) + 2);
280 final byte bytes[] = strTempContent
281 .getBytes();
282 if (true) {
283 outStream = new FilteredStringWriter();
284 for (int ii = 0; ii < bytes.length - 1; ii += 2) {
285 // For loop to changed to a function
286 outStream
287 .write((char) (pptdata[ii + 2]));
288 }
289 textBox.setContent(textBox.getContent()
290 + outStream.toString());
291 } else {
292 // this version is used within POI
293 String text = StringUtil
294 .getFromCompressedUnicode(
295 bytes, 0, bytes.length);
296 textBox.setContent(textBox.getContent()
297 + text);
298 }
299
300 } else {
301 // ignored
302 // if (LOG.isTraceEnabled()) {
303 // LOG.trace("Ignored atom type: " + type);
304 // }
305 }
306 } catch (Throwable e) {
307
308 LOG.error("extractTextBoxes", e);
309
310 break;
311 }
312 }
313 }
314 } else {
315 // Record type is ignored
316 // if (LOG.isTraceEnabled()) {
317 // LOG.trace("Ignored record type: " + type);
318 // }
319 }
320 } catch (Throwable ee) {
321
322 LOG.error("extractClientTextBoxes", ee);
323
324 break;
325 }
326 }
327 return containerTextBox;
328 }
329
330 /**
331 * Returns the Powerpoint <code>Slide</code> s of document as vector.
332 *
333 * @param offset
334 * @param pptdata
335 * @param offsetPD
336 * @return Vector of the powerpoint slides. Contains
337 * <code>{@link Slide Slide}</code>
338 * @see Slide
339 */
340 protected List /* <Slide> */extractSlides(final long offset,
341 final byte[] pptdata, final long offsetPD) {
342
343 int sNum = 0;
344
345 // List of all slides found
346 final List/* <Slide> */slides = new Vector/* <Slide> */();
347
348 // current slide data
349 Slide currentSlide = null;
350
351 // To store data found in TextCharAtoms and TextBytesAtoms
352 FilteredStringWriter outStream;
353
354 for (long i = offset; i < pptdata.length - 20; i++) {
355
356 final long recordInfo = LittleEndian.getUShort(pptdata, (int) i);
357 final long atomType = LittleEndian.getUShort(pptdata, (int) i + 2);
358 final long atomSize = LittleEndian.getUInt(pptdata, (int) i + 4);
359
360 if (atomType == PPTConstants.PPT_ATOM_TEXTBYTE) {
361 /*
362 * TextByteAtom record
363 */
364 outStream = new FilteredStringWriter();
365
366 for (long ii = i + 6; (ii <= i + 6 + atomSize)
367 && (ii + 2 < pptdata.length); ii++) {
368 try {
369 // if(ii+2 >= pptdata.length) break; //FIXME
370 byte value = pptdata[(int) ii + 2];
371 outStream.write(value);
372 } catch (ArrayIndexOutOfBoundsException ex) {
373 if (LOG.isTraceEnabled()) {
374 LOG.trace("size=" + pptdata.length);
375 }
376
377 LOG.error("extractSlides", ex);
378
379 }
380 }
381
382 // Setting the identified text for Current Slide
383 if (currentSlide != null) {
384 currentSlide.addContent(outStream.toString());
385 }
386
387 } else if (atomType == PPTConstants.PPT_ATOM_TEXTCHAR) {
388 /*
389 * TextCharAtom record
390 */
391 outStream = new FilteredStringWriter();
392 final String strTempContent = new String(pptdata, (int) i + 6,
393 (int) (atomSize) + 2);
394 final byte bytes[] = strTempContent.getBytes();
395
396 for (int ii = 0; ii < bytes.length - 1; ii += 2) {
397 outStream.write(Utils.getUnicodeCharacter(bytes, ii));
398 }
399
400 // Setting the identified text for Current Slide
401 if (currentSlide != null) {
402 currentSlide.addContent(outStream.toString());
403 }
404
405 } else if (atomType == PPTConstants.PPT_ATOM_SLIDEPERSISTANT) {
406 /*
407 * SlidePresistAtom Record
408 */
409 if (sNum != 0) {
410 outStream = new FilteredStringWriter();
411
412 final long slideID = LittleEndian.getUInt(pptdata,
413 (int) i + 20);
414
415 currentSlide = new Slide(slideID);
416 // currentSlide.addContent(outStream.toString());
417 slides.add(currentSlide);
418 }
419 sNum++;
420 } else if (atomType == PPTConstants.PPT_ATOM_DRAWINGGROUP) {
421 /*
422 * Diagram records are ignored
423 */
424 if (LOG.isTraceEnabled()) {
425 LOG.trace("Drawing Groups are ignored.");
426 }
427 break;
428 } else {
429 // ignored
430 // if (LOG.isTraceEnabled()) {
431 // LOG.trace("Unhandled atomType: " + atomType);
432 // }
433 }
434 }
435
436 return slides;
437 }
438
439 }