View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.parser.microsoft;
18  
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.List;
import java.util.Vector;

import org.apache.log4j.Logger;
import org.apache.poi.hdf.extractor.Utils;
import org.apache.poi.util.LittleEndian;
import org.apache.poi.util.StringUtil;
29  
30  class PowerPointExtractor {
31  
32      static Logger LOG = Logger.getRootLogger();
33  
34      /** Buffer holding the content of the file */
35      private final Appendable builder;
36  
37      /**
38       * Constructs Listener to get content of PowerPoint file.
39       */
40      public PowerPointExtractor(Appendable builder) {
41          this.builder = builder;
42      }
43  
44      /**
45       * Reads the internal PowerPoint document stream.
46       */
47      public void extract(InputStream dis) {
48          try {
49              final byte pptdata[] = new byte[dis.available()];
50              dis.read(pptdata, 0, dis.available());
51              int offset = 0;
52              long offsetPD = 0;
53  
54              /*
55               * Traverse Bytearray to get CurrentUserEditAtom Call to extract the
56               * Text in all PlaceHolders to hold PPTClientTextBox objects for
57               * mapping into Slide Objects
58               */
59              Hashtable/* <Long, TextBox> */containerTextBox = new Hashtable/*
60                                                                               * <Long,
61                                                                               * TextBox>
62                                                                               */();
63              // Traverse ByteArray to identiy edit paths of ClientTextBoxes
64              long n = pptdata.length - 20;
65              for (long i = 0; i < n; i++) {
66  
67                  final long type = LittleEndian.getUShort(pptdata, (int) i + 2);
68                  // final long size = LittleEndian.getUInt(pptdata, (int) i + 4);
69  
70                  if (PPTConstants.PPT_ATOM_USEREDIT == type) {
71                      /*
72                       * Checking the Record Header (UserEditAtom)
73                       */
74                      // final long lastSlideID = LittleEndian.getInt(pptdata,
75                      // (int) i + 8);
76                      // final long version = LittleEndian.getUInt(pptdata, (int)
77                      // i + 12);
78                      offset = (int) LittleEndian.getUInt(pptdata, (int) i + 16);
79                      offsetPD = LittleEndian.getUInt(pptdata, (int) i + 20);
80  
81                      /*
82                       * Call to extract ClientTextBox text in each UserEditAtom
83                       */
84                      containerTextBox = extractTextBoxes(containerTextBox,
85                              offset, pptdata, offsetPD);
86                  } else if (PPTConstants.PPT_ATOM_DRAWINGGROUP == type) {
87                      // if (LOG.isTraceEnabled()) {
88                      // LOG.trace("PPT_DRAWINGGROUP_ATOM ignored: " + type);
89                      // }
90                  } else if (PPTConstants.PPT_ATOM_TEXTBYTE == type) {
91                      // if (LOG.isTraceEnabled()) {
92                      // LOG.trace("PPT_TEXTBYTE_ATOM ignored: " + type);
93                      // }
94                  } else if (PPTConstants.PPT_ATOM_TEXTCHAR == type) {
95                      // if (LOG.isTraceEnabled()) {
96                      // LOG.trace("PPT_TEXTCHAR_ATOM ignored: " + type);
97                      // }
98                  } else {
99                      // no action
100                     // if (LOG.isTraceEnabled()) {
101                     // LOG.trace("type not handled: " + type);
102                     // }
103                 }
104             }
105 
106             final List/* <PPTSlide> */slides = extractSlides(offset, pptdata,
107                     offsetPD);
108 
109             if (slides.size() == 0) {
110                 if (LOG.isInfoEnabled()) {
111                     LOG.info("No slides extracted!");
112                 }
113 
114             } else {
115                 Slide slide = (Slide) slides.get(slides.size() - 1);
116 
117                 for (Enumeration enumeration = containerTextBox.elements(); enumeration
118                         .hasMoreElements();) {
119                     final TextBox textBox = (TextBox) enumeration.nextElement();
120                     slide.addContent(textBox.getContent());
121                 }
122 
123                 /*
124                  * Merging TextBox data with Slide Data Printing the text from
125                  * Slides vector object.
126                  */
127                 List scontent;
128                 for (int i = 0; i < slides.size(); i++) {
129                     slide = (Slide) slides.get(i);
130                     scontent = slide.getContent();
131                     String contentText;
132 
133                     for (int j = 0; j < scontent.size(); j++) {
134                         contentText = scontent.get(j).toString();
135                         builder.append(contentText);
136 
137                         // to avoid concatinated words we add a blank additional
138                         if (contentText.length() > 0
139                                 && !(contentText.endsWith("\r") || contentText
140                                         .endsWith("\n"))) {
141                             builder.append(" ");
142                         }
143                     }
144                 }
145             }
146         } catch (Throwable ex) {
147             // because of not killing complete crawling all Throwables are
148             // catched.
149 
150             LOG.error("processPOIFSReaderEvent", ex);
151         }
152     }
153 
154     /**
155      * Extracts the client text boxes of a slide.
156      * 
157      * @param containerTextBox
158      * @param offset
159      * @param pptdata
160      * @param offsetPD
161      * @return Hashtable
162      * @see TextBox
163      */
164     protected Hashtable/* <Long, TextBox> */extractTextBoxes(
165             final Hashtable/* <Long, TextBox> */containerTextBox,
166             final int offset, final byte[] pptdata, final long offsetPD) {
167 
168         // To hold temporary data
169         FilteredStringWriter outStream = new FilteredStringWriter();
170 
171         TextBox textBox;
172 
173         // Traversing the bytearray up to Presist directory position
174         for (int i = offset; i < offsetPD - 20; i++) {
175             try {
176                 // Record info
177                 // final long rinfo = LittleEndian.getUShort(pptdata, (int) i);
178                 // Record Type
179                 final long recordType = LittleEndian.getUShort(pptdata, i + 2);
180                 // Record Size
181                 final long recordSize = LittleEndian.getUInt(pptdata, i + 4);
182 
183                 if (recordType == PPTConstants.PPT_ATOM_DRAWINGGROUP) {
184                     /*
185                      * Record type is of Drawing Group
186                      */
187 
188                     // Total number of objects
189                     // final long objectCount = LittleEndian.getUInt(pptdata,
190                     // (int) i +
191                     // 8);
192                     // currentID = Group ID+number of objects
193                     long currentID = LittleEndian.getInt(pptdata, i + 12);
194                     currentID = ((int) (currentID / 1024)) * 1024;
195 
196                     if (currentID == PPTConstants.PPT_MASTERSLIDE) {
197                         // Ignore Master Slide objects
198                         if (LOG.isTraceEnabled()) {
199                             LOG.trace("Ignore master slide.");
200                         }
201                         i++;
202                         continue;
203                     }
204 
205                     // Check for the ClientTextBox GroupID existence
206                     if (containerTextBox.containsKey(new Long(currentID))) {
207                         // If exists get Client Textbox Group
208                         textBox = (TextBox) containerTextBox.get(new Long(
209                                 currentID));
210                         textBox.setContent("");
211 
212                     } else {
213                         textBox = new TextBox(currentID);
214                         containerTextBox.put(new Long(currentID), textBox);
215                     }
216 
217                     /*
218                      * Iterating the bytearray for TextCharAtoms and
219                      * TextBytesAtom
220                      */
221                     if ((offsetPD - 20) != recordSize) {
222                         // TODO something wrong? Probably an OLE-Object, which
223                         // we ignore.
224                         if (LOG.isDebugEnabled()) {
225                             LOG.debug("offsetPD - 20=" + (offsetPD - 20)
226                                     + " recordsize=" + recordSize);
227                         }
228                     } else {
229                         for (int startPos = i + 8; startPos < offsetPD - 20
230                                 && startPos < recordSize; startPos++) { // &&
231                             // startPos
232                             // <
233                             // recordSize??
234                             try {
235 
236                                 // Record info
237                                 // final long nrinfo =
238                                 // LittleEndian.getUShort(pptdata, (int) j);
239 
240                                 // Record Type
241                                 final long ntype = LittleEndian.getUShort(
242                                         pptdata, startPos + 2);
243 
244                                 // Record size
245                                 // Note that the size doesn't include the 8 byte
246                                 // atom header
247                                 final long nsize = LittleEndian.getUInt(
248                                         pptdata, startPos + 4);
249 
250                                 if (ntype == PPTConstants.PPT_ATOM_DRAWINGGROUP) {
251                                     /*
252                                      * Break the loop if next GroupID found
253                                      */
254                                     i = startPos - 1;
255                                     break;
256                                 } else if (ntype == PPTConstants.PPT_ATOM_TEXTBYTE) {
257                                     // TextByteAtom record
258                                     outStream = new FilteredStringWriter();
259                                     long ii = 0;
260                                     for (ii = startPos + 6; ii <= startPos + 6
261                                             + nsize; ii++) {
262                                         // For loop to changed to a function
263                                         // if ((ii + 2) >= pptdata.length)
264                                         // break; // FIXME
265                                         outStream
266                                                 .write((char) (pptdata[(int) ii + 2]));
267                                     }
268 
269                                     // Setting the identified text for Current
270                                     // groupID
271                                     textBox.setContent(textBox.getContent()
272                                             + outStream.toString());
273 
274                                 } else if (ntype == PPTConstants.PPT_ATOM_TEXTCHAR) {
275                                     // TextCharAtom record
276 
277                                     final String strTempContent = new String(
278                                             pptdata, startPos + 6,
279                                             (int) (nsize) + 2);
280                                     final byte bytes[] = strTempContent
281                                             .getBytes();
282                                     if (true) {
283                                         outStream = new FilteredStringWriter();
284                                         for (int ii = 0; ii < bytes.length - 1; ii += 2) {
285                                             // For loop to changed to a function
286                                             outStream
287                                                     .write((char) (pptdata[ii + 2]));
288                                         }
289                                         textBox.setContent(textBox.getContent()
290                                                 + outStream.toString());
291                                     } else {
292                                         // this version is used within POI
293                                         String text = StringUtil
294                                                 .getFromCompressedUnicode(
295                                                         bytes, 0, bytes.length);
296                                         textBox.setContent(textBox.getContent()
297                                                 + text);
298                                     }
299 
300                                 } else {
301                                     // ignored
302                                     // if (LOG.isTraceEnabled()) {
303                                     // LOG.trace("Ignored atom type: " + type);
304                                     // }
305                                 }
306                             } catch (Throwable e) {
307 
308                                 LOG.error("extractTextBoxes", e);
309 
310                                 break;
311                             }
312                         }
313                     }
314                 } else {
315                     // Record type is ignored
316                     // if (LOG.isTraceEnabled()) {
317                     // LOG.trace("Ignored record type: " + type);
318                     // }
319                 }
320             } catch (Throwable ee) {
321 
322                 LOG.error("extractClientTextBoxes", ee);
323 
324                 break;
325             }
326         }
327         return containerTextBox;
328     }
329 
330     /**
331      * Returns the Powerpoint <code>Slide</code> s of document as vector.
332      * 
333      * @param offset
334      * @param pptdata
335      * @param offsetPD
336      * @return Vector of the powerpoint slides. Contains
337      *         <code>{@link Slide Slide}</code>
338      * @see Slide
339      */
340     protected List /* <Slide> */extractSlides(final long offset,
341             final byte[] pptdata, final long offsetPD) {
342 
343         int sNum = 0;
344 
345         // List of all slides found
346         final List/* <Slide> */slides = new Vector/* <Slide> */();
347 
348         // current slide data
349         Slide currentSlide = null;
350 
351         // To store data found in TextCharAtoms and TextBytesAtoms
352         FilteredStringWriter outStream;
353 
354         for (long i = offset; i < pptdata.length - 20; i++) {
355 
356             final long recordInfo = LittleEndian.getUShort(pptdata, (int) i);
357             final long atomType = LittleEndian.getUShort(pptdata, (int) i + 2);
358             final long atomSize = LittleEndian.getUInt(pptdata, (int) i + 4);
359 
360             if (atomType == PPTConstants.PPT_ATOM_TEXTBYTE) {
361                 /*
362                  * TextByteAtom record
363                  */
364                 outStream = new FilteredStringWriter();
365 
366                 for (long ii = i + 6; (ii <= i + 6 + atomSize)
367                         && (ii + 2 < pptdata.length); ii++) {
368                     try {
369                         // if(ii+2 >= pptdata.length) break; //FIXME
370                         byte value = pptdata[(int) ii + 2];
371                         outStream.write(value);
372                     } catch (ArrayIndexOutOfBoundsException ex) {
373                         if (LOG.isTraceEnabled()) {
374                             LOG.trace("size=" + pptdata.length);
375                         }
376 
377                         LOG.error("extractSlides", ex);
378 
379                     }
380                 }
381 
382                 // Setting the identified text for Current Slide
383                 if (currentSlide != null) {
384                     currentSlide.addContent(outStream.toString());
385                 }
386 
387             } else if (atomType == PPTConstants.PPT_ATOM_TEXTCHAR) {
388                 /*
389                  * TextCharAtom record
390                  */
391                 outStream = new FilteredStringWriter();
392                 final String strTempContent = new String(pptdata, (int) i + 6,
393                         (int) (atomSize) + 2);
394                 final byte bytes[] = strTempContent.getBytes();
395 
396                 for (int ii = 0; ii < bytes.length - 1; ii += 2) {
397                     outStream.write(Utils.getUnicodeCharacter(bytes, ii));
398                 }
399 
400                 // Setting the identified text for Current Slide
401                 if (currentSlide != null) {
402                     currentSlide.addContent(outStream.toString());
403                 }
404 
405             } else if (atomType == PPTConstants.PPT_ATOM_SLIDEPERSISTANT) {
406                 /*
407                  * SlidePresistAtom Record
408                  */
409                 if (sNum != 0) {
410                     outStream = new FilteredStringWriter();
411 
412                     final long slideID = LittleEndian.getUInt(pptdata,
413                             (int) i + 20);
414 
415                     currentSlide = new Slide(slideID);
416                     // currentSlide.addContent(outStream.toString());
417                     slides.add(currentSlide);
418                 }
419                 sNum++;
420             } else if (atomType == PPTConstants.PPT_ATOM_DRAWINGGROUP) {
421                 /*
422                  * Diagram records are ignored
423                  */
424                 if (LOG.isTraceEnabled()) {
425                     LOG.trace("Drawing Groups are ignored.");
426                 }
427                 break;
428             } else {
429                 // ignored
430                 // if (LOG.isTraceEnabled()) {
431                 // LOG.trace("Unhandled atomType: " + atomType);
432                 // }
433             }
434         }
435 
436         return slides;
437     }
438 
439 }