View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.parser.microsoft;
18  
19  import java.io.IOException;
20  import java.util.Iterator;
21  import java.util.List;
22  
23  import org.apache.poi.hwpf.model.CHPBinTable;
24  import org.apache.poi.hwpf.model.CHPX;
25  import org.apache.poi.hwpf.model.ComplexFileTable;
26  import org.apache.poi.hwpf.model.TextPiece;
27  import org.apache.poi.hwpf.model.TextPieceTable;
28  import org.apache.poi.hwpf.sprm.SprmIterator;
29  import org.apache.poi.hwpf.sprm.SprmOperation;
30  import org.apache.poi.poifs.filesystem.DocumentEntry;
31  import org.apache.poi.poifs.filesystem.DocumentInputStream;
32  import org.apache.poi.poifs.filesystem.POIFSFileSystem;
33  import org.apache.poi.util.LittleEndian;
34  import org.apache.tika.exception.TikaException;
35  
36  /**
37   * Word parser
38   */
39  public class WordParser extends OfficeParser {
40  
41      protected String getContentType() {
42          return "application/msword";
43      }
44  
45      /**
46       * Gets the text from a Word document.
47       *
48       * @param in The InputStream representing the Word file.
49       */
50      public void extractText(POIFSFileSystem fsys, Appendable appendable)
51              throws IOException, TikaException {
52          // load our POIFS document streams.
53          DocumentEntry headerProps =
54              (DocumentEntry) fsys.getRoot().getEntry("WordDocument");
55          DocumentInputStream din = fsys.createDocumentInputStream("WordDocument");
56          byte[] header = new byte[headerProps.getSize()];
57  
58          din.read(header);
59          din.close();
60  
61          int info = LittleEndian.getShort(header, 0xa);
62          if ((info & 0x4) != 0) {
63              throw new TikaException("Fast-saved files are unsupported");
64          }
65          if ((info & 0x100) != 0) {
66              throw new TikaException("This document is password protected");
67          }
68  
69          // determine the version of Word this document came from.
70          int nFib = LittleEndian.getShort(header, 0x2);
71          switch (nFib) {
72          case 101:
73          case 102:
74          case 103:
75          case 104:
76              // this is a Word 6.0 doc send it to the extractor for that version.
77              Word6Extractor oldExtractor = new Word6Extractor(appendable);
78              oldExtractor.extractText(header);
79          }
80  
81          //get the location of the piece table
82          int complexOffset = LittleEndian.getInt(header, 0x1a2);
83  
84          // determine which table stream we must use.
85          //Get the information we need from the header
86          String tableName = null;
87          boolean useTable1 = (info & 0x200) != 0;
88          if (useTable1) {
89              tableName = "1Table";
90          } else {
91              tableName = "0Table";
92          }
93  
94          DocumentEntry table = (DocumentEntry)fsys.getRoot().getEntry(tableName);
95          byte[] tableStream = new byte[table.getSize()];
96  
97          din = fsys.createDocumentInputStream(tableName);
98  
99          din.read(tableStream);
100         din.close();
101 
102         int chpOffset = LittleEndian.getInt(header, 0xfa);
103         int chpSize = LittleEndian.getInt(header, 0xfe);
104         int fcMin = LittleEndian.getInt(header, 0x18);
105         CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin);
106 
107         // load our text pieces and our character runs
108         ComplexFileTable cft = new ComplexFileTable(header, tableStream, complexOffset, fcMin);
109         TextPieceTable tpt = cft.getTextPieceTable();
110         List textPieces = tpt.getTextPieces();
111 
112         // make the POIFS objects available for garbage collection
113         din = null;
114         fsys = null;
115         table = null;
116         headerProps = null;
117 
118         List textRuns = cbt.getTextRuns();
119         Iterator runIt = textRuns.iterator();
120         Iterator textIt = textPieces.iterator();
121 
122         TextPiece currentPiece = (TextPiece)textIt.next();
123         int currentTextStart = currentPiece.getStart();
124         int currentTextEnd = currentPiece.getEnd();
125 
126         WordTextBuffer finalTextBuf = new WordTextBuffer(appendable);
127 
128         // iterate through all text runs extract the text only if they haven't been
129         // deleted
130         while (runIt.hasNext()) {
131             CHPX chpx = (CHPX)runIt.next();
132             boolean deleted = isDeleted(chpx.getGrpprl());
133             if (deleted) {
134                 continue;
135             }
136 
137             int runStart = chpx.getStart();
138             int runEnd = chpx.getEnd();
139 
140             while (runStart >= currentTextEnd) {
141                 currentPiece = (TextPiece) textIt.next ();
142                 currentTextStart = currentPiece.getStart ();
143                 currentTextEnd = currentPiece.getEnd ();
144             }
145 
146             if (runEnd < currentTextEnd) {
147                 String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
148                 finalTextBuf.append(str);
149             } else if (runEnd > currentTextEnd) {
150                 while (runEnd > currentTextEnd) {
151                     String str = currentPiece.substring(runStart - currentTextStart,
152                             currentTextEnd - currentTextStart);
153                     finalTextBuf.append(str);
154                     if (textIt.hasNext()) {
155                         currentPiece = (TextPiece) textIt.next ();
156                         currentTextStart = currentPiece.getStart ();
157                         runStart = currentTextStart;
158                         currentTextEnd = currentPiece.getEnd ();
159                     } else {
160                         return;
161                     }
162                 }
163                 String str = currentPiece.substring(0, runEnd - currentTextStart);
164                 finalTextBuf.append(str);
165             } else {
166                 String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
167                 if (textIt.hasNext()) {
168                     currentPiece = (TextPiece) textIt.next();
169                     currentTextStart = currentPiece.getStart();
170                     currentTextEnd = currentPiece.getEnd();
171                 }
172                 finalTextBuf.append(str);
173             }
174         }
175     }
176 
177     /**
178      * Used to determine if a run of text has been deleted.
179      *
180      * @param grpprl The list of sprms for a particular run of text.
181      * @return true if this run of text has been deleted.
182      */
183     private boolean isDeleted(byte[] grpprl) {
184         SprmIterator iterator = new SprmIterator(grpprl,0);
185         while (iterator.hasNext()) {
186             SprmOperation op = iterator.next();
187             // 0 is the operation that signals a FDelRMark operation
188             if (op.getOperation() == 0 && op.getOperand() != 0) {
189                 return true;
190             }
191         }
192         return false;
193     }
194 
195 }