1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.tika.parser.microsoft;
18
19 import java.io.IOException;
20 import java.util.Iterator;
21 import java.util.List;
22
23 import org.apache.poi.hwpf.model.CHPBinTable;
24 import org.apache.poi.hwpf.model.CHPX;
25 import org.apache.poi.hwpf.model.ComplexFileTable;
26 import org.apache.poi.hwpf.model.TextPiece;
27 import org.apache.poi.hwpf.model.TextPieceTable;
28 import org.apache.poi.hwpf.sprm.SprmIterator;
29 import org.apache.poi.hwpf.sprm.SprmOperation;
30 import org.apache.poi.poifs.filesystem.DocumentEntry;
31 import org.apache.poi.poifs.filesystem.DocumentInputStream;
32 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
33 import org.apache.poi.util.LittleEndian;
34 import org.apache.tika.exception.TikaException;
35
36
37
38
39 public class WordParser extends OfficeParser {
40
41 protected String getContentType() {
42 return "application/msword";
43 }
44
45
46
47
48
49
50 public void extractText(POIFSFileSystem fsys, Appendable appendable)
51 throws IOException, TikaException {
52
53 DocumentEntry headerProps =
54 (DocumentEntry) fsys.getRoot().getEntry("WordDocument");
55 DocumentInputStream din = fsys.createDocumentInputStream("WordDocument");
56 byte[] header = new byte[headerProps.getSize()];
57
58 din.read(header);
59 din.close();
60
61 int info = LittleEndian.getShort(header, 0xa);
62 if ((info & 0x4) != 0) {
63 throw new TikaException("Fast-saved files are unsupported");
64 }
65 if ((info & 0x100) != 0) {
66 throw new TikaException("This document is password protected");
67 }
68
69
70 int nFib = LittleEndian.getShort(header, 0x2);
71 switch (nFib) {
72 case 101:
73 case 102:
74 case 103:
75 case 104:
76
77 Word6Extractor oldExtractor = new Word6Extractor(appendable);
78 oldExtractor.extractText(header);
79 }
80
81
82 int complexOffset = LittleEndian.getInt(header, 0x1a2);
83
84
85
86 String tableName = null;
87 boolean useTable1 = (info & 0x200) != 0;
88 if (useTable1) {
89 tableName = "1Table";
90 } else {
91 tableName = "0Table";
92 }
93
94 DocumentEntry table = (DocumentEntry)fsys.getRoot().getEntry(tableName);
95 byte[] tableStream = new byte[table.getSize()];
96
97 din = fsys.createDocumentInputStream(tableName);
98
99 din.read(tableStream);
100 din.close();
101
102 int chpOffset = LittleEndian.getInt(header, 0xfa);
103 int chpSize = LittleEndian.getInt(header, 0xfe);
104 int fcMin = LittleEndian.getInt(header, 0x18);
105 CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin);
106
107
108 ComplexFileTable cft = new ComplexFileTable(header, tableStream, complexOffset, fcMin);
109 TextPieceTable tpt = cft.getTextPieceTable();
110 List textPieces = tpt.getTextPieces();
111
112
113 din = null;
114 fsys = null;
115 table = null;
116 headerProps = null;
117
118 List textRuns = cbt.getTextRuns();
119 Iterator runIt = textRuns.iterator();
120 Iterator textIt = textPieces.iterator();
121
122 TextPiece currentPiece = (TextPiece)textIt.next();
123 int currentTextStart = currentPiece.getStart();
124 int currentTextEnd = currentPiece.getEnd();
125
126 WordTextBuffer finalTextBuf = new WordTextBuffer(appendable);
127
128
129
130 while (runIt.hasNext()) {
131 CHPX chpx = (CHPX)runIt.next();
132 boolean deleted = isDeleted(chpx.getGrpprl());
133 if (deleted) {
134 continue;
135 }
136
137 int runStart = chpx.getStart();
138 int runEnd = chpx.getEnd();
139
140 while (runStart >= currentTextEnd) {
141 currentPiece = (TextPiece) textIt.next ();
142 currentTextStart = currentPiece.getStart ();
143 currentTextEnd = currentPiece.getEnd ();
144 }
145
146 if (runEnd < currentTextEnd) {
147 String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
148 finalTextBuf.append(str);
149 } else if (runEnd > currentTextEnd) {
150 while (runEnd > currentTextEnd) {
151 String str = currentPiece.substring(runStart - currentTextStart,
152 currentTextEnd - currentTextStart);
153 finalTextBuf.append(str);
154 if (textIt.hasNext()) {
155 currentPiece = (TextPiece) textIt.next ();
156 currentTextStart = currentPiece.getStart ();
157 runStart = currentTextStart;
158 currentTextEnd = currentPiece.getEnd ();
159 } else {
160 return;
161 }
162 }
163 String str = currentPiece.substring(0, runEnd - currentTextStart);
164 finalTextBuf.append(str);
165 } else {
166 String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
167 if (textIt.hasNext()) {
168 currentPiece = (TextPiece) textIt.next();
169 currentTextStart = currentPiece.getStart();
170 currentTextEnd = currentPiece.getEnd();
171 }
172 finalTextBuf.append(str);
173 }
174 }
175 }
176
177
178
179
180
181
182
183 private boolean isDeleted(byte[] grpprl) {
184 SprmIterator iterator = new SprmIterator(grpprl,0);
185 while (iterator.hasNext()) {
186 SprmOperation op = iterator.next();
187
188 if (op.getOperation() == 0 && op.getOperand() != 0) {
189 return true;
190 }
191 }
192 return false;
193 }
194
195 }