View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.parser.pdf;
18  
19  import java.io.IOException;
20  import java.io.InputStream;
21  import java.util.Calendar;
22  
23  import org.apache.tika.exception.TikaException;
24  import org.apache.tika.metadata.Metadata;
25  import org.apache.tika.parser.Parser;
26  
27  import org.pdfbox.pdmodel.PDDocument;
28  import org.pdfbox.pdmodel.PDDocumentInformation;
29  import org.xml.sax.ContentHandler;
30  import org.xml.sax.SAXException;
31  
32  /**
33   * PDF parser
34   */
35  public class PDFParser implements Parser {
36  
37      public void parse(
38              InputStream stream, ContentHandler handler, Metadata metadata)
39              throws IOException, SAXException, TikaException {
40          PDDocument pdfDocument = PDDocument.load(stream);
41          try {
42              if (pdfDocument.isEncrypted()) {
43                  try {
44                      pdfDocument.decrypt("");
45                  } catch (Exception e) {
46                      // Ignore
47                  }
48              }
49              metadata.add(Metadata.CONTENT_TYPE, "application/pdf");
50              extractMetadata(pdfDocument, metadata);
51              PDF2XHTML.process(pdfDocument, handler, metadata);
52          } finally {
53              pdfDocument.close();
54          }
55      }
56  
57      private void extractMetadata(PDDocument document, Metadata metadata)
58              throws TikaException {
59          PDDocumentInformation info = document.getDocumentInformation();
60          addMetadata(metadata, Metadata.TITLE, info.getTitle());
61          addMetadata(metadata, Metadata.AUTHOR, info.getAuthor());
62          addMetadata(metadata, Metadata.CREATOR, info.getCreator());
63          addMetadata(metadata, Metadata.KEYWORDS, info.getKeywords());
64          addMetadata(metadata, "producer", info.getProducer());
65          addMetadata(metadata, Metadata.SUBJECT, info.getSubject());
66          addMetadata(metadata, "trapped", info.getTrapped());
67          try {
68              addMetadata(metadata, "created", info.getCreationDate());
69          } catch (IOException e) {
70              // Invalid date format, just ignore
71          }
72          try {
73              Calendar modified = info.getModificationDate(); 
74              addMetadata(metadata, Metadata.LAST_MODIFIED, modified);
75          } catch (IOException e) {
76              // Invalid date format, just ignore
77          }
78      }
79  
80      private void addMetadata(Metadata metadata, String name, String value) {
81          if (value != null) {
82              metadata.add(name, value);
83          }
84      }
85  
86      private void addMetadata(Metadata metadata, String name, Calendar value) {
87          if (value != null) {
88              metadata.set(name, value.getTime().toString());
89          }
90      }
91  
92  }