View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.parser.xml;
18  
19  import java.io.IOException;
20  import java.io.InputStream;
21  import java.util.ArrayList;
22  import java.util.Iterator;
23  import java.util.List;
24  
25  import org.apache.tika.exception.TikaException;
26  import org.apache.tika.metadata.Metadata;
27  import org.apache.tika.parser.Parser;
28  import org.apache.tika.sax.AppendableAdaptor;
29  import org.apache.tika.sax.XHTMLContentHandler;
30  import org.apache.tika.utils.Utils;
31  
32  import org.apache.commons.lang.StringUtils;
33  import org.apache.log4j.Logger;
34  import org.jaxen.JaxenException;
35  import org.jaxen.SimpleNamespaceContext;
36  import org.jaxen.jdom.JDOMXPath;
37  import org.jdom.Attribute;
38  import org.jdom.Comment;
39  import org.jdom.Document;
40  import org.jdom.Element;
41  import org.jdom.EntityRef;
42  import org.jdom.Namespace;
43  import org.jdom.ProcessingInstruction;
44  import org.jdom.Text;
45  import org.xml.sax.ContentHandler;
46  import org.xml.sax.SAXException;
47  
48  /**
49   * XML parser
50   */
51  public class XMLParser implements Parser {
52  
53      static Logger logger = Logger.getRootLogger();
54  
55      public void parse(
56              InputStream stream, ContentHandler handler, Metadata metadata)
57              throws IOException, SAXException, TikaException {
58          Document xmlDoc = Utils.parse(stream);
59  
60          extractContent(xmlDoc, Metadata.TITLE, "//dc:title", metadata);
61          extractContent(xmlDoc, Metadata.SUBJECT, "//dc:subject", metadata);
62          extractContent(xmlDoc, Metadata.CREATOR, "//dc:creator", metadata);
63          extractContent(xmlDoc, Metadata.DESCRIPTION, "//dc:description", metadata);
64          extractContent(xmlDoc, Metadata.PUBLISHER, "//dc:publisher", metadata);
65          extractContent(xmlDoc, Metadata.CONTRIBUTOR, "//dc:contributor", metadata);
66          extractContent(xmlDoc, Metadata.TYPE, "//dc:type", metadata);
67          extractContent(xmlDoc, Metadata.FORMAT, "//dc:format", metadata);
68          extractContent(xmlDoc, Metadata.IDENTIFIER, "//dc:identifier", metadata);
69          extractContent(xmlDoc, Metadata.LANGUAGE, "//dc:language", metadata);
70          extractContent(xmlDoc, Metadata.RIGHTS, "//dc:rights", metadata);
71  
72          XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
73          xhtml.startDocument();
74          xhtml.startElement("p");
75          concatOccurrence(xmlDoc, "//*", " ", new AppendableAdaptor(xhtml));
76          xhtml.endElement("p");
77          xhtml.endDocument();
78      }
79  
80      public void concatOccurrence(Object xmlDoc, String xpath, String concatSep, Appendable chaineConcat) throws IOException {
81  
82          try {
83              JDOMXPath xp = new JDOMXPath(xpath);
84              List ls = xp.selectNodes(xmlDoc);
85              Iterator i = ls.iterator();
86              int j = 0;
87              while (i.hasNext()) {
88                  j++;
89                  String text = "";
90                  Object obj = i.next();
91                  if (obj instanceof Element) {
92                      Element elem = (Element) obj;
93                      text = elem.getText().trim();
94                  } else if (obj instanceof Attribute) {
95                      Attribute att = (Attribute) obj;
96                      text = att.getValue().trim();
97                  } else if (obj instanceof Text) {
98                      Text txt = (Text) obj;
99                      text = txt.getText().trim();
100                 } else if (obj instanceof Comment) {
101                     Comment com = (Comment) obj;
102                     text = com.getText().trim();
103                 } else if (obj instanceof ProcessingInstruction) {
104                     ProcessingInstruction pi = (ProcessingInstruction) obj;
105                     text = pi.getData().trim();
106                 } else if (obj instanceof EntityRef) {
107                     EntityRef er = (EntityRef) obj;
108                     text = er.toString().trim();
109                 }
110                 if (StringUtils.isNotEmpty(text)) {
111                     chaineConcat.append(text);
112                     if (ls.size() == 1) {
113                         return;
114                     } else {
115                         if (ls.size() != j) {
116                             chaineConcat.append(' ')
117                                     .append(concatSep)
118                                     .append(' ');
119                         }
120                     }
121                 }
122             }
123         } catch (JaxenException j) {
124             logger.error(j.getMessage());
125         }
126     }
127 
128     public List getAllDocumentNs(org.jdom.Document doc) {
129         List ls = new ArrayList();
130         processChildren(doc.getRootElement(), ls);
131         return ls;
132     }
133 
134     private boolean exist(List nsLs, String nsUri) {
135         if (nsLs.isEmpty())
136             return false;
137         for (Object nsL : nsLs) {
138             if (nsL.equals(nsUri)) {
139                 return true;
140             }
141         }
142         return false;
143     }
144 
145     private void processChildren(Element elem, List ns) {
146         Namespace nsCourent = elem.getNamespace();
147         String nsUri = (nsCourent.getURI());
148         if (!exist(ns, nsUri)) {
149             ns.add(nsUri.trim());
150         }
151         List additionalNs = elem.getAdditionalNamespaces();
152         if (!additionalNs.isEmpty())
153             copyNsList(additionalNs, ns);
154         if (elem.getChildren().size() > 0) {
155             List elemChildren = elem.getChildren();
156             for (Object anElemChildren : elemChildren) {
157                 processChildren((Element) anElemChildren, ns);
158             }
159         }
160     }
161 
162     private void copyNsList(List nsElem, List nsRes) {
163         for (Object aNsElem : nsElem) {
164             nsRes.add(((Namespace) aNsElem).getURI().trim());
165         }
166     }
167 
168     public void extractContent(
169             Document xmlDoc, String name, String xpath, Metadata metadata) {
170         try {
171             JDOMXPath xp = new JDOMXPath(xpath);
172             SimpleNamespaceContext context = new SimpleNamespaceContext();
173             context.addNamespace("dc", "http://purl.org/dc/elements/1.1/");
174             context.addNamespace("meta", "urn:oasis:names:tc:opendocument:xmlns:meta:1.0");
175             xp.setNamespaceContext(context);
176             List selectNodes = xp.selectNodes(xmlDoc);
177             Iterator nodes = selectNodes.iterator();
178             while (nodes.hasNext()) {
179                 Object node = nodes.next();
180                 if (node instanceof Element) {
181                     Element elem = (Element) node;
182                     if (StringUtils.isNotBlank(elem.getText())) {
183                         metadata.add(name, elem.getText().trim());
184                     }
185                 } else if (node instanceof Attribute) {
186                     Attribute att = (Attribute) node;
187                     metadata.add(name, att.getValue());
188                 } else if (node instanceof Text) {
189                     Text text = (Text) node;
190                     metadata.add(name, text.getText());
191                 } else if (node instanceof Comment) {
192                     Comment com = (Comment) node;
193                     metadata.add(name, com.getText());
194                 } else if (node instanceof ProcessingInstruction) {
195                     ProcessingInstruction pi = (ProcessingInstruction) node;
196                     metadata.add(name, pi.getData());
197                 } else if (node instanceof EntityRef) {
198                     EntityRef er = (EntityRef) node;
199                     metadata.add(name, er.toString());
200                 }
201             }
202         } catch (JaxenException e) {
203             logger.error(e.getMessage());
204         }
205 
206     }
207 
208 }