1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.tika.parser.xml;
18
19 import java.io.IOException;
20 import java.io.InputStream;
21 import java.util.ArrayList;
22 import java.util.Iterator;
23 import java.util.List;
24
25 import org.apache.tika.exception.TikaException;
26 import org.apache.tika.metadata.Metadata;
27 import org.apache.tika.parser.Parser;
28 import org.apache.tika.sax.AppendableAdaptor;
29 import org.apache.tika.sax.XHTMLContentHandler;
30 import org.apache.tika.utils.Utils;
31
32 import org.apache.commons.lang.StringUtils;
33 import org.apache.log4j.Logger;
34 import org.jaxen.JaxenException;
35 import org.jaxen.SimpleNamespaceContext;
36 import org.jaxen.jdom.JDOMXPath;
37 import org.jdom.Attribute;
38 import org.jdom.Comment;
39 import org.jdom.Document;
40 import org.jdom.Element;
41 import org.jdom.EntityRef;
42 import org.jdom.Namespace;
43 import org.jdom.ProcessingInstruction;
44 import org.jdom.Text;
45 import org.xml.sax.ContentHandler;
46 import org.xml.sax.SAXException;
47
48
49
50
51 public class XMLParser implements Parser {
52
53 static Logger logger = Logger.getRootLogger();
54
55 public void parse(
56 InputStream stream, ContentHandler handler, Metadata metadata)
57 throws IOException, SAXException, TikaException {
58 Document xmlDoc = Utils.parse(stream);
59
60 extractContent(xmlDoc, Metadata.TITLE, "//dc:title", metadata);
61 extractContent(xmlDoc, Metadata.SUBJECT, "//dc:subject", metadata);
62 extractContent(xmlDoc, Metadata.CREATOR, "//dc:creator", metadata);
63 extractContent(xmlDoc, Metadata.DESCRIPTION, "//dc:description", metadata);
64 extractContent(xmlDoc, Metadata.PUBLISHER, "//dc:publisher", metadata);
65 extractContent(xmlDoc, Metadata.CONTRIBUTOR, "//dc:contributor", metadata);
66 extractContent(xmlDoc, Metadata.TYPE, "//dc:type", metadata);
67 extractContent(xmlDoc, Metadata.FORMAT, "//dc:format", metadata);
68 extractContent(xmlDoc, Metadata.IDENTIFIER, "//dc:identifier", metadata);
69 extractContent(xmlDoc, Metadata.LANGUAGE, "//dc:language", metadata);
70 extractContent(xmlDoc, Metadata.RIGHTS, "//dc:rights", metadata);
71
72 XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
73 xhtml.startDocument();
74 xhtml.startElement("p");
75 concatOccurrence(xmlDoc, "//*", " ", new AppendableAdaptor(xhtml));
76 xhtml.endElement("p");
77 xhtml.endDocument();
78 }
79
80 public void concatOccurrence(Object xmlDoc, String xpath, String concatSep, Appendable chaineConcat) throws IOException {
81
82 try {
83 JDOMXPath xp = new JDOMXPath(xpath);
84 List ls = xp.selectNodes(xmlDoc);
85 Iterator i = ls.iterator();
86 int j = 0;
87 while (i.hasNext()) {
88 j++;
89 String text = "";
90 Object obj = i.next();
91 if (obj instanceof Element) {
92 Element elem = (Element) obj;
93 text = elem.getText().trim();
94 } else if (obj instanceof Attribute) {
95 Attribute att = (Attribute) obj;
96 text = att.getValue().trim();
97 } else if (obj instanceof Text) {
98 Text txt = (Text) obj;
99 text = txt.getText().trim();
100 } else if (obj instanceof Comment) {
101 Comment com = (Comment) obj;
102 text = com.getText().trim();
103 } else if (obj instanceof ProcessingInstruction) {
104 ProcessingInstruction pi = (ProcessingInstruction) obj;
105 text = pi.getData().trim();
106 } else if (obj instanceof EntityRef) {
107 EntityRef er = (EntityRef) obj;
108 text = er.toString().trim();
109 }
110 if (StringUtils.isNotEmpty(text)) {
111 chaineConcat.append(text);
112 if (ls.size() == 1) {
113 return;
114 } else {
115 if (ls.size() != j) {
116 chaineConcat.append(' ')
117 .append(concatSep)
118 .append(' ');
119 }
120 }
121 }
122 }
123 } catch (JaxenException j) {
124 logger.error(j.getMessage());
125 }
126 }
127
128 public List getAllDocumentNs(org.jdom.Document doc) {
129 List ls = new ArrayList();
130 processChildren(doc.getRootElement(), ls);
131 return ls;
132 }
133
134 private boolean exist(List nsLs, String nsUri) {
135 if (nsLs.isEmpty())
136 return false;
137 for (Object nsL : nsLs) {
138 if (nsL.equals(nsUri)) {
139 return true;
140 }
141 }
142 return false;
143 }
144
145 private void processChildren(Element elem, List ns) {
146 Namespace nsCourent = elem.getNamespace();
147 String nsUri = (nsCourent.getURI());
148 if (!exist(ns, nsUri)) {
149 ns.add(nsUri.trim());
150 }
151 List additionalNs = elem.getAdditionalNamespaces();
152 if (!additionalNs.isEmpty())
153 copyNsList(additionalNs, ns);
154 if (elem.getChildren().size() > 0) {
155 List elemChildren = elem.getChildren();
156 for (Object anElemChildren : elemChildren) {
157 processChildren((Element) anElemChildren, ns);
158 }
159 }
160 }
161
162 private void copyNsList(List nsElem, List nsRes) {
163 for (Object aNsElem : nsElem) {
164 nsRes.add(((Namespace) aNsElem).getURI().trim());
165 }
166 }
167
168 public void extractContent(
169 Document xmlDoc, String name, String xpath, Metadata metadata) {
170 try {
171 JDOMXPath xp = new JDOMXPath(xpath);
172 SimpleNamespaceContext context = new SimpleNamespaceContext();
173 context.addNamespace("dc", "http://purl.org/dc/elements/1.1/");
174 context.addNamespace("meta", "urn:oasis:names:tc:opendocument:xmlns:meta:1.0");
175 xp.setNamespaceContext(context);
176 List selectNodes = xp.selectNodes(xmlDoc);
177 Iterator nodes = selectNodes.iterator();
178 while (nodes.hasNext()) {
179 Object node = nodes.next();
180 if (node instanceof Element) {
181 Element elem = (Element) node;
182 if (StringUtils.isNotBlank(elem.getText())) {
183 metadata.add(name, elem.getText().trim());
184 }
185 } else if (node instanceof Attribute) {
186 Attribute att = (Attribute) node;
187 metadata.add(name, att.getValue());
188 } else if (node instanceof Text) {
189 Text text = (Text) node;
190 metadata.add(name, text.getText());
191 } else if (node instanceof Comment) {
192 Comment com = (Comment) node;
193 metadata.add(name, com.getText());
194 } else if (node instanceof ProcessingInstruction) {
195 ProcessingInstruction pi = (ProcessingInstruction) node;
196 metadata.add(name, pi.getData());
197 } else if (node instanceof EntityRef) {
198 EntityRef er = (EntityRef) node;
199 metadata.add(name, er.toString());
200 }
201 }
202 } catch (JaxenException e) {
203 logger.error(e.getMessage());
204 }
205
206 }
207
208 }