1 package us.codecraft.webmagic.selector;
2
3 import java.util.*;
4 import java.util.concurrent.ConcurrentHashMap;
5
6 import javax.xml.namespace.NamespaceContext;
7 import javax.xml.parsers.ParserConfigurationException;
8 import javax.xml.xpath.XPathConstants;
9 import javax.xml.xpath.XPathExpression;
10 import javax.xml.xpath.XPathExpressionException;
11
12 import org.htmlcleaner.CleanerProperties;
13 import org.htmlcleaner.DomSerializer;
14 import org.htmlcleaner.HtmlCleaner;
15 import org.htmlcleaner.TagNode;
16 import org.slf4j.Logger;
17 import org.slf4j.LoggerFactory;
18 import org.w3c.dom.Document;
19 import org.w3c.dom.Node;
20 import org.w3c.dom.NodeList;
21
22 import net.sf.saxon.lib.NamespaceConstant;
23 import net.sf.saxon.xpath.XPathEvaluator;
24 import us.codecraft.webmagic.utils.BaseSelectorUtils;
25
26 import static us.codecraft.webmagic.selector.JaxpSelectorUtils.*;
27
28
29
30
31
32
33
34
35 public class Xpath2Selector implements Selector, NodeSelector {
36
37 private final String xpathStr;
38
39 private XPathExpression xPathExpression;
40
41 private final Logger logger = LoggerFactory.getLogger(getClass());
42
43 public Xpath2Selector(String xpathStr) {
44 this.xpathStr = xpathStr;
45 try {
46 init();
47 } catch (XPathExpressionException e) {
48 throw new IllegalArgumentException("XPath error!", e);
49 }
50 }
51
52 public static Xpath2Selector newInstance(String xpathStr) {
53 return new Xpath2Selector(xpathStr);
54 }
55
56 enum XPath2NamespaceContext implements NamespaceContext {
57
58 INSTANCE;
59
60 private final Map<String, String> prefix2NamespaceMap = new ConcurrentHashMap<>();
61
62 private final Map<String, List<String>> namespace2PrefixMap = new ConcurrentHashMap<>();
63
64 private void put(String prefix, String namespaceURI) {
65 prefix2NamespaceMap.put(prefix, namespaceURI);
66 List<String> prefixes = namespace2PrefixMap.computeIfAbsent(namespaceURI, k -> new ArrayList<>());
67 prefixes.add(prefix);
68 }
69
70 XPath2NamespaceContext() {
71 put("fn", NamespaceConstant.FN);
72 put("xslt", NamespaceConstant.XSLT);
73 put("xhtml", NamespaceConstant.XHTML);
74 }
75
76 @Override
77 public String getNamespaceURI(String prefix) {
78 return prefix2NamespaceMap.get(prefix);
79 }
80
81 @Override
82 public String getPrefix(String namespaceURI) {
83 List<String> prefixes = namespace2PrefixMap.get(namespaceURI);
84 if (prefixes == null || prefixes.size() < 1) {
85 return null;
86 }
87 return prefixes.get(0);
88 }
89
90 @Override
91 public Iterator getPrefixes(String namespaceURI) {
92 List<String> prefixes = namespace2PrefixMap.get(namespaceURI);
93 if (prefixes == null || prefixes.size() < 1) {
94 return null;
95 }
96 return prefixes.iterator();
97 }
98 }
99
100 private void init() throws XPathExpressionException {
101 XPathEvaluator xPathEvaluator = new XPathEvaluator();
102 xPathEvaluator.setNamespaceContext(XPath2NamespaceContext.INSTANCE);
103 xPathExpression = xPathEvaluator.compile(xpathStr);
104 }
105
106 @Override
107 public String select(String text) {
108 try {
109 Document doc = parse(text);
110 return select(doc);
111 } catch (Exception e) {
112 logger.error("select text error! " + xpathStr, e);
113 }
114 return null;
115 }
116
117 @Override
118 public String select(Node node) {
119 try {
120 return (String) xPathExpression.evaluate(node, XPathConstants.STRING);
121 } catch (Exception e) {
122 logger.error("select text error! " + xpathStr, e);
123 }
124 return null;
125 }
126
127 @Override
128 public List<String> selectList(String text) {
129 try {
130 Document doc = parse(text);
131 return selectList(doc);
132 } catch (Exception e) {
133 logger.error("select text error! " + xpathStr, e);
134 }
135 return null;
136 }
137
138 @Override
139 public List<String> selectList(Node node) {
140 try {
141 NodeList result = (NodeList) xPathExpression.evaluate(node, XPathConstants.NODESET);
142 List<Node> nodes = NodeListToArrayList(result);
143 return nodesToStrings(nodes);
144 } catch (Exception e) {
145 logger.error("select text error! " + xpathStr, e);
146 }
147 return null;
148 }
149
150 public Node selectNode(String text) {
151 try {
152 Document doc = parse(text);
153 return selectNode(doc);
154 } catch (Exception e) {
155 logger.error("select text error! " + xpathStr, e);
156 }
157 return null;
158 }
159
160 public Node selectNode(Node node) {
161 try {
162 return (Node) xPathExpression.evaluate(node, XPathConstants.NODE);
163 } catch (Exception e) {
164 logger.error("select text error! " + xpathStr, e);
165 }
166 return null;
167 }
168
169 public List<Node> selectNodes(String text) {
170 try {
171 Document doc = parse(text);
172 return selectNodes(doc);
173 } catch (Exception e) {
174 logger.error("select text error! " + xpathStr, e);
175 }
176 return null;
177 }
178
179 public List<Node> selectNodes(Node node) {
180 try {
181 NodeList result = (NodeList) xPathExpression.evaluate(node, XPathConstants.NODESET);
182 return NodeListToArrayList(result);
183 } catch (Exception e) {
184 logger.error("select text error! " + xpathStr, e);
185 }
186 return null;
187 }
188
189 protected static Document parse(String text) throws ParserConfigurationException {
190
191 text = BaseSelectorUtils.preParse(text);
192 HtmlCleaner htmlCleaner = new HtmlCleaner();
193 TagNode tagNode = htmlCleaner.clean(text);
194 return new DomSerializer(new CleanerProperties()).createDOM(tagNode);
195 }
196
197 }