View Javadoc
1   package us.codecraft.webmagic.selector;
2   
3   import java.util.*;
4   import java.util.concurrent.ConcurrentHashMap;
5   
6   import javax.xml.namespace.NamespaceContext;
7   import javax.xml.parsers.ParserConfigurationException;
8   import javax.xml.xpath.XPathConstants;
9   import javax.xml.xpath.XPathExpression;
10  import javax.xml.xpath.XPathExpressionException;
11  
12  import org.htmlcleaner.CleanerProperties;
13  import org.htmlcleaner.DomSerializer;
14  import org.htmlcleaner.HtmlCleaner;
15  import org.htmlcleaner.TagNode;
16  import org.slf4j.Logger;
17  import org.slf4j.LoggerFactory;
18  import org.w3c.dom.Document;
19  import org.w3c.dom.Node;
20  import org.w3c.dom.NodeList;
21  
22  import net.sf.saxon.lib.NamespaceConstant;
23  import net.sf.saxon.xpath.XPathEvaluator;
24  import us.codecraft.webmagic.utils.BaseSelectorUtils;
25  
26  import static us.codecraft.webmagic.selector.JaxpSelectorUtils.*;
27  
28  /**
29   * 支持xpath2.0的选择器。包装了HtmlCleaner和Saxon HE。<br>
30   *
31   * @author code4crafter@gmail.com, hooy <br>
32   * Date: 13-4-21
33   * Time: 上午9:39
34   */
35  public class Xpath2Selector implements Selector, NodeSelector {
36  
37      private final String xpathStr;
38  
39      private XPathExpression xPathExpression;
40  
41      private final Logger logger = LoggerFactory.getLogger(getClass());
42  
43      public Xpath2Selector(String xpathStr) {
44          this.xpathStr = xpathStr;
45          try {
46              init();
47          } catch (XPathExpressionException e) {
48              throw new IllegalArgumentException("XPath error!", e);
49          }
50      }
51  
52      public static Xpath2Selector newInstance(String xpathStr) {
53          return new Xpath2Selector(xpathStr);
54      }
55  
56      enum XPath2NamespaceContext implements NamespaceContext {
57  
58          INSTANCE;
59  
60          private final Map<String, String> prefix2NamespaceMap = new ConcurrentHashMap<>();
61  
62          private final Map<String, List<String>> namespace2PrefixMap = new ConcurrentHashMap<>();
63  
64          private void put(String prefix, String namespaceURI) {
65              prefix2NamespaceMap.put(prefix, namespaceURI);
66              List<String> prefixes = namespace2PrefixMap.computeIfAbsent(namespaceURI, k -> new ArrayList<>());
67              prefixes.add(prefix);
68          }
69  
70          XPath2NamespaceContext() {
71              put("fn", NamespaceConstant.FN);
72              put("xslt", NamespaceConstant.XSLT);
73              put("xhtml", NamespaceConstant.XHTML);
74          }
75  
76          @Override
77          public String getNamespaceURI(String prefix) {
78              return prefix2NamespaceMap.get(prefix);
79          }
80  
81          @Override
82          public String getPrefix(String namespaceURI) {
83              List<String> prefixes = namespace2PrefixMap.get(namespaceURI);
84              if (prefixes == null || prefixes.size() < 1) {
85                  return null;
86              }
87              return prefixes.get(0);
88          }
89  
90          @Override
91          public Iterator getPrefixes(String namespaceURI) {
92              List<String> prefixes = namespace2PrefixMap.get(namespaceURI);
93              if (prefixes == null || prefixes.size() < 1) {
94                  return null;
95              }
96              return prefixes.iterator();
97          }
98      }
99  
100     private void init() throws XPathExpressionException {
101         XPathEvaluator xPathEvaluator = new XPathEvaluator();
102         xPathEvaluator.setNamespaceContext(XPath2NamespaceContext.INSTANCE);
103         xPathExpression = xPathEvaluator.compile(xpathStr);
104     }
105 
106     @Override
107     public String select(String text) {
108         try {
109             Document doc = parse(text);
110             return select(doc);
111         } catch (Exception e) {
112             logger.error("select text error! " + xpathStr, e);
113         }
114         return null;
115     }
116 
117     @Override
118     public String select(Node node) {
119         try {
120             return (String) xPathExpression.evaluate(node, XPathConstants.STRING);
121         } catch (Exception e) {
122             logger.error("select text error! " + xpathStr, e);
123         }
124         return null;
125     }
126 
127     @Override
128     public List<String> selectList(String text) {
129         try {
130             Document doc = parse(text);
131             return selectList(doc);
132         } catch (Exception e) {
133             logger.error("select text error! " + xpathStr, e);
134         }
135         return null;
136     }
137 
138     @Override
139     public List<String> selectList(Node node) {
140         try {
141             NodeList result = (NodeList) xPathExpression.evaluate(node, XPathConstants.NODESET);
142             List<Node> nodes = NodeListToArrayList(result);
143             return nodesToStrings(nodes);
144         } catch (Exception e) {
145             logger.error("select text error! " + xpathStr, e);
146         }
147         return null;
148     }
149 
150     public Node selectNode(String text) {
151         try {
152             Document doc = parse(text);
153             return selectNode(doc);
154         } catch (Exception e) {
155             logger.error("select text error! " + xpathStr, e);
156         }
157         return null;
158     }
159 
160     public Node selectNode(Node node) {
161         try {
162             return (Node) xPathExpression.evaluate(node, XPathConstants.NODE);
163         } catch (Exception e) {
164             logger.error("select text error! " + xpathStr, e);
165         }
166         return null;
167     }
168 
169     public List<Node> selectNodes(String text) {
170         try {
171             Document doc = parse(text);
172             return selectNodes(doc);
173         } catch (Exception e) {
174             logger.error("select text error! " + xpathStr, e);
175         }
176         return null;
177     }
178 
179     public List<Node> selectNodes(Node node) {
180         try {
181             NodeList result = (NodeList) xPathExpression.evaluate(node, XPathConstants.NODESET);
182             return NodeListToArrayList(result);
183         } catch (Exception e) {
184             logger.error("select text error! " + xpathStr, e);
185         }
186         return null;
187     }
188 
189     protected static Document parse(String text) throws ParserConfigurationException {
190         // HtmlCleaner could not parse <tr></tr> or <td></td> tag directly
191         text = BaseSelectorUtils.preParse(text);
192         HtmlCleaner htmlCleaner = new HtmlCleaner();
193         TagNode tagNode = htmlCleaner.clean(text);
194         return new DomSerializer(new CleanerProperties()).createDOM(tagNode);
195     }
196 
197 }