View Javadoc
1   package us.codecraft.webmagic.selector;
2   
3   import org.jsoup.nodes.Document;
4   import org.jsoup.nodes.Element;
5   
6   import java.util.ArrayList;
7   import java.util.List;
8   import java.util.ListIterator;
9   
10  /**
11   * @author code4crafer@gmail.com
12   */
13  public class HtmlNode extends AbstractSelectable {
14  
15      private final List<Element> elements;
16  
17      public HtmlNode(List<Element> elements) {
18          this.elements = elements;
19      }
20  
21      public HtmlNode() {
22          elements = null;
23      }
24  
25      protected List<Element> getElements() {
26          return elements;
27      }
28  
29      public Selectable smartContent() {
30          SmartContentSelector smartContentSelector = Selectors.smartContent();
31          return select(smartContentSelector, getSourceTexts());
32      }
33  
34      @Override
35      public Selectable links() {
36          return selectElements(new LinksSelector());
37      }
38  
39      @Override
40      public Selectable xpath(String xpath) {
41          XpathSelector xpathSelector = Selectors.xpath(xpath);
42          return selectElements(xpathSelector);
43      }
44  
45      @Override
46      public Selectable selectList(Selector selector) {
47          if (selector instanceof BaseElementSelector) {
48             return selectElements((BaseElementSelector) selector);
49          }
50          return selectList(selector, getSourceTexts());
51      }
52  
53      @Override
54      public Selectable select(Selector selector) {
55          return selectList(selector);
56      }
57  
58      /**
59       * select elements
60       *
61       * @param elementSelector elementSelector
62       * @return result
63       */
64      protected Selectable selectElements(BaseElementSelector elementSelector) {
65          ListIterator<Element> elementIterator = getElements().listIterator();
66          if (!elementSelector.hasAttribute()) {
67              List<Element> resultElements = new ArrayList<Element>();
68              while (elementIterator.hasNext()) {
69                  Element element = checkElementAndConvert(elementIterator);
70                  List<Element> selectElements = elementSelector.selectElements(element);
71                  resultElements.addAll(selectElements);
72              }
73              return new HtmlNode(resultElements);
74          } else {
75              // has attribute, consider as plaintext
76              List<String> resultStrings = new ArrayList<String>();
77              while (elementIterator.hasNext()) {
78                  Element element = checkElementAndConvert(elementIterator);
79                  List<String> selectList = elementSelector.selectList(element);
80                  resultStrings.addAll(selectList);
81              }
82              return new PlainText(resultStrings);
83  
84          }
85      }
86  
87      /**
88       * Only document can be select
89       * See: https://github.com/code4craft/webmagic/issues/113
90       *
91       * @param elementIterator elementIterator
92       * @return element element
93       */
94      private Element checkElementAndConvert(ListIterator<Element> elementIterator) {
95          Element element = elementIterator.next();
96          if (!(element instanceof Document)) {
97              Document root = new Document(element.ownerDocument().baseUri());
98              Element clone = element.clone();
99              root.appendChild(clone);
100             elementIterator.set(root);
101             return root;
102         }
103         return element;
104     }
105 
106     @Override
107     public Selectable $(String selector) {
108         CssSelector cssSelector = Selectors.$(selector);
109         return selectElements(cssSelector);
110     }
111 
112     @Override
113     public Selectable $(String selector, String attrName) {
114         CssSelector cssSelector = Selectors.$(selector, attrName);
115         return selectElements(cssSelector);
116     }
117 
118     @Override
119     public List<Selectable> nodes() {
120         List<Selectable> selectables = new ArrayList<Selectable>();
121         for (Element element : getElements()) {
122             List<Element> childElements = new ArrayList<Element>(1);
123             childElements.add(element);
124             selectables.add(new HtmlNode(childElements));
125         }
126         return selectables;
127     }
128 
129     @Override
130     protected List<String> getSourceTexts() {
131         List<String> sourceTexts = new ArrayList<String>(getElements().size());
132         for (Element element : getElements()) {
133             sourceTexts.add(element.toString());
134         }
135         return sourceTexts;
136     }
137 }