View Javadoc
1   package us.codecraft.webmagic.selector;
2   
3   import org.jsoup.Jsoup;
4   import org.jsoup.nodes.Document;
5   import org.jsoup.nodes.Element;
6   import org.slf4j.Logger;
7   import org.slf4j.LoggerFactory;
8   
9   import java.util.Collections;
10  import java.util.List;
11  
12  /**
13   * Selectable html.<br>
14   *
15   * @author code4crafter@gmail.com <br>
16   * @since 0.1.0
17   */
18  public class Html extends HtmlNode {
19  
20      private Logger logger = LoggerFactory.getLogger(getClass());
21  
22  	/**
23  	 * Disable jsoup html entity escape. It can be set just before any Html instance is created.
24       * @deprecated
25  	 */
26  	public static boolean DISABLE_HTML_ENTITY_ESCAPE = false;
27  
28      /**
29       * Store parsed document for better performance when only one text exist.
30       */
31      private Document document;
32  
33      public Html(String text, String url) {
34          try {
35              this.document = Jsoup.parse(text, url);
36          } catch (Exception e) {
37              this.document = null;
38              logger.warn("parse document error ", e);
39          }
40      }
41  
42      public Html(String text) {
43          try {
44              this.document = Jsoup.parse(text);
45          } catch (Exception e) {
46              this.document = null;
47              logger.warn("parse document error ", e);
48          }
49      }
50  
51      public Html(Document document) {
52          this.document = document;
53      }
54  
55      public Document getDocument() {
56          return document;
57      }
58  
59      @Override
60      protected List<Element> getElements() {
61          return Collections.<Element>singletonList(getDocument());
62      }
63  
64      /**
65       * @param selector selector
66       * @return result
67       */
68      public String selectDocument(Selector selector) {
69          if (selector instanceof ElementSelector) {
70              ElementSelector elementSelector = (ElementSelector) selector;
71              return elementSelector.select(getDocument());
72          } else {
73              return selector.select(getFirstSourceText());
74          }
75      }
76  
77      public List<String> selectDocumentForList(Selector selector) {
78          if (selector instanceof ElementSelector) {
79              ElementSelector elementSelector = (ElementSelector) selector;
80              return elementSelector.selectList(getDocument());
81          } else {
82              return selector.selectList(getFirstSourceText());
83          }
84      }
85  
86      public static Html create(String text) {
87          return new Html(text);
88      }
89  
90  }