1 package us.codecraft.webmagic.selector;
2
3 import org.jsoup.Jsoup;
4 import org.jsoup.nodes.Document;
5 import org.jsoup.nodes.Element;
6 import org.slf4j.Logger;
7 import org.slf4j.LoggerFactory;
8
9 import java.util.Collections;
10 import java.util.List;
11
12
13
14
15
16
17
18 public class Html extends HtmlNode {
19
20 private Logger logger = LoggerFactory.getLogger(getClass());
21
22
23
24
25
26 public static boolean DISABLE_HTML_ENTITY_ESCAPE = false;
27
28
29
30
31 private Document document;
32
33 public Html(String text, String url) {
34 try {
35 this.document = Jsoup.parse(text, url);
36 } catch (Exception e) {
37 this.document = null;
38 logger.warn("parse document error ", e);
39 }
40 }
41
42 public Html(String text) {
43 try {
44 this.document = Jsoup.parse(text);
45 } catch (Exception e) {
46 this.document = null;
47 logger.warn("parse document error ", e);
48 }
49 }
50
51 public Html(Document document) {
52 this.document = document;
53 }
54
55 public Document getDocument() {
56 return document;
57 }
58
59 @Override
60 protected List<Element> getElements() {
61 return Collections.<Element>singletonList(getDocument());
62 }
63
64
65
66
67
68 public String selectDocument(Selector selector) {
69 if (selector instanceof ElementSelector) {
70 ElementSelector elementSelector = (ElementSelector) selector;
71 return elementSelector.select(getDocument());
72 } else {
73 return selector.select(getFirstSourceText());
74 }
75 }
76
77 public List<String> selectDocumentForList(Selector selector) {
78 if (selector instanceof ElementSelector) {
79 ElementSelector elementSelector = (ElementSelector) selector;
80 return elementSelector.selectList(getDocument());
81 } else {
82 return selector.selectList(getFirstSourceText());
83 }
84 }
85
86 public static Html create(String text) {
87 return new Html(text);
88 }
89
90 }