View Javadoc
1   package us.codecraft.webmagic;
2   
3   import org.apache.commons.lang3.StringUtils;
4   import us.codecraft.webmagic.selector.Html;
5   import us.codecraft.webmagic.selector.Json;
6   import us.codecraft.webmagic.selector.Selectable;
7   import us.codecraft.webmagic.utils.HttpConstant;
8   import us.codecraft.webmagic.utils.UrlUtils;
9   
10  import java.util.ArrayList;
11  import java.util.Arrays;
12  import java.util.List;
13  import java.util.Map;
14  
15  /**
16   * Object storing extracted result and urls to fetch.<br>
17   * Not thread safe.<br>
18   * Main method:                                               <br>
19   * {@link #getUrl()} get url of current page                   <br>
20   * {@link #getHtml()}  get content of current page                 <br>
21   * {@link #putField(String, Object)}  save extracted result            <br>
22   * {@link #getResultItems()} get extract results to be used in {@link us.codecraft.webmagic.pipeline.Pipeline}<br>
23   * {@link #addTargetRequests(Iterable)} {@link #addTargetRequest(String)} add urls to fetch                 <br>
24   *
25   * @author code4crafter@gmail.com <br>
26   * @see us.codecraft.webmagic.downloader.Downloader
27   * @see us.codecraft.webmagic.processor.PageProcessor
28   * @since 0.1.0
29   */
30  public class Page {
31  
32      private Request request;
33  
34      private ResultItems resultItems = new ResultItems();
35  
36      private Html html;
37  
38      private Json json;
39  
40      private String rawText;
41  
42      private Selectable url;
43  
44      private Map<String,List<String>> headers;
45  
46      private int statusCode = HttpConstant.StatusCode.CODE_200;
47  
48      private boolean downloadSuccess = true;
49  
50      private byte[] bytes;
51  
52      private List<Request> targetRequests = new ArrayList<>();
53  
54      private String charset;
55  
56      public Page() {
57      }
58  
59      /**
60       * Returns a {@link Page} with {@link #downloadSuccess} is {@code false}.
61       *
62       * @return the page.
63       * @deprecated Use {@link #fail(Request)} instead.
64       */
65      @Deprecated
66      public static Page fail() {
67          return fail(null);
68      }
69  
70      /**
71       * Returns a {@link Page} with {@link #downloadSuccess} is {@code false},
72       * and {@link #request} is specified.
73       *
74       * @param request the {@link Request}.
75       * @return the page.
76       * @since 0.10.0
77       */
78      public static Page fail(Request request){
79          Page page = new Page();
80          page.setRequest(request);
81          page.setDownloadSuccess(false);
82          return page;
83      }
84  
85      public Page setSkip(boolean skip) {
86          resultItems.setSkip(skip);
87          return this;
88  
89      }
90  
91      /**
92       * store extract results
93       *
94       * @param key key
95       * @param field field
96       */
97      public void putField(String key, Object field) {
98          resultItems.put(key, field);
99      }
100 
101     /**
102      * get html content of page
103      *
104      * @return html
105      */
106     public Html getHtml() {
107         if (html == null) {
108             html = new Html(rawText, request.getUrl());
109         }
110         return html;
111     }
112 
113     /**
114      * get json content of page
115      *
116      * @return json
117      * @since 0.5.0
118      */
119     public Json getJson() {
120         if (json == null) {
121             json = new Json(rawText);
122         }
123         return json;
124     }
125 
126     /**
127      * @param html html
128      * @deprecated since 0.4.0
129      * The html is parse just when first time of calling {@link #getHtml()}, so use {@link #setRawText(String)} instead.
130      */
131     @Deprecated
132 	public void setHtml(Html html) {
133         this.html = html;
134     }
135 
136     public List<Request> getTargetRequests() {
137         return targetRequests;
138     }
139 
140     /**
141      * add urls to fetch
142      *
143      * @param requests requests
144      */
145     public void addTargetRequests(Iterable<String> requests) {
146     	addTargetRequests(requests, 0); // Default priority is 0
147     }
148 
149     /**
150      * add urls to fetch
151      *
152      * @param requests requests
153      * @param priority priority
154      */
155     public void addTargetRequests(Iterable<String> requests, long priority) {
156     	if(requests == null) {
157     		return;
158     	}
159     	
160         for (String req : requests) {
161         	addRequestIfValid(req, priority);
162         }
163     }
164     
165     /**
166      * Helper method to add a request if it's valid.
167      *
168      * @param url      URL to add
169      * @param priority Priority for the URL
170      */
171     private void addRequestIfValid(String url, long priority) {
172         if (StringUtils.isBlank(url) || url.equals("#") || url.startsWith("javascript:")) {
173             return;
174         }
175 
176         String canonicalizedUrl = UrlUtils.canonicalizeUrl(url, this.url.toString());
177         Request req = new Request(canonicalizedUrl);
178         if(priority > 0) {
179             req.setPriority(priority);
180         }
181         targetRequests.add(req);
182     }
183 
184     /**
185      * add url to fetch
186      *
187      * @param requestString requestString
188      */
189     public void addTargetRequest(String requestString) {
190         if (StringUtils.isBlank(requestString) || requestString.equals("#")) {
191             return;
192         }
193         requestString = UrlUtils.canonicalizeUrl(requestString, url.toString());
194         targetRequests.add(new Request(requestString));
195     }
196 
197     /**
198      * add requests to fetch
199      *
200      * @param request request
201      */
202     public void addTargetRequest(Request request) {
203         targetRequests.add(request);
204     }
205 
206     /**
207      * get url of current page
208      *
209      * @return url of current page
210      */
211     public Selectable getUrl() {
212         return url;
213     }
214 
215     public void setUrl(Selectable url) {
216         this.url = url;
217     }
218 
219     /**
220      * get request of current page
221      *
222      * @return request
223      */
224     public Request getRequest() {
225         return request;
226     }
227 
228     public void setRequest(Request request) {
229         this.request = request;
230         this.resultItems.setRequest(request);
231     }
232 
233     public ResultItems getResultItems() {
234         return resultItems;
235     }
236 
237     public int getStatusCode() {
238         return statusCode;
239     }
240 
241     public void setStatusCode(int statusCode) {
242         this.statusCode = statusCode;
243     }
244 
245     public String getRawText() {
246         return rawText;
247     }
248 
249     public Page setRawText(String rawText) {
250         this.rawText = rawText;
251         return this;
252     }
253 
254     public Map<String, List<String>> getHeaders() {
255         return headers;
256     }
257 
258     public void setHeaders(Map<String, List<String>> headers) {
259         this.headers = headers;
260     }
261 
262     public boolean isDownloadSuccess() {
263         return downloadSuccess;
264     }
265 
266     public void setDownloadSuccess(boolean downloadSuccess) {
267         this.downloadSuccess = downloadSuccess;
268     }
269 
270     public byte[] getBytes() {
271         return bytes;
272     }
273 
274     public void setBytes(byte[] bytes) {
275         this.bytes = bytes;
276     }
277 
278     public String getCharset() {
279         return charset;
280     }
281 
282     public void setCharset(String charset) {
283         this.charset = charset;
284     }
285 
286     @Override
287     public String toString() {
288         return "Page{" +
289                 "request=" + request +
290                 ", resultItems=" + resultItems +
291                 ", html=" + html +
292                 ", json=" + json +
293                 ", rawText='" + rawText + '\'' +
294                 ", url=" + url +
295                 ", headers=" + headers +
296                 ", statusCode=" + statusCode +
297                 ", downloadSuccess=" + downloadSuccess +
298                 ", targetRequests=" + targetRequests +
299                 ", charset='" + charset + '\'' +
300                 ", bytes=" + Arrays.toString(bytes) +
301                 '}';
302     }
303 }