View Javadoc
1   package us.codecraft.webmagic;
2   
3   import java.util.HashMap;
4   import java.util.HashSet;
5   import java.util.LinkedHashMap;
6   import java.util.Map;
7   import java.util.Set;
8   import java.util.UUID;
9   
10  import us.codecraft.webmagic.utils.HttpConstant;
11  
12  /**
13   * Object contains setting for crawler.<br>
14   *
15   * @author code4crafter@gmail.com <br>
16   * @see us.codecraft.webmagic.processor.PageProcessor
17   * @since 0.1.0
18   */
19  public class Site {
20  
21      private String domain;
22  
23      private String userAgent;
24  
25      private Map<String, String> defaultCookies = new LinkedHashMap<String, String>();
26  
27      private Map<String, Map<String, String>> cookies = new HashMap<String, Map<String, String>>();
28  
29      private String charset;
30  
31      private String defaultCharset;
32  
33      private int sleepTime = 5000;
34  
35      private int retryTimes = 0;
36  
37      private int cycleRetryTimes = 0;
38  
39      private int retrySleepTime = 1000;
40  
41      private int timeOut = 5000;
42  
43      private static final Set<Integer> DEFAULT_STATUS_CODE_SET = new HashSet<Integer>();
44  
45      private Set<Integer> acceptStatCode = DEFAULT_STATUS_CODE_SET;
46  
47      private Map<String, String> headers = new HashMap<String, String>();
48  
49      private boolean useGzip = true;
50  
51      private boolean disableCookieManagement = false;
52  
53      static {
54          DEFAULT_STATUS_CODE_SET.add(HttpConstant.StatusCode.CODE_200);
55      }
56  
57      /**
58       * new a Site
59       *
60       * @return new site
61       */
62      public static Site me() {
63          return new Site();
64      }
65  
66      /**
67       * Add a cookie with domain {@link #getDomain()}
68       *
69       * @param name name
70       * @param value value
71       * @return this
72       */
73      public Site addCookie(String name, String value) {
74          defaultCookies.put(name, value);
75          return this;
76      }
77  
78      /**
79       * Add a cookie with specific domain.
80       *
81       * @param domain domain
82       * @param name name
83       * @param value value
84       * @return this
85       */
86      public Site addCookie(String domain, String name, String value) {
87          if (!cookies.containsKey(domain)){
88              cookies.put(domain,new HashMap<String, String>());
89          }
90          cookies.get(domain).put(name, value);
91          return this;
92      }
93  
94      /**
95       * set user agent
96       *
97       * @param userAgent userAgent
98       * @return this
99       */
100     public Site setUserAgent(String userAgent) {
101         this.userAgent = userAgent;
102         return this;
103     }
104 
105     /**
106      * get cookies
107      *
108      * @return get cookies
109      */
110     public Map<String, String> getCookies() {
111         return defaultCookies;
112     }
113 
114     /**
115      * get cookies of all domains
116      *
117      * @return get cookies
118      */
119     public Map<String,Map<String, String>> getAllCookies() {
120         return cookies;
121     }
122 
123     /**
124      * get user agent
125      *
126      * @return user agent
127      */
128     public String getUserAgent() {
129         return userAgent;
130     }
131 
132     /**
133      * get domain
134      *
135      * @return get domain
136      */
137     public String getDomain() {
138         return domain;
139     }
140 
141     /**
142      * set the domain of site.
143      *
144      * @param domain domain
145      * @return this
146      */
147     public Site setDomain(String domain) {
148         this.domain = domain;
149         return this;
150     }
151 
152     /**
153      * Set charset of page manually.<br>
154      * When charset is not set or set to null, it can be auto detected by Http header.
155      *
156      * @param charset charset
157      * @return this
158      */
159     public Site setCharset(String charset) {
160         this.charset = charset;
161         return this;
162     }
163 
164     /**
165      * get charset set manually
166      *
167      * @return charset
168      */
169     public String getCharset() {
170         return charset;
171     }
172 
173     /**
174      * Set default charset of page.
175      *
176      * When charset detect failed, use this default charset.
177      *
178      * @param defaultCharset the default charset
179      * @return this
180      * @since 0.9.0
181      */
182     public Site setDefaultCharset(String defaultCharset) {
183         this.defaultCharset = defaultCharset;
184         return this;
185     }
186 
187     /**
188      * The default charset if charset detected failed.
189      *
190      * @return the defulat charset
191      * @since 0.9.0
192      */
193     public String getDefaultCharset() {
194         return defaultCharset;
195     }
196 
197     public int getTimeOut() {
198         return timeOut;
199     }
200 
201     /**
202      * set timeout for downloader in ms
203      *
204      * @param timeOut timeOut
205      * @return this
206      */
207     public Site setTimeOut(int timeOut) {
208         this.timeOut = timeOut;
209         return this;
210     }
211 
212     /**
213      * Set acceptStatCode.<br>
214      * When status code of http response is in acceptStatCodes, it will be processed.<br>
215      * {200} by default.<br>
216      * It is not necessarily to be set.<br>
217      *
218      * @param acceptStatCode acceptStatCode
219      * @return this
220      */
221     public Site setAcceptStatCode(Set<Integer> acceptStatCode) {
222         this.acceptStatCode = acceptStatCode;
223         return this;
224     }
225 
226     /**
227      * get acceptStatCode
228      *
229      * @return acceptStatCode
230      */
231     public Set<Integer> getAcceptStatCode() {
232         return acceptStatCode;
233     }
234 
235     /**
236      * Set the interval between the processing of two pages.<br>
237      * Time unit is milliseconds.<br>
238      *
239      * @param sleepTime sleepTime
240      * @return this
241      */
242     public Site setSleepTime(int sleepTime) {
243         this.sleepTime = sleepTime;
244         return this;
245     }
246 
247     /**
248      * Get the interval between the processing of two pages.<br>
249      * Time unit is milliseconds.<br>
250      *
251      * @return the interval between the processing of two pages,
252      */
253     public int getSleepTime() {
254         return sleepTime;
255     }
256 
257     /**
258      * Get retry times immediately when download fail, 0 by default.<br>
259      *
260      * @return retry times when download fail
261      */
262     public int getRetryTimes() {
263         return retryTimes;
264     }
265 
266     public Map<String, String> getHeaders() {
267         return headers;
268     }
269 
270     /**
271      * Put an Http header for downloader. <br>
272      * Use {@link #addCookie(String, String)} for cookie and {@link #setUserAgent(String)} for user-agent. <br>
273      *
274      * @param key   key of http header, there are some keys constant in {@link HttpConstant.Header}
275      * @param value value of header
276      * @return this
277      */
278     public Site addHeader(String key, String value) {
279         headers.put(key, value);
280         return this;
281     }
282 
283     /**
284      * Set retry times when download fail, 0 by default.<br>
285      *
286      * @param retryTimes retryTimes
287      * @return this
288      */
289     public Site setRetryTimes(int retryTimes) {
290         this.retryTimes = retryTimes;
291         return this;
292     }
293 
294     /**
295      * When cycleRetryTimes is more than 0, it will add back to scheduler and try download again. <br>
296      *
297      * @return retry times when download fail
298      */
299     public int getCycleRetryTimes() {
300         return cycleRetryTimes;
301     }
302 
303     /**
304      * Set cycleRetryTimes times when download fail, 0 by default. <br>
305      *
306      * @param cycleRetryTimes cycleRetryTimes
307      * @return this
308      */
309     public Site setCycleRetryTimes(int cycleRetryTimes) {
310         this.cycleRetryTimes = cycleRetryTimes;
311         return this;
312     }
313 
314     public boolean isUseGzip() {
315         return useGzip;
316     }
317 
318     public int getRetrySleepTime() {
319         return retrySleepTime;
320     }
321 
322     /**
323      * Set retry sleep times when download fail, 1000 by default. <br>
324      *
325      * @param retrySleepTime retrySleepTime
326      * @return this
327      */
328     public Site setRetrySleepTime(int retrySleepTime) {
329         this.retrySleepTime = retrySleepTime;
330         return this;
331     }
332 
333     /**
334      * Whether use gzip. <br>
335      * Default is true, you can set it to false to disable gzip.
336      *
337      * @param useGzip useGzip
338      * @return this
339      */
340     public Site setUseGzip(boolean useGzip) {
341         this.useGzip = useGzip;
342         return this;
343     }
344 
345     public boolean isDisableCookieManagement() {
346         return disableCookieManagement;
347     }
348 
349     /**
350      * Downloader is supposed to store response cookie.
351      * Disable it to ignore all cookie fields and stay clean.
352      * Warning: Set cookie will still NOT work if disableCookieManagement is true.
353      * @param disableCookieManagement disableCookieManagement
354      * @return this
355      */
356     public Site setDisableCookieManagement(boolean disableCookieManagement) {
357         this.disableCookieManagement = disableCookieManagement;
358         return this;
359     }
360 
361     public Task toTask() {
362         return new Task() {
363             @Override
364             public String getUUID() {
365                 String uuid = Site.this.getDomain();
366                 if (uuid == null) {
367                     uuid = UUID.randomUUID().toString();
368                 }
369                 return uuid;
370             }
371 
372             @Override
373             public Site getSite() {
374                 return Site.this;
375             }
376         };
377     }
378 
379     @Override
380     public boolean equals(Object o) {
381         if (this == o) return true;
382         if (o == null || getClass() != o.getClass()) return false;
383 
384         Site site = (Site) o;
385 
386         if (cycleRetryTimes != site.cycleRetryTimes) return false;
387         if (retryTimes != site.retryTimes) return false;
388         if (sleepTime != site.sleepTime) return false;
389         if (timeOut != site.timeOut) return false;
390         if (acceptStatCode != null ? !acceptStatCode.equals(site.acceptStatCode) : site.acceptStatCode != null)
391             return false;
392         if (charset != null ? !charset.equals(site.charset) : site.charset != null) return false;
393         if (defaultCookies != null ? !defaultCookies.equals(site.defaultCookies) : site.defaultCookies != null)
394             return false;
395         if (domain != null ? !domain.equals(site.domain) : site.domain != null) return false;
396         if (headers != null ? !headers.equals(site.headers) : site.headers != null) return false;
397         if (userAgent != null ? !userAgent.equals(site.userAgent) : site.userAgent != null) return false;
398 
399         return true;
400     }
401 
402     @Override
403     public int hashCode() {
404         int result = domain != null ? domain.hashCode() : 0;
405         result = 31 * result + (userAgent != null ? userAgent.hashCode() : 0);
406         result = 31 * result + (defaultCookies != null ? defaultCookies.hashCode() : 0);
407         result = 31 * result + (charset != null ? charset.hashCode() : 0);
408         result = 31 * result + sleepTime;
409         result = 31 * result + retryTimes;
410         result = 31 * result + cycleRetryTimes;
411         result = 31 * result + timeOut;
412         result = 31 * result + (acceptStatCode != null ? acceptStatCode.hashCode() : 0);
413         result = 31 * result + (headers != null ? headers.hashCode() : 0);
414         return result;
415     }
416 
417     @Override
418     public String toString() {
419         return "Site{" +
420                 "domain='" + domain + '\'' +
421                 ", userAgent='" + userAgent + '\'' +
422                 ", cookies=" + defaultCookies +
423                 ", charset='" + charset + '\'' +
424                 ", sleepTime=" + sleepTime +
425                 ", retryTimes=" + retryTimes +
426                 ", cycleRetryTimes=" + cycleRetryTimes +
427                 ", timeOut=" + timeOut +
428                 ", acceptStatCode=" + acceptStatCode +
429                 ", headers=" + headers +
430                 '}';
431     }
432 
433 }