View Javadoc
1   package us.codecraft.webmagic.scheduler;
2   
3   import org.slf4j.Logger;
4   import org.slf4j.LoggerFactory;
5   import us.codecraft.webmagic.Request;
6   import us.codecraft.webmagic.Task;
7   import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
8   import us.codecraft.webmagic.scheduler.component.HashSetDuplicateRemover;
9   import us.codecraft.webmagic.utils.HttpConstant;
10  
11  /**
12   * Remove duplicate urls and only push urls which are not duplicate.<br><br>
13   *
14   * @author code4crafer@gmail.com
15   * @since 0.5.0
16   */
17  public abstract class DuplicateRemovedScheduler implements Scheduler {
18  
19      protected Logger logger = LoggerFactory.getLogger(getClass());
20  
21      private DuplicateRemover duplicatedRemover = new HashSetDuplicateRemover();
22  
23      public DuplicateRemover getDuplicateRemover() {
24          return duplicatedRemover;
25      }
26  
27      public DuplicateRemovedScheduler setDuplicateRemover(DuplicateRemover duplicatedRemover) {
28          this.duplicatedRemover = duplicatedRemover;
29          return this;
30      }
31  
32      @Override
33      public void push(Request request, Task task) {
34          logger.trace("get a candidate url {}", request.getUrl());
35          if (shouldReserved(request) || noNeedToRemoveDuplicate(request) || !duplicatedRemover.isDuplicate(request, task)) {
36              logger.debug("push to queue {}", request.getUrl());
37              pushWhenNoDuplicate(request, task);
38          }
39      }
40  
41      protected boolean shouldReserved(Request request) {
42          return request.getExtra(Request.CYCLE_TRIED_TIMES) != null;
43      }
44  
45      protected boolean noNeedToRemoveDuplicate(Request request) {
46          return HttpConstant.Method.POST.equalsIgnoreCase(request.getMethod());
47      }
48  
49      protected void pushWhenNoDuplicate(Request request, Task task) {
50  
51      }
52  }