1 package us.codecraft.webmagic.scheduler;
2
3 import org.slf4j.Logger;
4 import org.slf4j.LoggerFactory;
5 import us.codecraft.webmagic.Request;
6 import us.codecraft.webmagic.Task;
7 import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
8 import us.codecraft.webmagic.scheduler.component.HashSetDuplicateRemover;
9 import us.codecraft.webmagic.utils.HttpConstant;
10
11
12
13
14
15
16
17 public abstract class DuplicateRemovedScheduler implements Scheduler {
18
19 protected Logger logger = LoggerFactory.getLogger(getClass());
20
21 private DuplicateRemover duplicatedRemover = new HashSetDuplicateRemover();
22
23 public DuplicateRemover getDuplicateRemover() {
24 return duplicatedRemover;
25 }
26
27 public DuplicateRemovedScheduler setDuplicateRemover(DuplicateRemover duplicatedRemover) {
28 this.duplicatedRemover = duplicatedRemover;
29 return this;
30 }
31
32 @Override
33 public void push(Request request, Task task) {
34 logger.trace("get a candidate url {}", request.getUrl());
35 if (shouldReserved(request) || noNeedToRemoveDuplicate(request) || !duplicatedRemover.isDuplicate(request, task)) {
36 logger.debug("push to queue {}", request.getUrl());
37 pushWhenNoDuplicate(request, task);
38 }
39 }
40
41 protected boolean shouldReserved(Request request) {
42 return request.getExtra(Request.CYCLE_TRIED_TIMES) != null;
43 }
44
45 protected boolean noNeedToRemoveDuplicate(Request request) {
46 return HttpConstant.Method.POST.equalsIgnoreCase(request.getMethod());
47 }
48
49 protected void pushWhenNoDuplicate(Request request, Task task) {
50
51 }
52 }