1 package us.codecraft.webmagic.scheduler;
2
3
4
5
6
7
8
9 import com.google.common.hash.BloomFilter;
10 import com.google.common.hash.Funnels;
11 import us.codecraft.webmagic.Request;
12 import us.codecraft.webmagic.Task;
13 import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
14
15 import java.nio.charset.Charset;
16 import java.util.concurrent.atomic.AtomicInteger;
17
18
19
20
21
22
23
24 public class BloomFilterDuplicateRemover implements DuplicateRemover {
25
26 private int expectedInsertions;
27
28 private double fpp;
29
30 private AtomicInteger counter;
31
32 public BloomFilterDuplicateRemover(int expectedInsertions) {
33 this(expectedInsertions, 0.01);
34 }
35
36
37
38
39
40
41 public BloomFilterDuplicateRemover(int expectedInsertions, double fpp) {
42 this.expectedInsertions = expectedInsertions;
43 this.fpp = fpp;
44 this.bloomFilter = rebuildBloomFilter();
45 }
46
47 protected BloomFilter<CharSequence> rebuildBloomFilter() {
48 counter = new AtomicInteger(0);
49 return BloomFilter.create(Funnels.stringFunnel(Charset.defaultCharset()), expectedInsertions, fpp);
50 }
51
52 private final BloomFilter<CharSequence> bloomFilter;
53
54 @Override
55 public boolean isDuplicate(Request request, Task task) {
56 boolean isDuplicate = bloomFilter.mightContain(getUrl(request));
57 if (!isDuplicate) {
58 bloomFilter.put(getUrl(request));
59 counter.incrementAndGet();
60 }
61 return isDuplicate;
62 }
63
64 protected String getUrl(Request request) {
65 return request.getUrl();
66 }
67
68 @Override
69 public void resetDuplicateCheck(Task task) {
70 rebuildBloomFilter();
71 }
72
73 @Override
74 public int getTotalRequestsCount(Task task) {
75 return counter.get();
76 }
77 }