View Javadoc
1   package us.codecraft.webmagic.scheduler;
2   
3   /**
4    * @author code4crafter@gmail.com
5    *         Date: 16/12/18
6    *         Time: 10:23 AM
7    */
8   
9   import com.google.common.hash.BloomFilter;
10  import com.google.common.hash.Funnels;
11  import us.codecraft.webmagic.Request;
12  import us.codecraft.webmagic.Task;
13  import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
14  
15  import java.nio.charset.Charset;
16  import java.util.concurrent.atomic.AtomicInteger;
17  
18  /**
19   * BloomFilterDuplicateRemover for huge number of urls.
20   *
21   * @author code4crafer@gmail.com
22   * @since 0.5.1
23   */
24  public class BloomFilterDuplicateRemover implements DuplicateRemover {
25  
26      private int expectedInsertions;
27  
28      private double fpp;
29  
30      private AtomicInteger counter;
31  
32      public BloomFilterDuplicateRemover(int expectedInsertions) {
33          this(expectedInsertions, 0.01);
34      }
35  
36      /**
37       *
38       * @param expectedInsertions the number of expected insertions to the constructed
39       * @param fpp the desired false positive probability (must be positive and less than 1.0)
40       */
41      public BloomFilterDuplicateRemover(int expectedInsertions, double fpp) {
42          this.expectedInsertions = expectedInsertions;
43          this.fpp = fpp;
44          this.bloomFilter = rebuildBloomFilter();
45      }
46  
47      protected BloomFilter<CharSequence> rebuildBloomFilter() {
48          counter = new AtomicInteger(0);
49          return BloomFilter.create(Funnels.stringFunnel(Charset.defaultCharset()), expectedInsertions, fpp);
50      }
51  
52      private final BloomFilter<CharSequence> bloomFilter;
53  
54      @Override
55      public boolean isDuplicate(Request request, Task task) {
56          boolean isDuplicate = bloomFilter.mightContain(getUrl(request));
57          if (!isDuplicate) {
58              bloomFilter.put(getUrl(request));
59              counter.incrementAndGet();
60          }
61          return isDuplicate;
62      }
63  
64      protected String getUrl(Request request) {
65          return request.getUrl();
66      }
67  
68      @Override
69      public void resetDuplicateCheck(Task task) {
70          rebuildBloomFilter();
71      }
72  
73      @Override
74      public int getTotalRequestsCount(Task task) {
75          return counter.get();
76      }
77  }