HashSetDuplicateRemover.java
package us.codecraft.webmagic.scheduler.component;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import java.util.Collections;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
/**
* @author code4crafer@gmail.com
*/
public class HashSetDuplicateRemover implements DuplicateRemover {
private Set<String> urls = Collections.newSetFromMap(new ConcurrentHashMap<String, Boolean>());
@Override
public boolean isDuplicate(Request request, Task task) {
return !urls.add(getUrl(request));
}
protected String getUrl(Request request) {
return request.getUrl();
}
@Override
public void resetDuplicateCheck(Task task) {
urls.clear();
}
@Override
public int getTotalRequestsCount(Task task) {
return urls.size();
}
}