// ConfigurablePageProcessor.java
package us.codecraft.webmagic.configurable;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.utils.Experimental;
import java.util.List;
/**
 * A {@link PageProcessor} whose extraction logic is supplied as a list of
 * {@link ExtractRule}s instead of being hand-coded.
 * <p>
 * For every rule, the rule's selector is applied to the page's HTML and the
 * outcome is stored in the page's result items under the rule's field name.
 * When a rule that is flagged as not-null yields nothing, the page is marked
 * as skipped instead of storing a value for that rule.
 *
 * @author code4crafter@gmail.com <br>
 */
@Experimental
public class ConfigurablePageProcessor implements PageProcessor {

    /** Site configuration handed back unchanged by {@link #getSite()}. */
    private final Site site;

    /** Rules applied, in iteration order, to every processed page. */
    private final List<ExtractRule> extractRules;

    /**
     * Creates a processor bound to a site configuration and a set of rules.
     *
     * @param site         the site configuration returned by {@link #getSite()}
     * @param extractRules the rules to apply to each page; the list is stored
     *                     by reference, not copied
     */
    public ConfigurablePageProcessor(Site site, List<ExtractRule> extractRules) {
        this.site = site;
        this.extractRules = extractRules;
    }

    /**
     * Applies every configured rule to the given page.
     * <p>
     * A missing required value marks the page as skipped, but the remaining
     * rules are still evaluated.
     *
     * @param page the page to extract from
     */
    @Override
    public void process(Page page) {
        for (ExtractRule rule : extractRules) {
            if (rule.isMulti()) {
                extractMulti(page, rule);
            } else {
                extractSingle(page, rule);
            }
        }
    }

    /**
     * Extracts a list of values for a multi-valued rule and stores it, or
     * marks the page as skipped when the rule is not-null and nothing matched.
     */
    private void extractMulti(Page page, ExtractRule rule) {
        List<String> results = page.getHtml().selectDocumentForList(rule.getSelector());
        // A null list is treated like an empty one so that a required rule
        // skips the page rather than failing with a NullPointerException.
        boolean nothingFound = results == null || results.isEmpty();
        if (rule.isNotNull() && nothingFound) {
            page.setSkip(true);
        } else {
            page.getResultItems().put(rule.getFieldName(), results);
        }
    }

    /**
     * Extracts a single value for a single-valued rule and stores it, or
     * marks the page as skipped when the rule is not-null and the result is null.
     */
    private void extractSingle(Page page, ExtractRule rule) {
        String result = page.getHtml().selectDocument(rule.getSelector());
        if (rule.isNotNull() && result == null) {
            page.setSkip(true);
        } else {
            page.getResultItems().put(rule.getFieldName(), result);
        }
    }

    /**
     * @return the site configuration supplied at construction time
     */
    @Override
    public Site getSite() {
        return site;
    }
}