// SimplePageProcessor.java
package us.codecraft.webmagic.processor;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import java.util.List;
/**
 * A simple {@link PageProcessor} that queues the links of a page matching a wildcard URL
 * expression and stores the page's title, raw HTML and main content as result fields.
 *
 * <p>The expression given to the constructor is translated to a regular expression: every
 * {@code .} is matched literally and every {@code *} matches any run of characters other
 * than {@code "}, {@code '} and {@code #}. All other characters are passed to the regex
 * engine unchanged.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.1.0
 */
public class SimplePageProcessor implements PageProcessor {

    /** Regular expression built from the constructor's wildcard expression. */
    private final String urlPattern;

    /** Site settings handed out by {@link #getSite()}; obtained once from {@code Site.me()}. */
    private final Site site;

    /**
     * Creates a processor that follows links matching the given wildcard expression.
     *
     * @param urlPattern wildcard URL expression, e.g. {@code http://example.com/*}; must not be
     *                   {@code null}
     * @throws NullPointerException if {@code urlPattern} is {@code null}
     */
    public SimplePageProcessor(String urlPattern) {
        // Fail with a clear message instead of an anonymous NPE from String.replace below.
        if (urlPattern == null) {
            throw new NullPointerException("urlPattern must not be null");
        }
        this.site = Site.me();
        // Compile the "*" expression to a regex: escape literal dots first, then expand each
        // "*" to "any characters except quotes and '#'", and wrap the result in one group.
        // NOTE(review): other regex metacharacters (e.g. '?', '+') are not escaped and keep
        // their regex meaning -- confirm whether callers rely on that before changing it.
        this.urlPattern = "(" + urlPattern.replace(".", "\\.").replace("*", "[^\"'#]*") + ")";
    }

    /**
     * Queues matching links for fetching and records the {@code title}, {@code html} and
     * {@code content} fields for the given page.
     *
     * @param page the page to process
     */
    @Override
    public void process(Page page) {
        // Select the page links that match the compiled URL pattern and add them as targets.
        List<String> requests = page.getHtml().links().regex(urlPattern).all();
        page.addTargetRequests(requests);

        // Title is extracted by XPath; the selection result is stored as returned by xpath().
        page.putField("title", page.getHtml().xpath("//title"));
        // Full document as a string.
        page.putField("html", page.getHtml().toString());
        // Main content via smartContent() (Readability-style extraction).
        page.putField("content", page.getHtml().smartContent());
    }

    /**
     * Returns the site settings used by this processor.
     *
     * @return the {@link Site} instance created in the constructor
     */
    @Override
    public Site getSite() {
        return site;
    }
}