View Javadoc
1   package us.codecraft.webmagic.processor;
2   
3   import us.codecraft.webmagic.Page;
4   import us.codecraft.webmagic.Site;
5   
6   /**
7    * Interface to be implemented to customize a crawler.
8    *
9    * <p>
10   * In PageProcessor, you can customize:
11   * </p>
12   * <ul>
13   * <li>start URLs and other settings in {@link Site}</li>
14   * <li>how the URLs to fetch are detected</li>
15   * <li>how the data are extracted and stored</li>
16   * </ul>
17   *
18   * @author code4crafter@gmail.com <br>
19   * @see Site
20   * @see Page
21   * @since 0.1.0
22   */
23  public interface PageProcessor {
24  
25      /**
26       * Processes the page, extract URLs to fetch, extract the data and store.
27       *
28       * @param page page
29       */
30      void process(Page page);
31  
32      /**
33       * Returns the site settings.
34       *
35       * @return site
36       * @see Site
37       */
38      default Site getSite() {
39          return Site.me();
40      }
41  
42  }