View Javadoc
1   package us.codecraft.webmagic.selector;
2   
3   import us.codecraft.webmagic.utils.Experimental;
4   
5   import java.util.ArrayList;
6   import java.util.Arrays;
7   import java.util.List;
8   
9   /**
10   * Borrowed from https://code.google.com/p/cx-extractor/
11   *
12   * @author code4crafter@gmail.com <br>
13   * @since 0.4.1
14   *
15   */
16  @Experimental
17  public class SmartContentSelector implements Selector {
18  
19      public SmartContentSelector() {
20      }
21  
22      @Override
23      public String select(String html) {
24          html = html.replaceAll("(?is)<!DOCTYPE.*?>", "");
25          html = html.replaceAll("(?is)<!--.*?-->", "");				// remove html comment
26          html = html.replaceAll("(?is)<script.*?>.*?</script>", ""); // remove javascript
27          html = html.replaceAll("(?is)<style.*?>.*?</style>", "");   // remove css
28          html = html.replaceAll("&.{2,5};|&#.{2,5};", " ");			// remove special char
29          html = html.replaceAll("(?is)<.*?>", "");
30          List<String> lines;
31          int blocksWidth =3;
32          int threshold =86;
33          int start;
34          int end;
35          StringBuilder text = new StringBuilder();
36          ArrayList<Integer> indexDistribution = new ArrayList<Integer>();
37  
38          lines = Arrays.asList(html.split("\n"));
39  
40          for (int i = 0; i < lines.size() - blocksWidth; i++) {
41              int wordsNum = 0;
42              for (int j = i; j < i + blocksWidth; j++) {
43                  lines.set(j, lines.get(j).replaceAll("\\s+", ""));
44                  wordsNum += lines.get(j).length();
45              }
46              indexDistribution.add(wordsNum);
47          }
48  
49          start = -1; end = -1;
50          boolean boolstart = false, boolend = false;
51          text.setLength(0);
52  
53          for (int i = 0; i < indexDistribution.size() - 1; i++) {
54              if (indexDistribution.get(i) > threshold && ! boolstart) {
55                  if (indexDistribution.get(i+1).intValue() != 0
56                          || indexDistribution.get(i+2).intValue() != 0
57                          || indexDistribution.get(i+3).intValue() != 0) {
58                      boolstart = true;
59                      start = i;
60                      continue;
61                  }
62              }
63              if (boolstart) {
64                  if (indexDistribution.get(i).intValue() == 0
65                          || indexDistribution.get(i+1).intValue() == 0) {
66                      end = i;
67                      boolend = true;
68                  }
69              }
70              StringBuilder tmp = new StringBuilder();
71              if (boolend) {
72                  //System.out.println(start+1 + "\t\t" + end+1);
73                  for (int ii = start; ii <= end; ii++) {
74                      if (lines.get(ii).length() < 5) continue;
75                      tmp.append(lines.get(ii) + "\n");
76                  }
77                  String str = tmp.toString();
78                  //System.out.println(str);
79                  if (str.contains("Copyright")   ) continue;
80                  text.append(str);
81                  boolstart = boolend = false;
82              }
83          }
84          return text.toString();
85      }
86  
87      @Override
88      public List<String> selectList(String text) {
89          throw new UnsupportedOperationException();
90      }
91  }