1 package us.codecraft.webmagic.selector;
2
import us.codecraft.webmagic.utils.Experimental;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;
8
9
10
11
12
13
14
15
16 @Experimental
17 public class SmartContentSelector implements Selector {
18
19 public SmartContentSelector() {
20 }
21
22 @Override
23 public String select(String html) {
24 html = html.replaceAll("(?is)<!DOCTYPE.*?>", "");
25 html = html.replaceAll("(?is)<!--.*?-->", "");
26 html = html.replaceAll("(?is)<script.*?>.*?</script>", "");
27 html = html.replaceAll("(?is)<style.*?>.*?</style>", "");
28 html = html.replaceAll("&.{2,5};|&#.{2,5};", " ");
29 html = html.replaceAll("(?is)<.*?>", "");
30 List<String> lines;
31 int blocksWidth =3;
32 int threshold =86;
33 int start;
34 int end;
35 StringBuilder text = new StringBuilder();
36 ArrayList<Integer> indexDistribution = new ArrayList<Integer>();
37
38 lines = Arrays.asList(html.split("\n"));
39
40 for (int i = 0; i < lines.size() - blocksWidth; i++) {
41 int wordsNum = 0;
42 for (int j = i; j < i + blocksWidth; j++) {
43 lines.set(j, lines.get(j).replaceAll("\\s+", ""));
44 wordsNum += lines.get(j).length();
45 }
46 indexDistribution.add(wordsNum);
47 }
48
49 start = -1; end = -1;
50 boolean boolstart = false, boolend = false;
51 text.setLength(0);
52
53 for (int i = 0; i < indexDistribution.size() - 1; i++) {
54 if (indexDistribution.get(i) > threshold && ! boolstart) {
55 if (indexDistribution.get(i+1).intValue() != 0
56 || indexDistribution.get(i+2).intValue() != 0
57 || indexDistribution.get(i+3).intValue() != 0) {
58 boolstart = true;
59 start = i;
60 continue;
61 }
62 }
63 if (boolstart) {
64 if (indexDistribution.get(i).intValue() == 0
65 || indexDistribution.get(i+1).intValue() == 0) {
66 end = i;
67 boolend = true;
68 }
69 }
70 StringBuilder tmp = new StringBuilder();
71 if (boolend) {
72
73 for (int ii = start; ii <= end; ii++) {
74 if (lines.get(ii).length() < 5) continue;
75 tmp.append(lines.get(ii) + "\n");
76 }
77 String str = tmp.toString();
78
79 if (str.contains("Copyright") ) continue;
80 text.append(str);
81 boolstart = boolend = false;
82 }
83 }
84 return text.toString();
85 }
86
87 @Override
88 public List<String> selectList(String text) {
89 throw new UnsupportedOperationException();
90 }
91 }