1 package us.codecraft.webmagic.selector;
2
3 import org.apache.commons.lang3.StringUtils;
4
5 import java.util.ArrayList;
6 import java.util.List;
7 import java.util.regex.Matcher;
8 import java.util.regex.Pattern;
9 import java.util.regex.PatternSyntaxException;
10
11
12
13
14
15
16
17 public class RegexSelector implements Selector {
18
19 private String regexStr;
20
21 private Pattern regex;
22
23 private int group = 1;
24
25 public RegexSelector(String regexStr, int group) {
26 this.compileRegex(regexStr);
27 this.group = group;
28 }
29
30 private void compileRegex(String regexStr) {
31 if (StringUtils.isBlank(regexStr)) {
32 throw new IllegalArgumentException("regex must not be empty");
33 }
34 try {
35 this.regex = Pattern.compile(regexStr, Pattern.DOTALL | Pattern.CASE_INSENSITIVE);
36 this.regexStr = regexStr;
37 } catch (PatternSyntaxException e) {
38 throw new IllegalArgumentException("invalid regex "+regexStr, e);
39 }
40 }
41
42
43
44
45
46 public RegexSelector(String regexStr) {
47 this.compileRegex(regexStr);
48 if (regex.matcher("").groupCount() == 0) {
49 this.group = 0;
50 } else {
51 this.group = 1;
52 }
53 }
54
55 @Override
56 public String select(String text) {
57 return selectGroup(text).get(group);
58 }
59
60 @Override
61 public List<String> selectList(String text) {
62 List<String> strings = new ArrayList<String>();
63 List<RegexResult> results = selectGroupList(text);
64 for (RegexResult result : results) {
65 strings.add(result.get(group));
66 }
67 return strings;
68 }
69
70 public RegexResult selectGroup(String text) {
71 Matcher matcher = regex.matcher(text);
72 if (matcher.find()) {
73 String[] groups = new String[matcher.groupCount() + 1];
74 for (int i = 0; i < groups.length; i++) {
75 groups[i] = matcher.group(i);
76 }
77 return new RegexResult(groups);
78 }
79 return RegexResult.EMPTY_RESULT;
80 }
81
82 public List<RegexResult> selectGroupList(String text) {
83 Matcher matcher = regex.matcher(text);
84 List<RegexResult> resultList = new ArrayList<RegexResult>();
85 while (matcher.find()) {
86 String[] groups = new String[matcher.groupCount() + 1];
87 for (int i = 0; i < groups.length; i++) {
88 groups[i] = matcher.group(i);
89 }
90 resultList.add(new RegexResult(groups));
91 }
92 return resultList;
93 }
94
95 @Override
96 public String toString() {
97 return regexStr;
98 }
99
100 }