View Javadoc
1   package us.codecraft.webmagic.selector;
2   
3   import org.apache.commons.lang3.StringUtils;
4   
5   import java.util.ArrayList;
6   import java.util.List;
7   import java.util.regex.Matcher;
8   import java.util.regex.Pattern;
9   import java.util.regex.PatternSyntaxException;
10  
11  /**
12   * Selector in regex.<br>
13   *
14   * @author code4crafter@gmail.com <br>
15   * @since 0.1.0
16   */
17  public class RegexSelector implements Selector {
18  
19      private String regexStr;
20  
21      private Pattern regex;
22  
23      private int group = 1;
24  
25      public RegexSelector(String regexStr, int group) {
26          this.compileRegex(regexStr);
27          this.group = group;
28      }
29  
30      private void compileRegex(String regexStr) {
31          if (StringUtils.isBlank(regexStr)) {
32              throw new IllegalArgumentException("regex must not be empty");
33          }
34          try {
35              this.regex = Pattern.compile(regexStr, Pattern.DOTALL | Pattern.CASE_INSENSITIVE);
36              this.regexStr = regexStr;
37          } catch (PatternSyntaxException e) {
38              throw new IllegalArgumentException("invalid regex "+regexStr, e);
39          }
40      }
41  
42      /**
43       * Create a RegexSelector. When there is no capture group, the value is set to 0 else set to 1.
44       * @param regexStr the regular expression.
45       */
46      public RegexSelector(String regexStr) {
47          this.compileRegex(regexStr);
48          if (regex.matcher("").groupCount() == 0) {
49              this.group = 0;
50          } else {
51              this.group = 1;
52          }
53      }
54  
55      @Override
56      public String select(String text) {
57          return selectGroup(text).get(group);
58      }
59  
60      @Override
61      public List<String> selectList(String text) {
62          List<String> strings = new ArrayList<String>();
63          List<RegexResult> results = selectGroupList(text);
64          for (RegexResult result : results) {
65              strings.add(result.get(group));
66          }
67          return strings;
68      }
69  
70      public RegexResult selectGroup(String text) {
71          Matcher matcher = regex.matcher(text);
72          if (matcher.find()) {
73              String[] groups = new String[matcher.groupCount() + 1];
74              for (int i = 0; i < groups.length; i++) {
75                  groups[i] = matcher.group(i);
76              }
77              return new RegexResult(groups);
78          }
79          return RegexResult.EMPTY_RESULT;
80      }
81  
82      public List<RegexResult> selectGroupList(String text) {
83          Matcher matcher = regex.matcher(text);
84          List<RegexResult> resultList = new ArrayList<RegexResult>();
85          while (matcher.find()) {
86              String[] groups = new String[matcher.groupCount() + 1];
87              for (int i = 0; i < groups.length; i++) {
88                  groups[i] = matcher.group(i);
89              }
90              resultList.add(new RegexResult(groups));
91          }
92          return resultList;
93      }
94  
95      @Override
96      public String toString() {
97          return regexStr;
98      }
99  
100 }