View Javadoc
1   package us.codecraft.webmagic.utils;
2   
3   import org.apache.commons.lang3.StringUtils;
4   import us.codecraft.webmagic.Request;
5   
6   import java.net.MalformedURLException;
7   import java.net.URL;
8   import java.nio.charset.Charset;
9   import java.util.ArrayList;
10  import java.util.Collection;
11  import java.util.List;
12  import java.util.regex.Matcher;
13  import java.util.regex.Pattern;
14  
15  /**
16   * url and html utils.
17   *
18   * @author code4crafter@gmail.com <br>
19   * @since 0.1.0
20   */
21  public class UrlUtils {
22  
23      /**
24       * canonicalizeUrl
25       * <br>
26       * Borrowed from Jsoup.
27       *
28       * @param url url
29       * @param refer refer
30       * @return canonicalizeUrl
31       */
32      public static String canonicalizeUrl(String url, String refer) {
33          URL base;
34          try {
35              try {
36                  base = new URL(refer);
37              } catch (MalformedURLException e) {
38                  // the base is unsuitable, but the attribute may be abs on its own, so try that
39                  URL abs = new URL(refer);
40                  return abs.toExternalForm();
41              }
42              // workaround: java resolves '//path/file + ?foo' to '//path/?foo', not '//path/file?foo' as desired
43              if (url.startsWith("?"))
44                  url = base.getPath() + url;
45              URL abs = new URL(base, url);
46              return abs.toExternalForm();
47          } catch (MalformedURLException e) {
48              return "";
49          }
50      }
51  
52      /**
53       *
54       * @param url url
55       * @return new url
56       * @deprecated
57       */
58      public static String encodeIllegalCharacterInUrl(String url) {
59          return url.replace(" ", "%20");
60      }
61  
62      public static String fixIllegalCharacterInUrl(String url) {
63          //TODO more charator support
64          return url.replace(" ", "%20").replaceAll("#+", "#");
65      }
66  
67      public static String getHost(String url) {
68          String host = url;
69          int i = StringUtils.ordinalIndexOf(url, "/", 3);
70          if (i > 0) {
71              host = StringUtils.substring(url, 0, i);
72          }
73          return host;
74      }
75  
76      private static Pattern patternForProtocal = Pattern.compile("[\\w]+://");
77  
78      public static String removeProtocol(String url) {
79          return patternForProtocal.matcher(url).replaceAll("");
80      }
81  
82      public static String getDomain(String url) {
83          String domain = removeProtocol(url);
84          int i = StringUtils.indexOf(domain, "/", 1);
85          if (i > 0) {
86              domain = StringUtils.substring(domain, 0, i);
87          }
88          return removePort(domain);
89      }
90  
91      public static String removePort(String domain) {
92          int portIndex = domain.indexOf(":");
93          if (portIndex != -1) {
94              return domain.substring(0, portIndex);
95          }else {
96              return domain;
97          }
98      }
99  
100     public static List<Request> convertToRequests(Collection<String> urls) {
101         List<Request> requestList = new ArrayList<Request>(urls.size());
102         for (String url : urls) {
103             requestList.add(new Request(url));
104         }
105         return requestList;
106     }
107 
108     public static List<String> convertToUrls(Collection<Request> requests) {
109         List<String> urlList = new ArrayList<String>(requests.size());
110         for (Request request : requests) {
111             urlList.add(request.getUrl());
112         }
113         return urlList;
114     }
115 
116     private static final Pattern patternForCharset = Pattern.compile("charset\\s*=\\s*['\"]*([^\\s;'\"]*)", Pattern.CASE_INSENSITIVE);
117 
118     public static String getCharset(String contentType) {
119         if (contentType == null) {
120             return null;
121         }
122 
123         Matcher matcher = patternForCharset.matcher(contentType);
124         if (matcher.find()) {
125             String charset = matcher.group(1);
126             if (Charset.isSupported(charset)) {
127                 return charset;
128             }
129         }
130         return null;
131     }
132 
133 }