1 package us.codecraft.webmagic.utils;
2
3 import org.apache.commons.lang3.StringUtils;
4 import us.codecraft.webmagic.Request;
5
6 import java.net.MalformedURLException;
7 import java.net.URL;
8 import java.nio.charset.Charset;
9 import java.util.ArrayList;
10 import java.util.Collection;
11 import java.util.List;
12 import java.util.regex.Matcher;
13 import java.util.regex.Pattern;
14
15
16
17
18
19
20
21 public class UrlUtils {
22
23
24
25
26
27
28
29
30
31
32 public static String canonicalizeUrl(String url, String refer) {
33 URL base;
34 try {
35 try {
36 base = new URL(refer);
37 } catch (MalformedURLException e) {
38
39 URL abs = new URL(refer);
40 return abs.toExternalForm();
41 }
42
43 if (url.startsWith("?"))
44 url = base.getPath() + url;
45 URL abs = new URL(base, url);
46 return abs.toExternalForm();
47 } catch (MalformedURLException e) {
48 return "";
49 }
50 }
51
52
53
54
55
56
57
58 public static String encodeIllegalCharacterInUrl(String url) {
59 return url.replace(" ", "%20");
60 }
61
62 public static String fixIllegalCharacterInUrl(String url) {
63
64 return url.replace(" ", "%20").replaceAll("#+", "#");
65 }
66
67 public static String getHost(String url) {
68 String host = url;
69 int i = StringUtils.ordinalIndexOf(url, "/", 3);
70 if (i > 0) {
71 host = StringUtils.substring(url, 0, i);
72 }
73 return host;
74 }
75
76 private static Pattern patternForProtocal = Pattern.compile("[\\w]+://");
77
78 public static String removeProtocol(String url) {
79 return patternForProtocal.matcher(url).replaceAll("");
80 }
81
82 public static String getDomain(String url) {
83 String domain = removeProtocol(url);
84 int i = StringUtils.indexOf(domain, "/", 1);
85 if (i > 0) {
86 domain = StringUtils.substring(domain, 0, i);
87 }
88 return removePort(domain);
89 }
90
91 public static String removePort(String domain) {
92 int portIndex = domain.indexOf(":");
93 if (portIndex != -1) {
94 return domain.substring(0, portIndex);
95 }else {
96 return domain;
97 }
98 }
99
100 public static List<Request> convertToRequests(Collection<String> urls) {
101 List<Request> requestList = new ArrayList<Request>(urls.size());
102 for (String url : urls) {
103 requestList.add(new Request(url));
104 }
105 return requestList;
106 }
107
108 public static List<String> convertToUrls(Collection<Request> requests) {
109 List<String> urlList = new ArrayList<String>(requests.size());
110 for (Request request : requests) {
111 urlList.add(request.getUrl());
112 }
113 return urlList;
114 }
115
116 private static final Pattern patternForCharset = Pattern.compile("charset\\s*=\\s*['\"]*([^\\s;'\"]*)", Pattern.CASE_INSENSITIVE);
117
118 public static String getCharset(String contentType) {
119 if (contentType == null) {
120 return null;
121 }
122
123 Matcher matcher = patternForCharset.matcher(contentType);
124 if (matcher.find()) {
125 String charset = matcher.group(1);
126 if (Charset.isSupported(charset)) {
127 return charset;
128 }
129 }
130 return null;
131 }
132
133 }