1 package us.codecraft.webmagic.utils;
2
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.nio.charset.Charset;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
13
14
15
16
17
18
19
20 public abstract class CharsetUtils {
21
22 private static Logger logger = LoggerFactory.getLogger(CharsetUtils.class);
23
24 public static String detectCharset(String contentType, byte[] contentBytes) throws IOException {
25 String charset;
26
27
28 charset = UrlUtils.getCharset(contentType);
29 if (StringUtils.isNotBlank(contentType) && StringUtils.isNotBlank(charset)) {
30 logger.debug("Auto get charset: {}", charset);
31 return charset;
32 }
33
34 Charset defaultCharset = Charset.defaultCharset();
35 String content = new String(contentBytes, defaultCharset);
36
37 if (StringUtils.isNotEmpty(content)) {
38 Document document = Jsoup.parse(content);
39 Elements links = document.select("meta");
40 for (Element link : links) {
41
42 String metaContent = link.attr("content");
43 String metaCharset = link.attr("charset");
44 if (metaContent.indexOf("charset") != -1) {
45 metaContent = metaContent.substring(metaContent.indexOf("charset"), metaContent.length());
46 charset = metaContent.split("=")[1];
47 break;
48 }
49
50 else if (StringUtils.isNotEmpty(metaCharset)) {
51 charset = metaCharset;
52 break;
53 }
54 }
55 }
56 logger.debug("Auto get charset: {}", charset);
57
58 return charset;
59 }
60
61 }