View Javadoc
1   package us.codecraft.webmagic.utils;
2   
3   import org.apache.commons.lang3.StringUtils;
4   import org.jsoup.Jsoup;
5   import org.jsoup.nodes.Document;
6   import org.jsoup.nodes.Element;
7   import org.jsoup.select.Elements;
8   import org.slf4j.Logger;
9   import org.slf4j.LoggerFactory;
10  
11  import java.io.IOException;
12  import java.nio.charset.Charset;
13  
14  /**
15   * @author code4crafter@gmail.com
16   *         Date: 17/3/11
17   *         Time: 10:36
18   * @since 0.6.2
19   */
20  public abstract class CharsetUtils {
21  
22      private static Logger logger = LoggerFactory.getLogger(CharsetUtils.class);
23  
24      public static String detectCharset(String contentType, byte[] contentBytes) throws IOException {
25          String charset;
26          // charset
27          // 1、encoding in http header Content-Type
28          charset = UrlUtils.getCharset(contentType);
29          if (StringUtils.isNotBlank(contentType) && StringUtils.isNotBlank(charset)) {
30              logger.debug("Auto get charset: {}", charset);
31              return charset;
32          }
33          // use default charset to decode first time
34          Charset defaultCharset = Charset.defaultCharset();
35          String content = new String(contentBytes, defaultCharset);
36          // 2、charset in meta
37          if (StringUtils.isNotEmpty(content)) {
38              Document document = Jsoup.parse(content);
39              Elements links = document.select("meta");
40              for (Element link : links) {
41                  // 2.1、html4.01 <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
42                  String metaContent = link.attr("content");
43                  String metaCharset = link.attr("charset");
44                  if (metaContent.indexOf("charset") != -1) {
45                      metaContent = metaContent.substring(metaContent.indexOf("charset"), metaContent.length());
46                      charset = metaContent.split("=")[1];
47                      break;
48                  }
49                  // 2.2、html5 <meta charset="UTF-8" />
50                  else if (StringUtils.isNotEmpty(metaCharset)) {
51                      charset = metaCharset;
52                      break;
53                  }
54              }
55          }
56          logger.debug("Auto get charset: {}", charset);
57          // 3、todo use tools as cpdetector for content decode
58          return charset;
59      }
60      
61  }