java用htmlparser爬取页面后的中文内容问题
问题是这样的:我要爬取的网页编码是utf-8的,我在parser里面设置请求编码格式utf-8,然后爬取后的内容就是这样的:<a class="a_topic" href="http:\/\/huati.weibo.com\/k\/%E7%BB%A7%E6%89%BF%E8%80%85%E4%BB%AC?from=526" target="_blank">#\u7ee7\u627f\u8005\u4eec#<\/a>\u8f66\u6069\u5c1a\u548c\u5d14\u82f1\u9053\u7684\u7f57\u66fc\u53f2\u5373\u5c06\u4e0a\u6f14\uff01\u660e\u665a\u5c06\u64ad\u51fa\u7684\u7b2c9\u96c6\u4e2d\uff0c\u6069\u5c1a\u5728\u5496\u5561\u5e97\u88ab\u4e00\u540d\u7537\u5ba2\u4eba\u7ea0\u7f20\uff0c\u6b64\u65f6\u82f1\u9053\u633a\u8eab\u800c\u51fa\u66ff\u5979\u89e3\u56f4\u2026\u2026\u8be6\u60c5\uff1a 是不是应该对这些做什么进一步处理啊?而且还有一个问题是我测试其他的一些编码为utf-8的网页,显示内容有中文 都是正常的。实在搞不懂是为什么啊,求大神指导,谢谢! java htmlparser 中文
[解决办法]
<a class="a_topic" href="http:\/\/huati.weibo.com\/k\/%E7%BB%A7%E6%89%BF%E8%80%85%E4%BB%AC?from=526" target="_blank">#继承者们#<\/a>车恩尚和崔英道的罗曼史即将上演!明晚将播出的第9集中,恩尚在咖啡店被一名男客人纠缠,此时英道挺身而出替她解围……详情:
[解决办法]
public static void main(String[] args) {
String str = "<a class="a_topic" href="http://huati.weibo.com/k/%E7%BB%A7%E6%89%BF%E8%80%85%E4%BB%AC?from=526" target="_blank">#\u7ee7\u627f\u8005\u4eec#</a>\u8f66\u6069\u5c1a\u548c\u5d14\u82f1\u9053\u7684\u7f57\u66fc\u53f2\u5373\u5c06\u4e0a\u6f14\uff01\u660e\u665a\u5c06\u64ad\u51fa\u7684\u7b2c9\u96c6\u4e2d\uff0c\u6069\u5c1a\u5728\u5496\u5561\u5e97\u88ab\u4e00\u540d\u7537\u5ba2\u4eba\u7ea0\u7f20\uff0c\u6b64\u65f6\u82f1\u9053\u633a\u8eab\u800c\u51fa\u66ff\u5979\u89e3\u56f4\u2026\u2026\u8be6\u60c5\uff1a";
System.out.println(decodeUnicode(str));
}
public static String decodeUnicode(String theString) {
char aChar;
int len = theString.length();
StringBuffer outBuffer = new StringBuffer(len);
for (int x = 0; x < len;) {
aChar = theString.charAt(x++);
if (aChar == '\\') {
aChar = theString.charAt(x++);
if (aChar == 'u') {
// Read the xxxx
int value = 0;
for (int i = 0; i < 4; i++) {
aChar = theString.charAt(x++);
switch (aChar) {
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
value = (value << 4) + aChar - '0';
break;
case 'a':
case 'b':
case 'c':
case 'd':
case 'e':
case 'f':
value = (value << 4) + 10 + aChar - 'a';
break;
case 'A':
case 'B':
case 'C':
case 'D':
case 'E':
case 'F':
value = (value << 4) + 10 + aChar - 'A';
break;
default:
throw new IllegalArgumentException(
"Malformed \\uxxxx encoding.");
}
}
outBuffer.append((char) value);
} else {
if (aChar == 't')
aChar = '\t';
else if (aChar == 'r')
aChar = '\r';
else if (aChar == 'n')
aChar = '\n';
else if (aChar == 'f')
aChar = '\f';
outBuffer.append(aChar);
}
} else
outBuffer.append(aChar);
}
return outBuffer.toString();
}