// 使用 Apache HttpClient 工具模拟百度蜘蛛或浏览器抓取和解压 gzip 网页

package httpclient;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Locale;
import java.util.zip.GZIPInputStream;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
/**
?* 使用 Apache HttpClient 工具模拟百度蜘蛛或浏览器抓取和解压gzip网页
?* @author Ivan
?*
?*/
?
public class HttpClientTest {
?
/**
* @param args
* @throws IOException
* @throws HttpException
*/
public static void main(String[] args) throws HttpException, IOException {
?
HttpClient httpclient = new HttpClient();// 创建一个客户端,类似打开一个浏览器
// httpclient.getParams().setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY);
// httpclient.getParams().setParameter("http.protocol.single-cookie-header",true);
?
GetMethod getMethod = new GetMethod("http://www.iteye.com");//http://itindex.net
?
// getMethod.setRequestHeader("Host", "laohuang.iteye.com");
// getMethod.setRequestHeader("Connection", "Keep-Alive");
// getMethod.setRequestHeader("Accept", "*/*");
// getMethod.setRequestHeader("From", "goolebot@googlebot.com");
// getMethod.setRequestHeader("User-Agent",
// "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)");
// getMethod.setRequestHeader("Accept-Encoding", "gzip, deflate");
?
// baidu
getMethod.setRequestHeader("Host", "www.iteye?.com");//itindex.net
getMethod.setRequestHeader("Connection", "Keep-Alive");
getMethod.setRequestHeader("Accept", "*/*");
getMethod.setRequestHeader("From", "goolebot@googlebot.com");
getMethod
.setRequestHeader(
"User-Agent",
"Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)");
getMethod.setRequestHeader("Accept-Encoding", "gzip");
?
?
int statusCode = httpclient.executeMethod(getMethod);
?
?
System.out.println(getMethod.getResponseCharSet());
System.out.println(getMethod.getResponseHeader("Content-Encoding"));
System.out.println(getBodyAsString(getMethod,getMethod.getResponseCharSet()));
?
?
?
}
?
public static String getBodyAsString(GetMethod getHC, String charset)
throws IOException {
String acceptEncoding = "";
if (getHC.getResponseHeader("Content-Encoding") != null)
acceptEncoding = getHC.getResponseHeader("Content-Encoding")
.getValue();
StringBuffer sb = new StringBuffer();
?
if (acceptEncoding.toLowerCase().indexOf("gzip") > -1) {
// 建立gzip解压工作流
InputStream is = getHC.getResponseBodyAsStream();
GZIPInputStream gzin = new GZIPInputStream(is);
InputStreamReader isr = new InputStreamReader(gzin, charset); // 设置读取流的编码格式,自定义编码
java.io.BufferedReader br = new java.io.BufferedReader(isr);
String tempbf;
while ((tempbf = br.readLine()) != null) {
sb.append(tempbf);
sb.append("\r\n");
}
isr.close();
gzin.close();
} else {
InputStreamReader isr = new InputStreamReader(getHC
.getResponseBodyAsStream(), charset); // 设置读取流的编码格式,自定义编码
java.io.BufferedReader br = new java.io.BufferedReader(isr);
String tempbf;
while ((tempbf = br.readLine()) != null) {
sb.append(tempbf);
sb.append("\r\n");
}
isr.close();
}
getHC.abort();
getHC.releaseConnection();
return sb.toString();
}
?
}

// Via http://itindex.net