自己写的实时爬取 CSDN 2012 博客之星 88位候选人排名
如果觉得我的技术文章还有点可供列位看官汲取之处,
请给我投上宝贵的一票,以资鼓励,多谢,多谢!!本人ID:m13666368773 投票地址:http://vote.blog.csdn.net/item/blogstar/m13666368773
有幸入选 CSDN 2012 博客之星 88位候选人,但是排名不是很靠前,想看看自己距离前面几名 多少投票,遂写了这个 粗劣的程序,跑了一下
原理:由于评选页面的信息估计是异步读取的,所以只能逐个进入 88 名候选人的投票页面,获取有用信息:用户名、票数、排名。因此爬虫爬行时间有点慢,需要优化,不过基本上实现了排名。
程序如下:
package com.aptech;import java.io.BufferedReader;import java.io.InputStream;import java.io.InputStreamReader;import java.io.OutputStreamWriter;import java.net.HttpURLConnection;import java.net.URL;import java.text.SimpleDateFormat;import java.util.ArrayList;import java.util.Date;import java.util.HashMap;import java.util.List;import java.util.Map;import org.junit.Test;@SuppressWarnings("unchecked")public class TestPachongUrl {static Map messageMap = new HashMap();private static List list = new ArrayList();static String url = "http://vote.blog.csdn.net/item/blogstar/";static String user[] = new String[] { "Testing_is_believing", "t0nsha", "iukey", "yjflinchong", "taomanman", "chinafe", "hliq5399", "dog250", "qinjuning", "cheny_com", "v_JULY_v", "zhmxy555","Purpleendurer", "iihero", "yming0221", "ccanan", "tigerjb", "cheungmine", "hawksoft", "sheismylife", "hfahe", "cyq1984", "littletigerat", "kmyhy", "caimouse", "manoel", "xyz_lmn","hunkcai", "yiyaaixuexi", "norains", "clever101", "leftfist", "xiaominghimi", "niyi0318", "yanghuiliu", "abandonship", "mapdigit", "bill_man", "Augusdi", "LoveLion", "sunboy_2050","kongxx", "21aspnet", "chszs", "thl789", "mylxiaoyi", "akof1314", "yincheng01", "keyboardOTA", "pan_tian", "downmoon", "wangkuifeng0118", "robinson_0612", "bluishglc", "coolbacon","tangcheng_ok", "tianxiaode", "cjjky", "MoreWindows", "mr_raptor", "dojotoolkit", "chelsea", "chgaowei", "teamlet", "IBM_hoojo", "iefreer", "lee576", "jaminwm", "xuhuojun", "linghe301","caolaosanahnu", "ricohzhanglong", "totogo2010", "axman", "ce123", "rabbit729", "nkmnkm", "superdont", "m13666368773", "aomandeshangxiao", "hitlion2008", "siren0203", "feixiaoxing","Poechant", "cloudhsu", "Innost", "yanghua_kobe", "tianlesoftware" };@Testpublic static String test(URL url) throws Exception {/** * 首先要和URL下的URLConnection对话。 URLConnection可以很容易的从URL得到。比如: // Using * java.net.URL and //java.net.URLConnection */HttpURLConnection connection = (HttpURLConnection) url.openConnection();/** * 
然后把连接设为输出模式。URLConnection通常作为输入来使用,比如下载一个Web页。 * 通过把URLConnection设为输出,你可以把数据向你个Web页传送。下面是如何做: */connection.setDoOutput(true);connection.setRequestMethod("POST");connection.setRequestProperty("user-agent", "mozilla/4.7 [en] (win98; i)");connection.connect();/** * 最后,为了得到OutputStream,简单起见,把它约束在Writer并且放入POST信息中,例如: ... */OutputStreamWriter out = new OutputStreamWriter(connection.getOutputStream(), "UTF-8");out.flush();out.close();/** * 这样就可以发送一个看起来象这样的POST: POST /jobsearch/jobsearch.cgi HTTP 1.0 ACCEPT: * text/plain Content-type: application/x-www-form-urlencoded * Content-length: 99 username=bob password=someword */// 一旦发送成功,用以下方法就可以得到服务器的回应:String sCurrentLine;String sTotalString;sCurrentLine = "";sTotalString = "";InputStream l_urlStream;l_urlStream = connection.getInputStream();// 传说中的三层包装阿!BufferedReader l_reader = new BufferedReader(new InputStreamReader(l_urlStream));while ((sCurrentLine = l_reader.readLine()) != null) {sTotalString += sCurrentLine + "\r\n";}int begin0 = sTotalString.indexOf("博客地址:<a href=\"http://blog.csdn.net/");int end0 = sTotalString.indexOf("\" class=\"red\" target=\"_blank\">");int begin1 = sTotalString.indexOf("票数:<span class=\"red\">");int end1 = sTotalString.indexOf("</span> 票</li>");int begin2 = sTotalString.indexOf("当前排名:<span class=\"red\">");int end2 = sTotalString.indexOf("</span> 名</li>");String message = sTotalString.substring(begin0 + 35, end0) + "-" + sTotalString.substring(begin1 + 21, end1) + "=" + sTotalString.substring(begin2 + 23, end2);return message;}public static void main(String[] args) throws Exception {for (int i = 0; i < user.length; i++) {list.add(new URL(url + user[i]));}SimpleDateFormat dateformat = new SimpleDateFormat("yyyy年MM月dd日 HH时mm分ss秒 E ");String nowTime = dateformat.format(new Date());System.out.println("统计时间:" + nowTime);System.out.println("候选人数量:" + user.length);System.out.println(addChinaBlank("用户名") + addChinaBlank("票数") + "排名");for (int i = 0; i < list.size(); i++) {String subMessage = test((URL) 
list.get(i));String key = subMessage.substring(subMessage.indexOf("=") + 1, subMessage.length());messageMap.put(key, subMessage);}for (int i = 1; i <= 88; i++) {String endMessage = messageMap.get("" + i).toString();System.out.println(addBlank(endMessage.substring(0, endMessage.indexOf("-"))) + endMessage.substring(endMessage.indexOf("-") + 1, endMessage.indexOf("=")) + ""+ endMessage.substring(endMessage.indexOf("=") + 1, endMessage.length()));}}public static String addBlank(String user) {String blank = " ";int userLength = user.length();for (int i = 0; i < 30 - userLength; i++) {user += blank;}return user;}public static String addChinaBlank(String message) {String blank = " ";int userLength = message.length() * 2;for (int i = 0; i < 70 - userLength; i++) {message += blank;}return message;}}
运行一下:
统计时间:2012年12月08日 13时31分24秒 星期六 候选人数量:88用户名 票数 排名v_JULY_v 11331yincheng01 8272MoreWindows 4463mr_raptor 3714yiyaaixuexi 3505LoveLion 2986ricohzhanglong 2957tianlesoftware 2438xiaominghimi 2369taomanman 22510yming0221 20511zhmxy555 18012Poechant 16513aomandeshangxiao 14614linghe301 13315hawksoft 12516nkmnkm 11217cjjky 11218niyi0318 9019cyq1984 8720clever101 8421cloudhsu 7922akof1314 7023Testing_is_believing 6524cheny_com 5725yanghuiliu 5226lee576 4527manoel 4528bill_man 4329hfahe 4330tangcheng_ok 4131teamlet 4132dojotoolkit 4133cheungmine 4134yjflinchong 4135norains 4036sheismylife 3837m13666368773 3638coolbacon 3639pan_tian 3440sunboy_2050 3441qinjuning 3142Augusdi 304321aspnet 2944tigerjb 2945axman 2946mapdigit 2947downmoon 2748chgaowei 2749ce123 2650mylxiaoyi 2651dog250 2552t0nsha 2553feixiaoxing 2554thl789 2555kongxx 2556abandonship 2457iukey 2358caimouse 2259caolaosanahnu 2260xyz_lmn 2161robinson_0612 2062IBM_hoojo 2063Innost 2064wangkuifeng0118 1965iihero 1966hunkcai 1967rabbit729 1968chelsea 1969totogo2010 1870tianxiaode 1871Purpleendurer 1872yanghua_kobe 1773jaminwm 1674iefreer 1675siren0203 1676ccanan 1577littletigerat 1578kmyhy 1579chszs 1580superdont 1481keyboardOTA 1482leftfist 1483chinafe 1384hliq5399 1285bluishglc 1086hitlion2008 987xuhuojun 788