java新闻抓取程序图片下载不全的问题
我做了个程序把新浪上的天气新闻抓过来存到本地,考虑访问速度问题,新闻中的图片也要保存到本地。
程序如下
package vnet.com.weather1;import java.io.BufferedReader;import java.io.ByteArrayOutputStream;import java.io.File;import java.io.FileWriter;import java.io.IOException;import java.io.InputStream;import java.io.InputStreamReader;import java.io.PrintWriter;import java.net.URL;import java.net.URLConnection;import java.util.regex.Matcher;import java.util.regex.Pattern;import org.apache.commons.logging.Log;import org.apache.commons.logging.LogFactory;import vnet.com.update.Getdata;/** * 正则方式抓取新浪天气新闻上的新闻 * 地址http://weather.news.sina.com.cn/weather/news/index.html * @param args */public class Newlist { private static final Log log = LogFactory.getLog(Newlist.class); /** * 测试 * @param args */ public static void main(String args[]){ Newlist n=new Newlist(); String[] k=n.getNewList(); for (int i=0;i<k.length;i++){ System.out.println(k[i].replace("href=\"", "href=\"newinfo2.jsp?url=")); } String[] m=n.getNewinfo("news/2008/1119/35261.html"); for (int l=0;l<m.length;l++){ System.out.println(m[l]); } } /** * 由url地址获得新闻内容string[] * 新闻中的图片下载到本地,文中新闻地址改成本地地址 * @param url * @return */ public String[] getNewinfo(String url){ String URL="http://weather.news.sina.com.cn/"+url; //30是指取30段满足给出的正则条件的字符串,如果只找出10个,那数组后面的全为null String[] s = analysis("<p>(.*?)</p>" , getContent(URL) , 30); for (int i=0;i<s.length;i++){ Pattern sp = Pattern.compile("src=\"(.*?)\""); Matcher matcher = sp.matcher(s[i]); if (matcher.find()){ String imageurl=analysis("src=\"(.*?)\"" , s[i] , 1)[0]; if(!imageurl.startsWith("http://")){ imageurl="http://weather.news.sina.com.cn/"+imageurl; } System.out.println("新闻有图片:"+imageurl); String content=getContent(imageurl); String[] images=imageurl.split("/"); String imagename=images[images.length-1]; System.out.println("图片名:"+imagename); try { File fwl = new File(imagename); PrintWriter outl = new PrintWriter(fwl); outl.println(content); outl.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } System.out.println("s[i]:"+s[i]); //修改文件图片地址 s[i]=s[i].replace(analysis("src=\"(.*?)\"" , s[i] , 1)[0], imagename); } } return s; } public String[] getNewList(){ String url="http://weather.news.sina.com.cn/weather/news/index.html"; return getNewList(getContent(url)); } private String[] getNewList(String content ){ //String[] s = analysis("align=\"center\" valign=\"top\"><img src=\"../images/a(.*?).gif\" width=\"70\" height=\"65\"></td>" , content , 50); String[] s = analysis("<li>(.*?)</li>" , content , 50); return s; } private String[] analysis(String pattern, String match , int i){ Pattern sp = Pattern.compile(pattern); Matcher matcher = sp.matcher(match); String[] content = new String[i]; for (int i1 = 0; matcher.find(); i1++){ content[i1] = matcher.group(1); } //下面一段是为了剔除为空的串 int l=0; for (int k=0;k<content.length;k++){ if (content[k]==null){ l=k; break; } } String[] content2; if (l!=0){ content2=new String[l]; for (int n=0;n<l;n++){ content2[n]=content[n]; } return content2; }else{ return content; } } /** * 由地址获取网页内容 * @param strUrl * @return private String getContent(String strUrl){ try{ //URL url = new URL(strUrl); //BufferedReader br = new BufferedReader(new InputStreamReader(url.openStream())); URLConnection uc = new URL(strUrl).openConnection(); //通过修改http头的User-Agent来伪装成是通过浏览器提交的请求 uc.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows XP; DigExt)"); System.out.println("-----------------------------------------"); System.out.println("Content-Length: "+uc.getContentLength()); System.out.println("Set-Cookie: "+uc.getHeaderField("Set-Cookie")); System.out.println("-----------------------------------------"); //获取文件头信息 System.out.println("Header"+uc.getHeaderFields().toString()); System.out.println("-----------------------------------------"); BufferedReader br=new BufferedReader(new InputStreamReader(uc.getInputStream(), "gb2312")); String s = ""; StringBuffer sb=new StringBuffer(); while((s = br.readLine())!=null){ sb.append(s+"\r\n"); } System.out.println("长度+"+sb.toString().length()); return sb.toString(); }catch(Exception e){ return "error open url" + strUrl; } } */ public static String getContent (String strUrl){ URLConnection uc = null; String all_content=null; try { all_content =new String(); URL url = new URL(strUrl); uc = url.openConnection(); uc.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows XP; DigExt)"); System.out.println("-----------------------------------------"); System.out.println("Content-Length: "+uc.getContentLength()); System.out.println("Set-Cookie: "+uc.getHeaderField("Set-Cookie")); System.out.println("-----------------------------------------"); //获取文件头信息 System.out.println("Header"+uc.getHeaderFields().toString()); System.out.println("-----------------------------------------"); if (uc == null) return null; InputStream ins = uc.getInputStream(); ByteArrayOutputStream outputstream = new ByteArrayOutputStream(); byte[] str_b = new byte[1024]; int i = -1; while ((i=ins.read(str_b)) > 0) { outputstream.write(str_b,0,i); } all_content = outputstream.toString(); // System.out.println(all_content); } catch (Exception e) { e.printStackTrace(); log.error("获取网页内容出错"); }finally{ uc = null; } // return new String(all_content.getBytes("ISO8859-1")); System.out.println(all_content.length()); return all_content; } }