采集电子报纸

2014-01-01

采集电子报纸/** *报纸采集器 * @author 杨尚川 */public interface PaperCollector {/*** 下载当日报纸，

采集电子报纸

1、接口
/**
 * Newspaper collector.
 *
 * @author 杨尚川
 */
public interface PaperCollector {

    /**
     * Downloads today's newspaper; each returned file corresponds to one page.
     *
     * @return the newspaper files
     */
    List<File> collect();

    /**
     * Downloads the newspaper published on the given date; each returned file
     * corresponds to one page.
     *
     * @param date the date to collect
     * @return the newspaper files
     */
    List<File> collect(Date date);
}

2、抽象类

/** *报纸采集器抽象类，通用采集功能实现 * @author 杨尚川 */public abstract class AbstractPaperCollector implements PaperCollector{    protected final Logger LOG = LoggerFactory.getLogger(getClass());    @Override    public List<File> collect() {        return collect(new Date());    }    /**     * 根据下载链接提取文件夹名称     * @param href 下载链接     * @return 文件夹名称     */    protected abstract String getPath(String href);    /**     * 根据下载链接提取文件名称     * @param href 下载链接     * @return 文件名称     */    protected abstract String getFile(String href);    protected List<File> downloadPaper(List<String> hrefs){        final List<File> files = new ArrayList<>();        List<Thread> ts = new ArrayList<>();        LOG.info("报纸有"+hrefs.size()+"个版面需要下载:");        for(final String href : hrefs){               Thread t = new Thread(new Runnable(){                @Override                public void run() {                    File file = downloadPaper(href);                    if(file != null){                        files.add(file);                    }                }                            });            t.start();            ts.add(t);        }        for(Thread t : ts){            try {                t.join();            } catch (InterruptedException ex) {                LOG.error("下载报纸出错：",ex);            }        }        return files;    }    protected File downloadPaper(String href){        try{            LOG.info("下载报纸："+href);            String path = getPath(href);            LOG.debug("报纸保存目录："+path);            String file = getFile(href);            LOG.debug("报纸保存文件："+file);            File dir = new File(path);            if(!dir.exists()){                LOG.debug("创建目录："+dir.getAbsolutePath());                dir.mkdirs();            }            File absoluteFile = new File(path, file);            LOG.debug("报纸保存绝对路径："+absoluteFile.getAbsolutePath());            Tools.copyFile(new URL(href).openStream(), absoluteFile);            LOG.info("报纸下载成功："+href);            
LOG.info("报纸成功保存到："+absoluteFile.getAbsolutePath());            return absoluteFile;        }catch(IOException e){            LOG.error("报纸下载失败："+e);        }        return null;    }        protected void run() {        //今天        List<File> files = collect();        int i = 1;        for(File file : files){            LOG.info((i++)+" : " + file.getAbsolutePath());        }        //昨天        Date date = new Date();        date.setTime(System.currentTimeMillis()-24*3600*1000);        files = collect(date);        i = 1;        for(File file : files){            LOG.info((i++)+" : " + file.getAbsolutePath());        }        //前天        date = new Date();        date.setTime(System.currentTimeMillis()-2*24*3600*1000);        files = collect(date);        i = 1;        for(File file : files){            LOG.info((i++)+" : " + file.getAbsolutePath());        }    }}

3、采集新华日报

/** * 新华日报 * @author 杨尚川 */public class XHRBPaperCollector extends AbstractPaperCollector{    private static final String paperName = "新华日报";    private static final String paperPath = "http://xh.xhby.net/newxh/";    private static final String url = paperPath+"html/";    private static final String hrefPrefix = paperPath+"page/1/";    private static final String start = "node_2.htm";    private static final String pdfCssQuery = "html body table tbody tr td table tbody tr td table tbody tr td table tbody tr td div table tbody tr td a";    private static final SimpleDateFormat sf = new SimpleDateFormat("yyyy-MM/dd/");         @Override    public List<File> collect(Date date) {        List<String> hrefs = new ArrayList<>();        try {            LOG.debug("url: "+url);            String paper = url + sf.format(date) + start;            LOG.debug("paper: "+paper);            Document document = Jsoup.connect(paper).get();                        LOG.debug("pdfCssQuery: " + pdfCssQuery);            Elements elements = document.select(pdfCssQuery);            for(Element element : elements){                String href = element.attr("href");                if(href != null && href.endsWith(".pdf")){                    LOG.debug("报纸链接："+href);                    href = href.replace("../../../", "");                    LOG.debug("报纸链接："+href);                    hrefs.add(paperPath+href);                }else{                    LOG.debug("不是报纸链接："+href);                }            }                    } catch (IOException ex) {            LOG.error("采集出错",ex);        }        return downloadPaper(hrefs);    }    @Override    protected String getPath(String href) {        String path = href.replace(hrefPrefix, "");        String[] attrs = path.split("/");        attrs = attrs[0].split("-");        StringBuilder str = new StringBuilder();        str.append(paperName)            .append(File.separator)            .append(attrs[0])            .append("-")            
.append(attrs[1])            .append(File.separator)            .append(attrs[2]);        return str.toString();    }    @Override    protected String getFile(String href) {        String path = href.replace(hrefPrefix, "");        String[] attrs = path.split("/");        String file = attrs[1]+".pdf";        return file;    }    public static void main(String[] args) {        new XHRBPaperCollector().run();    }}

4、采集楚天都市报

/** * 楚天都市报 * @author 杨尚川 */public class CTDSBPaperCollector extends AbstractPaperCollector{    private static final String paperName = "楚天都市报";    private static final String host = "http://ctdsb.cnhubei.com/";    private static final String paperPath = host+"ctdsb/";    private static final String url = host+"html/ctdsb/";    private static final String hrefPrefix = paperPath;    private static final String start = "index.html";    private static final String pdfCssQuery = "html body center table tbody tr td table tbody tr td table tbody tr td table tbody tr td div table tbody tr td.info3 a";    private static final SimpleDateFormat sf = new SimpleDateFormat("yyyyMMdd/");         @Override    public List<File> collect(Date date) {        List<String> hrefs = new ArrayList<>();        try {            LOG.debug("url: "+url);            String paper = url + sf.format(date) + start;            LOG.debug("paper: "+paper);            Document document = Jsoup.connect(paper).get();                        LOG.debug("pdfCssQuery: " + pdfCssQuery);            Elements elements = document.select(pdfCssQuery);            int count=0;            for(Element element : elements){                String text = element.text();                if(text != null && text.startsWith("第")){                    LOG.debug("报纸文本："+text);                    count++;                }else{                    LOG.debug("不是报纸文本："+text);                }            }            //有的版面缺失，而文件名是顺序递增的            for(int i=1; i<=count; i++){                String seq = Integer.toString(i);                if(i<10){                    seq="0"+seq;                }                hrefs.add(paperPath + sf.format(date) + "page_"+seq+".jpg");            }        } catch (IOException ex) {            LOG.error("采集出错",ex);        }        return downloadPaper(hrefs);    }    @Override    protected String getPath(String href) {        String path = href.replace(hrefPrefix, "");        String[] attrs = 
path.split("/");        StringBuilder str = new StringBuilder();        str.append(paperName)            .append(File.separator)            .append(attrs[0].substring(0, 4))            .append("-")            .append(attrs[0].substring(4, 6))            .append(File.separator)            .append(attrs[0].substring(6, 8));        return str.toString();    }    @Override    protected String getFile(String href) {        String path = href.replace(hrefPrefix, "");        String[] attrs = path.split("/");        String file = attrs[1].split("_")[1];        return file;    }    public static void main(String[] args) {        new CTDSBPaperCollector().run();    }}

5、采集京九晚报

/** * 京九晚报 * @author 杨尚川 */public class JJWBPaperCollector extends AbstractPaperCollector{    private static final String paperName = "京九晚报";    private static final String paperPath = "http://epaper.cnsq.com.cn/jjwb/";    private static final String url = paperPath+"html/";    private static final String hrefPrefix = paperPath+"page/10/";    private static final String start = "node_11.htm";    private static final String pdfCssQuery = "html body table tbody tr td table tbody tr td table tbody tr td table tbody tr td div table tbody tr td a";    private static final SimpleDateFormat sf = new SimpleDateFormat("yyyy-MM/dd/");         @Override    public List<File> collect(Date date) {        List<String> hrefs = new ArrayList<>();        try {            LOG.debug("url: "+url);            String paper = url + sf.format(date) + start;            LOG.debug("paper: "+paper);            Document document = Jsoup.connect(paper).get();                        LOG.debug("pdfCssQuery: " + pdfCssQuery);            Elements elements = document.select(pdfCssQuery);            for(Element element : elements){                String href = element.attr("href");                if(href != null && href.endsWith(".pdf")){                    LOG.debug("报纸链接："+href);                    href = href.replace("../../../", "");                    LOG.debug("报纸链接："+href);                    hrefs.add(paperPath+href);                }else{                    LOG.debug("不是报纸链接："+href);                }            }                    } catch (IOException ex) {            LOG.error("采集出错",ex);        }        return downloadPaper(hrefs);    }    @Override    protected String getPath(String href) {        String path = href.replace(hrefPrefix, "");        String[] attrs = path.split("/");        StringBuilder str = new StringBuilder();        str.append(paperName)            .append(File.separator)            .append(attrs[0])            .append(File.separator)            .append(attrs[1]);       
 return str.toString();    }    @Override    protected String getFile(String href) {        String path = href.replace(hrefPrefix, "");        String[] attrs = path.split("/");        String file = attrs[2]+".pdf";        return file;    }    public static void main(String[] args) {        new JJWBPaperCollector().run();    }}

6、采集信息时报

/** * 信息时报 * @author 杨尚川 */public class XXSBPaperCollector extends AbstractPaperCollector{    private static final String paperName = "信息时报";    private static final String host = "http://informationtimes.dayoo.com/";    private static final String paperPath = host+"page/1019/";    private static final String url = host+"html/";    private static final String hrefPrefix = paperPath;    private static final String start = "node_1019.htm";    private static final String pdfCssQuery = "html body#content div.container div.leftcolumn div.leftcolumncontent div.pagebuttontwo div.con p.right span.dfive a";    private static final String subCssQuery = "html body#listcontent div.container div.rightcolumn div.subcbga div.listcontent div#all_article_list.list h4 span.left a";    private static final String contentCssQuery = "html body div.container div.leftcolumn div.tbga div.bbga div.cbga div.left div.pagepicture div map area";    private static final SimpleDateFormat sf = new SimpleDateFormat("yyyy-MM/dd/");         @Override    public List<File> collect(Date date) {        List<String> hrefs = new ArrayList<>();        try {            LOG.debug("url: "+url);            String paper = url + sf.format(date) + start;            LOG.debug("paper: "+paper);            Document document = Jsoup.connect(paper).get();                        //1、找到子报纸            LOG.debug("subCssQuery: " + subCssQuery);            Elements elements = document.select(subCssQuery);            for(Element element : elements){                String text = element.text();                String href = element.attr("href");                if(text != null && text.contains("：") && href != null && href.endsWith(".htm")){                    String subPaperURL = url + sf.format(date) + href;                    LOG.debug("子报纸文本："+text+" , "+href);                    LOG.debug("subPaperURL："+subPaperURL);                    //2、找到内容页面                    LOG.debug("contentCssQuery: " + contentCssQuery);           
         Elements contentElements = Jsoup.connect(subPaperURL).get().select(contentCssQuery);                    for(Element contentElement : contentElements){                        String h = contentElement.attr("href");                        if(h != null && h.startsWith("content_") && h.endsWith(".htm")){                            String contentURL = url + sf.format(date) + h;                            LOG.debug("contentURL："+contentURL);                            //3、找PDF                            LOG.debug("pdfCssQuery: " + pdfCssQuery);                            Elements pdfElements = Jsoup.connect(contentURL).get().select(pdfCssQuery);                            for(Element pdfElement : pdfElements){                                String pdf = pdfElement.attr("href");                                if(pdf != null && pdf.endsWith(".pdf")){                                    LOG.debug("报纸链接："+pdf);                                    pdf = pdf.replace("../../../", "");                                    LOG.debug("报纸链接："+pdf);                                    hrefs.add(host+pdf);                                }else{                                    LOG.debug("不是报纸链接："+pdf);                                }                            }                            //有多个content，选择一个即可                            break;                        }                    }                }else{                    LOG.debug("不是子报纸文本："+text+" , "+href);                }            }        } catch (IOException ex) {            LOG.error("采集出错",ex);        }        return downloadPaper(hrefs);    }    @Override    protected String getPath(String href) {        String path = href.replace(hrefPrefix, "");        String[] attrs = path.split("/");        StringBuilder str = new StringBuilder();        str.append(paperName)            .append(File.separator)            .append(attrs[0])            .append(File.separator)            .append(attrs[1]);        return 
str.toString();    }    @Override    protected String getFile(String href) {        String path = href.replace(hrefPrefix, "");        String[] attrs = path.split("/");        String file = attrs[2]+".pdf";        return file;    }    public static void main(String[] args) {        new XXSBPaperCollector().run();    }}

7、采集羊城晚报

/** * 羊城晚报 * @author 杨尚川 */public class YCWBPaperCollector extends AbstractPaperCollector{    private static final String paperName = "羊城晚报";    private static final String paperPath = "http://www.ycwb.com/ePaper/ycwb/";    private static final String url = paperPath+"html/";    private static final String hrefPrefix = paperPath+"images/";    private static final String start = "node_2081.htm";    private static final String pdfCssQuery = "html body div.cbody div.areaL div.box div.conBox2 div div.xx h2 em a.px12";    private static final SimpleDateFormat sf = new SimpleDateFormat("yyyy-MM/dd/");         @Override    public List<File> collect(Date date) {        List<String> hrefs = new ArrayList<>();        try {            LOG.debug("url: "+url);            String paper = url + sf.format(date) + start;            LOG.debug("paper: "+paper);            Document document = Jsoup.connect(paper).get();                        LOG.debug("pdfCssQuery: " + pdfCssQuery);            Elements elements = document.select(pdfCssQuery);            for(Element element : elements){                String href = element.attr("href");                if(href != null && href.endsWith(".pdf")){                    LOG.debug("报纸链接："+href);                    href = href.replace("../../../", "");                    LOG.debug("报纸链接："+href);                    hrefs.add(paperPath+href);                }else{                    LOG.debug("不是报纸链接："+href);                }            }                    } catch (IOException ex) {            LOG.error("采集出错",ex);        }        return downloadPaper(hrefs);    }    @Override    protected String getPath(String href) {        String path = href.replace(hrefPrefix, "");        String[] attrs = path.split("/");        StringBuilder str = new StringBuilder();        str.append(paperName)            .append(File.separator)            .append(attrs[0])            .append(File.separator)            .append(attrs[1]);        return str.toString();    }  
  @Override    protected String getFile(String href) {        String path = href.replace(hrefPrefix, "");        String[] attrs = path.split("/");        String file = attrs[2]+".pdf";        return file;    }    public static void main(String[] args) {        new YCWBPaperCollector().run();    }}

热点排行

互联网

采集电子报纸