使用sitemapgen4j结合htmlParser生成网站的sitemap.xml

2012-12-27

使用sitemapgen4j结合htmlParser生成网站的sitemap.xml：今天尝试写了小段代码来给别人的网站生成sitemap.xml文件。

使用sitemapgen4j结合htmlParser生成网站的sitemap.xml
今天尝试写了小段代码来给别人的网站生成sitemap.xml文件。
在google code中找到sitemapgen4j 开源组件，专门针对生成对应google search engine 的sitemap.xml文件。

sitemapgen4j 组件的主页：http://code.google.com/p/sitemapgen4j/
上面包括组件的源代码和简易的使用教程，懂点英文的就动手了。

sitemapgen4j 组件主要是针对你提供的URL以及该URL对应的属性，生成sitemap中如下这段xml：

<url>  <loc>http://www.google.com.hk</loc>  <lastmod>2010-09-29</lastmod>  <changefreq>weekly</changefreq>  <priority>0.7</priority></url>

而一个网站的sitemap.xml文件中需要包括提供给搜索引擎收录的URL，那如何获取网站上的提供给SE
收录的URL呢，这个当然用到htmlParser了。关于这个组件，大家可以去google下，
包括功能介绍和教程；同时这个组件中用的几个设计模式都非常典型，学习设计模式的可以作为参考。

生成sitemap.xml的主要思路：
1、从网站首页开始使用htmlParser，采取广度优先策略抓取网站上的URL
2、使用sitemapgen4j针对每一个URL设置对应的属性，并生成XML

具体代码如下:
a、扩展WebSitemapUrl，以便能在队列中方便控制

/**
 * Contract for a URL whose eligibility for crawling can be queried and
 * switched off.
 */
public interface CrawlUrl {

    /**
     * Reports whether this URL may still be crawled.
     *
     * @return {@code true} if the URL may be crawled, {@code false} otherwise
     */
    boolean canCrawl();

    /**
     * Marks this URL as no longer eligible for crawling.
     */
    void disable();
}

import org.apache.commons.lang.StringUtils;

import com.redfin.sitemapgenerator.WebSitemapUrl;

/**
 * A {@link WebSitemapUrl} that additionally carries a "may still be crawled"
 * flag, so the crawler can manage it in its work queue.
 *
 * <p>Two instances are equal when the external form of their URLs is the same
 * string; the crawl flag does not take part in equality or hashing.
 */
public class ExtWebSiteMapUrl extends WebSitemapUrl implements CrawlUrl {

    /** Whether this URL may still be crawled; starts out {@code true}. */
    private boolean canCrawl = true;

    /**
     * Creates a crawlable sitemap URL from the given options.
     *
     * @param options the sitemap URL options (location, last-modified, priority, ...)
     */
    public ExtWebSiteMapUrl(Options options) {
        super(options);
    }

    /**
     * @return {@code true} until {@link #disable()} has been called
     */
    @Override
    public boolean canCrawl() {
        return canCrawl;
    }

    /**
     * Marks this URL as no longer crawlable.
     */
    @Override
    public void disable() {
        canCrawl = false;
    }

    /**
     * Compares by the external string form of the URL.
     *
     * @param obj the object to compare with
     * @return {@code true} if {@code obj} is an {@code ExtWebSiteMapUrl} with
     *         the same URL string
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (obj instanceof ExtWebSiteMapUrl) {
            ExtWebSiteMapUrl other = (ExtWebSiteMapUrl) obj;
            return StringUtils.equals(other.getUrlStr(), getUrlStr());
        }
        return false;
    }

    /**
     * Hashes the same URL string that {@link #equals(Object)} compares, so that
     * equal instances behave correctly in hash-based collections.
     *
     * @return the hash code of the URL's external form
     */
    @Override
    public int hashCode() {
        return getUrlStr().hashCode();
    }

    /**
     * @return the external string form of this entry's URL
     */
    public String getUrlStr() {
        return super.getUrl().toExternalForm();
    }
}

import java.io.File;import java.net.MalformedURLException;import java.util.Date;import java.util.LinkedList;import java.util.Queue;import java.util.TimeZone;import org.apache.commons.collections.CollectionUtils;import org.apache.commons.lang.StringUtils;import org.htmlparser.Parser;import org.htmlparser.filters.NodeClassFilter;import org.htmlparser.tags.LinkTag;import org.htmlparser.util.NodeIterator;import org.htmlparser.util.NodeList;import org.htmlparser.util.ParserException;import com.redfin.sitemapgenerator.ChangeFreq;import com.redfin.sitemapgenerator.W3CDateFormat;import com.redfin.sitemapgenerator.WebSitemapGenerator;import com.redfin.sitemapgenerator.WebSitemapUrl;public class HtmlCrawler {    private static NodeClassFilter LINK_FILTER = new NodeClassFilter(            LinkTag.class);    private static Parser parser = new Parser();    private static File dir = new File("D:\\sitemaptest");    private static String BASE_PREFIX = "http://www.xxxx.com";    private static WebSitemapGenerator wsg = null;    static {        W3CDateFormat dateFormat = new W3CDateFormat(W3CDateFormat.Pattern.DAY);        dateFormat.setTimeZone(TimeZone.getTimeZone("GMT+8"));        try {            wsg = WebSitemapGenerator.builder(BASE_PREFIX, dir).dateFormat(                    dateFormat).build();        } catch (MalformedURLException e) {            System.out.println("the start url [" + BASE_PREFIX                    + "] is malformed");        }    }    public static void main(String[] args) throws ParserException,            MalformedURLException {        ExtWebSiteMapUrl startUrl = new ExtWebSiteMapUrl(                new WebSitemapUrl.Options("http://www.xxxx.com").lastMod(                        new Date()).priority(0.9).changeFreq(ChangeFreq.WEEKLY));        Queue<ExtWebSiteMapUrl> queue = new LinkedList<ExtWebSiteMapUrl>();        queue.add(startUrl);        crawl(queue, wsg);        System.out.println("done");    }    /** *//**     * 检测是否为同一个域下的url     *      * @param 
url     * @param basePrefix     * @return     */    public static boolean check(String url, String basePrefix) {        return StringUtils.isNotBlank(url) ? url.startsWith(basePrefix) : false;    }    /** *//**     * 使用队列循环抓取页面上的URL     *      * @param queue     * @param wsg     */    public static void crawl(Queue<ExtWebSiteMapUrl> queue,            WebSitemapGenerator wsg) {        if (CollectionUtils.isEmpty(queue)) {            return;        }        Queue<ExtWebSiteMapUrl> crawled = new LinkedList<ExtWebSiteMapUrl>();        do {            ExtWebSiteMapUrl url = queue.poll();            crawled.add(url);            if (url != null && url.canCrawl()) {                try {                    parser.setURL(url.getUrl().toExternalForm());                    NodeList list = parser.parse(LINK_FILTER);                    for (NodeIterator iter = list.elements(); iter                            .hasMoreNodes();) {                        String link = ((LinkTag) iter.nextNode()).getLink();                        ExtWebSiteMapUrl newUrl = null;                        try {                            newUrl = new ExtWebSiteMapUrl(                                    new WebSitemapUrl.Options(link).lastMod(                                            new Date()).priority(0.7)                                            .changeFreq(ChangeFreq.WEEKLY));                        } catch (MalformedURLException e) {                            System.out.println("the url [" + link                                    + "] is malformed");                            continue;                        }                        if (check(link, BASE_PREFIX) && !queue.contains(newUrl)                                && !crawled.contains(newUrl)) {                            queue.add(newUrl);                            wsg.addUrl(newUrl);                        }                    }                } catch (ParserException e) {                    System.out.println("can not parser the url : 
"                            + url.getUrl());                } finally {                    url.disable();                }            }        } while (queue.size() > 0);        wsg.write();    }}

热点排行

CSS

使用sitemapgen4j结合htmlParser生成网站的sitemap.xml