Building My Own DSL, Part 1: A Simple Crawler
Please credit the source when reposting: http://fuliang.iteye.com/blog/1122008
I often need to crawl content from the web and turn it into corpora for classification work, so I want a flexible crawl-and-extract program, a DSL of my own, so that a few lines of code are enough to get whatever content I need. For example, I would like the following few lines to pull down the posts from my blog:
crawler = Crawler.new
1.upto(10) do |pn|
  urls = []
  # first pass: collect the URL of every post on listing page pn
  crawler.fetch "http://fuliang.iteye.com/?page=#{pn}" do |page|
    page.css("div.blog_title > h3 > a").each do |node|
      urls << "http://fuliang.iteye.com#{node.attributes['href']}"
    end
  end

  # second pass: fetch each post and print its title and body, tab-separated
  urls.each do |url|
    crawler.fetch url do |page|
      page.xpath(:title   => '//*[@id="main"]/div/div[2]/h3/a',
                 :content => '//*[@id="blog_content"]').each do |entry|
        printf("%s\t%s\n", entry[:title].text.gsub(/\s+/, ""),
                           entry[:content].text.gsub(/\s+/, ""))
      end
    end
  end
  break # stop after the first listing page; remove to crawl all ten
end
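Two calling conventions are at work here (my reading of the Page class defined further down, sketched with an illustrative XPath rather than one taken from the post): a String selector behaves just like Nokogiri's own css/xpath and takes an optional block, while a Hash of name => path pairs yields one Hash per matched record:

crawler.fetch "http://fuliang.iteye.com/" do |page|
  # String form: returns the raw Nokogiri NodeSet
  page.css("div.blog_title > h3 > a").each { |a| puts a.text }

  # String form with a block: the NodeSet is handed to the block instead
  page.css("div.blog_title > h3 > a") { |links| puts links.size }

  # Hash form: one Hash per record, keyed by the names you chose
  page.xpath(:title => '//div[@class="blog_title"]//h3/a').each do |r|
    puts r[:title].text
  end
end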
The Crawler class itself is thin: it keeps a small pool of proxies and hands every fetched document to Nokogiri, wrapped in a Page:

class Crawler
  def initialize
    # six crawl proxies, l-crwl1:1080 through l-crwl6:1080
    @proxies = 1.upto(6).collect { |index| "http://l-crwl#{index}:1080" }
  end

  def fetch(url)
    yield Page.new(Nokogiri::HTML(open(url, fetch_options)))
  end

  private

  def rand_proxy
    # pick one of the six proxies at random
    @proxies[(rand * 6).to_i]
  end

  def fetch_options
    user_agent = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.2) Gecko/20061201 Firefox/2.0.0.2 (Ubuntu-feisty)"
    {
      "User-Agent" => user_agent,  # String keys become HTTP request headers
      :proxy       => rand_proxy   # open-uri expects the Symbol key :proxy
    }
  end
end
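fetch_options is just the options hash for open-uri's open: String keys go out as HTTP request headers, while Symbol keys such as :proxy are open-uri options (which is why the proxy key above is the symbol :proxy rather than the string "proxy"). A standalone call with the same options, as a sketch using one host from the pool above:

require 'open-uri'

# "User-Agent" (String key) is sent as a request header;
# :proxy (Symbol key) tells open-uri to route through that proxy.
html = open("http://fuliang.iteye.com/",
            "User-Agent" => "Mozilla/5.0 (X11; U; Linux i686; en-US)",
            :proxy       => "http://l-crwl1:1080").read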
Page wraps the parsed document, and a little metaprogramming defines css and xpath in one stroke. Each method accepts either a String selector or a Hash of name => path pairs, with an optional block in both cases:

class Page
  def initialize(html)
    @html = html
  end

  class_eval do
    [:css, :xpath].each do |extract_by|
      define_method extract_by do |arg, &block|
        if arg.is_a? String
          # plain selector: delegate straight to Nokogiri
          if block.nil?
            @html.send(extract_by, arg)
          else
            block.call(@html.send(extract_by, arg))
          end
        elsif arg.is_a? Hash
          # named selectors: run each one, then zip the parallel result
          # lists together into one Hash per matched record
          extract_raw = arg.collect { |key, value| [key, @html.send(extract_by, value)] }
          data = extract_raw.collect do |key, vals|
            ([key] * vals.size).zip(vals)
          end
          result = data[0].zip(*data[1..-1]).collect { |e| Hash[*e.flatten] }
          if block.nil?
            result
          else
            block.call(result)
          end
        else
          raise ArgumentError.new('Argument must be a String or a Hash')
        end
      end
    end
  end
end
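The Hash branch is the dense part, so here is the same zip-and-flatten trick run on plain strings (my illustration, not from the post):

extract_raw = [[:title, ["t1", "t2"]], [:content, ["c1", "c2"]]]
data = extract_raw.collect do |key, vals|
  ([key] * vals.size).zip(vals)  # => [[:title, "t1"], [:title, "t2"]], ...
end
result = data[0].zip(*data[1..-1]).collect { |e| Hash[*e.flatten] }
# => [{:title => "t1", :content => "c1"}, {:title => "t2", :content => "c2"}]

Note that the pairing assumes every selector matches the same number of nodes on a page; if one result list is shorter, zip pads it with nil and the records go out of alignment.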
Putting the pieces together, the complete program:

#!/usr/bin/env ruby

require 'rubygems'
require 'nokogiri'
require 'open-uri'

class Crawler
  def initialize
    @proxies = 1.upto(6).collect { |index| "http://l-crwl#{index}:1080" }
  end

  def fetch(url)
    yield Page.new(Nokogiri::HTML(open(url, fetch_options)))
  end

  private

  def rand_proxy
    @proxies[(rand * 6).to_i]
  end

  def fetch_options
    user_agent = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.2) Gecko/20061201 Firefox/2.0.0.2 (Ubuntu-feisty)"
    {
      "User-Agent" => user_agent,
      :proxy       => rand_proxy
    }
  end
end

class Page
  def initialize(html)
    @html = html
  end

  class_eval do
    [:css, :xpath].each do |extract_by|
      define_method extract_by do |arg, &block|
        if arg.is_a? String
          if block.nil?
            @html.send(extract_by, arg)
          else
            block.call(@html.send(extract_by, arg))
          end
        elsif arg.is_a? Hash
          extract_raw = arg.collect { |key, value| [key, @html.send(extract_by, value)] }
          data = extract_raw.collect do |key, vals|
            ([key] * vals.size).zip(vals)
          end
          result = data[0].zip(*data[1..-1]).collect { |e| Hash[*e.flatten] }
          if block.nil?
            result
          else
            block.call(result)
          end
        else
          raise ArgumentError.new('Argument must be a String or a Hash')
        end
      end
    end
  end
end

crawler = Crawler.new
1.upto(10) do |pn|
  urls = []
  crawler.fetch "http://fuliang.iteye.com/?page=#{pn}" do |page|
    page.css("div.blog_title > h3 > a").each do |node|
      urls << "http://fuliang.iteye.com#{node.attributes['href']}"
    end
  end
  urls.each do |url|
    crawler.fetch url do |page|
      page.xpath(:title   => '//*[@id="main"]/div/div[2]/h3/a',
                 :content => '//*[@id="blog_content"]').each do |entry|
        printf("%s\t%s\n", entry[:title].text.gsub(/\s+/, ""),
                           entry[:content].text.gsub(/\s+/, ""))
      end
    end
  end
  break # stop after the first listing page; remove to crawl all ten
end
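One thing the DSL leaves out is error handling: a dead proxy or a 404 aborts the whole run. If that bites, a small wrapper along these lines could be added (my sketch, assuming the classes above are loaded; fetch_with_retry is a hypothetical name, not part of the original):

class Crawler
  # Hypothetical addition: retry a flaky fetch a few times before giving up.
  def fetch_with_retry(url, attempts = 3, &block)
    fetch(url, &block)
  rescue OpenURI::HTTPError, SocketError, Timeout::Error => e
    attempts -= 1
    retry if attempts > 0
    warn "giving up on #{url}: #{e.message}"
  end
end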