Set the proxy and timeout interval
I'm using the read method so the page is downloaded only once: the same string can then be re-parsed with different encodings without sending the request over and over.
require 'open-uri'

# Download the page once and keep the raw body as a String, so it can be
# re-parsed later without another request. The request goes through PROXY and
# uses REQUEST_TIMEOUT as the read timeout.
#
# The URL is opened through URI#open (OpenURI::OpenRead) rather than
# Kernel#open, because Kernel#open no longer handles URLs on Ruby 3.0+. The
# block form closes the response stream as soon as the body has been read.
#
# Best effort: any StandardError raised while fetching leaves html_data as an
# empty string, which is what the former `rescue ''` modifier did.
html_data =
  begin
    URI.parse(url).open(:proxy => PROXY, :read_timeout => REQUEST_TIMEOUT) { |page| page.read }
  rescue StandardError
    ''
  end
Set the encoding
# `require` was misspelled as `requrie`, which raised NoMethodError before
# Nokogiri was ever loaded.
require 'nokogiri'

# Encoding used to decode the downloaded page; the trailing comment keeps the
# UTF-8 alternative at hand.
ENCODING = 'WINDOWS-1251' # 'UTF-8'

# Parse the downloaded markup with that encoding (the second argument, the
# document URL, is left unset).
html = Nokogiri::HTML(html_data, nil, ENCODING)
Try a different encoding
# Runs the XPath query against a parsed document; any StandardError raised by
# the lookup is treated the same as "nothing found" and yields an empty list.
lookup = lambda do |doc|
  begin
    doc.xpath(SOME_XPATH)
  rescue StandardError
    []
  end
end

match = lookup.call(html)

# An empty result triggers a second attempt: the same markup is re-parsed as
# UTF-8 and queried again.
if match.empty?
  html = Nokogiri::HTML(html_data, nil, 'UTF-8')
  # using .css is more awesome
  match = lookup.call(html)
end
Get HTML from RSS
# Download the feed through PROXY and parse it as XML. The feed is opened via
# URI#open (OpenURI::OpenRead) rather than Kernel#open, because Kernel#open no
# longer handles URLs on Ruby 3.0+. The block form closes the stream once the
# document has been parsed.
rss = URI.parse(RSS_URL).open(:proxy => PROXY) { |feed| Nokogiri::XML(feed) }

# The text of each item/link node is used as the URL of a page to download.
nodes = rss.search('item/link')
nodes.each do |node|
  # Strip surrounding whitespace from the link text; a URL padded with spaces
  # or newlines cannot be parsed.
  page_url = node.inner_text.strip

  # NOTE(review): html_data is overwritten on every pass, so each page has to
  # be processed here, inside the loop.
  html_data = URI.parse(page_url).open(:proxy => PROXY) { |page| page.read }
end
Rescue timeout errors
# Parse the downloaded markup as WINDOWS-1251, falling back to an empty HTML
# document when a Timeout::Error is raised.
#
# The fallback is now stored in news_html as well. Previously the rescue branch
# assigned a different variable (html), so news_html was left unset after a
# timeout. The unused exception binding was dropped.
#
# NOTE(review): presumably the download that produces html_data is meant to sit
# inside this begin block too, since that is the step that can time out - confirm.
news_html =
  begin
    Nokogiri::HTML(html_data, nil, 'WINDOWS-1251')
  rescue Timeout::Error
    Nokogiri::HTML::Document.new('')
  end
No comments:
Post a Comment