# Written by Christian Cantrell
# http://www.livingdigitally.net
# http://weblogs.macromedia.com/cantrell
# Guesses the natural language a web site is written in.
#
# The guess is based on explicit declarations only: a content-language meta
# tag or xml:lang attribute in the page itself, or failing that, the
# xml:lang / dc:language / language declaration of a feed the page links to.
# All failures are reported as RuntimeError with a human-readable message.
class SiteLanguage
  require 'net/http'
  require 'uri'

  # Language-code prefix => language name. Prefixes are compared against the
  # trimmed, lower-cased clue. No prefix is a prefix of another one, so the
  # lookup order does not matter.
  LANGUAGE_PREFIXES = {
    'en'  => 'English',
    'fr'  => 'French',
    'de'  => 'German',
    'it'  => 'Italian',
    'nl'  => 'Dutch',
    'el'  => 'Greek',
    'es'  => 'Spanish',
    'pt'  => 'Portuguese',
    'ar'  => 'Arabic',
    'he'  => 'Hebrew',
    'ru'  => 'Russian',
    'zh'  => 'Chinese',
    'cn'  => 'Chinese',
    'ja'  => 'Japanese',
    'ko'  => 'Korean',
    'pl'  => 'Polish',
    'pol' => 'Polish',
    'tr'  => 'Turkish',
    'fa'  => 'Persian'
  }.freeze

  # HTTP status codes that are followed as redirects.
  REDIRECT_CODES = %w[301 302 303 307 308].freeze

  # Upper bound on chained redirects, so a redirect loop cannot recurse forever.
  MAX_REDIRECTS = 5

  # Returns the name of the language (e.g. 'English') the site at +site_url+
  # appears to be written in.
  #
  # site_url - absolute http(s) URL of the site, as a String.
  #
  # Raises RuntimeError when no URL is given, when a page or feed cannot be
  # fetched, when the site offers no usable clue, or when the declared
  # language code is not one of the known ones.
  def guess_site_language(site_url)
    raise 'Please specify a URL.' unless site_url

    site_body = get_site_source(site_url)

    # Check to see if the site itself gives away the answer either through
    # meta data, or the xml namespace.
    declared = declared_language(site_body)
    return guess_language(declared) if declared

    # Check to make sure this is a blog or news site.
    urls = feed_urls(site_body, site_url)
    raise 'This site doesn\'t give any good clues.' if urls.empty?

    # Grab the site's feeds and look for a clue as to what language it's written in.
    urls.each do |url|
      clue = feed_language(get_site_source(url))
      return guess_language(clue) if clue
    end
    raise 'Can\'t figure this one out.'
  end

  private

  # Returns the language code declared in the page itself (content-language
  # meta tag first, then the first xml:lang attribute), or nil if there is none.
  def declared_language(site_body)
    site_body[/<meta[^>]+content-language[^>]+content="([^"]+)"/i, 1] ||
      site_body[/xml:lang="([^"]+)"/i, 1]
  end

  # Returns the language code declared by a feed document, or nil if the feed
  # does not declare one in a recognised form.
  def feed_language(feed_source)
    feed_source[/xml:lang="([^"]+)"/, 1] ||
      feed_source[%r{<dc:language>(.+?)</dc:language>}, 1] ||
      feed_source[%r{<language>(.+?)</language>}, 1]
  end

  # Returns the absolute URLs of the feeds referenced by the page.
  #
  # <link> tags marked "alternate" are preferred. If none carry an href, any
  # href that mentions rss or atom is used instead. This is at best a guess,
  # but a reasonable one. Hrefs that cannot be resolved to an http(s) URL are
  # skipped, and duplicates are dropped.
  def feed_urls(site_body, site_url)
    alternates = site_body.scan(/<link[^>]*>/i).select { |tag| tag.include?('"alternate"') }
    hrefs = alternates.map { |tag| tag[/href="([^"]*)"/i, 1] }.compact
    hrefs = site_body.scan(/href="([^"]*(?:rss|atom)[^"]*)"/i).flatten if hrefs.empty?

    # Relative hrefs are resolved against the directory of the site URL.
    base = parse_http_uri(site_url)
    base.path = base_directory(base.path)
    base.query = nil
    base.fragment = nil

    hrefs.map { |href| resolve(base, normalize_feed_scheme(href)) }.compact.uniq
  end

  # Rewrites the feed: pseudo-scheme to something fetchable: "feed:http://..."
  # loses the prefix, "feed://..." becomes "http://...".
  def normalize_feed_scheme(href)
    href.strip.sub(/\Afeed:(?=https?:)/i, '').sub(/\Afeed:/i, 'http:')
  end

  # Returns the directory part of a URL path, always ending in '/'.
  # A last segment containing a dot is taken to be a file name and dropped;
  # anything else (e.g. "/blog") is taken to be a directory.
  def base_directory(path)
    dir = path.to_s
    dir = dir.sub(%r{/[^/]+\z}, '/') if dir =~ %r{/[^/]+\.[^/]+\z}
    dir.end_with?('/') ? dir : "#{dir}/"
  end

  # Resolves +reference+ against the URI +base+. Returns the absolute URL as
  # a String, or nil when the reference is malformed or is not http(s).
  def resolve(base, reference)
    target = base.merge(reference)
    target.is_a?(URI::HTTP) ? target.to_s : nil
  rescue URI::Error
    nil
  end

  # Parses +url+ and returns it as a URI::HTTP (URI::HTTPS is a subclass).
  # Raises RuntimeError for anything that is not an http(s) URL with a host.
  def parse_http_uri(url)
    message = "Invalid URL '#{url}'. Only http and https URLs are supported."
    begin
      uri = URI.parse(url.to_s.strip)
    rescue URI::InvalidURIError
      raise message
    end
    raise message unless uri.is_a?(URI::HTTP) && uri.host && !uri.host.empty?
    uri
  end

  # Get a site's source.
  #
  # url            - absolute http(s) URL to fetch.
  # redirect_limit - how many more redirects may be followed.
  #
  # Returns the response body of the final 200 response.
  # Raises RuntimeError on network failure, on a non-200 response, on a
  # redirect without a usable Location header, or on too many redirects.
  def get_site_source(url, redirect_limit = MAX_REDIRECTS)
    uri = parse_http_uri(url)
    resp = fetch_response(uri)

    if REDIRECT_CODES.include?(resp.code)
      loc = resp['location']
      if loc.nil? || loc.strip.empty?
        raise 'Invalid headers. A redirect was sent, but no new location was provided'
      end
      raise 'Too many redirects.' if redirect_limit <= 0

      # Location may be relative, so resolve it against the current URL.
      target = resolve(uri, loc.strip)
      raise 'Invalid headers. The redirect location could not be understood.' unless target
      return get_site_source(target, redirect_limit - 1)
    elsif resp.code != '200'
      raise "Invalid request. Response code was #{resp.code}."
    end

    resp.body
  end

  # Performs a single GET request and returns the raw response object.
  # Only transport-level failures are translated here, so that the more
  # specific errors raised by get_site_source are not masked.
  def fetch_response(uri)
    http = Net::HTTP.new(uri.host, uri.port)
    http.use_ssl = (uri.scheme == 'https')
    http.read_timeout = 10
    # request_uri is the path plus query, and is '/' when the path is empty.
    http.get(uri.request_uri)
  rescue Timeout::Error
    raise 'Request timed out. Try another URL.'
  rescue Errno::ECONNREFUSED
    raise 'Connection refused for some reason. Try another URL.'
  rescue StandardError
    raise 'An unknown network error occurred.'
  end

  # Maps a language code such as "en-US" or "pt_BR" to a language name.
  # The clue is trimmed and lower-cased on a copy, so the caller's string is
  # never modified. Raises RuntimeError for an unknown (or empty) code.
  def guess_language(clue)
    code = clue.to_s.strip.downcase
    LANGUAGE_PREFIXES.each do |prefix, language|
      return language if code.start_with?(prefix)
    end
    raise "I don't recognize the language '#{code}.'"
  end
end