# Written by Christian Cantrell
# http://www.livingdigitally.net
# http://weblogs.macromedia.com/cantrell

require 'net/http'
require 'uri'

# Guesses the human language a web site is written in.
#
# The page itself is inspected first (content-language meta tag, then an
# xml:lang attribute). If neither is present, the feeds the page links to are
# downloaded and searched for xml:lang, <dc:language> or <language>.
#
# All failures are reported as RuntimeError with a human-readable message.
class SiteLanguage
  # Maximum number of HTTP redirects followed for a single request.
  MAX_REDIRECTS = 5

  # Response codes treated as "follow the Location header".
  REDIRECT_CODES = %w[301 302 303 307 308].freeze

  # Language-code prefix => display name. A clue matches when it starts with
  # the prefix, so "en-US" and "english" both map to English.
  LANGUAGE_PREFIXES = [
    %w[en English], %w[fr French], %w[de German], %w[it Italian],
    %w[nl Dutch], %w[el Greek], %w[es Spanish], %w[pt Portuguese],
    %w[ar Arabic], %w[he Hebrew], %w[ru Russian], %w[zh Chinese],
    %w[cn Chinese], %w[ja Japanese], %w[ko Korean], %w[pl Polish],
    %w[pol Polish], %w[tr Turkish], %w[fa Persian]
  ].freeze

  # Clues looked for in the page itself, in priority order.
  SITE_LANGUAGE_PATTERNS = [
    /<meta[^>]+content-language[^>]+content=["']([^"']+)["']/i,
    /xml:lang=["']([^"']+)["']/i
  ].freeze

  # Clues looked for in a feed document, in priority order.
  FEED_LANGUAGE_PATTERNS = [
    /xml:lang=["']([^"']+)["']/,
    %r{<dc:language>([^<]+)</dc:language>},
    %r{<language>([^<]+)</language>}
  ].freeze

  # Returns the name of the language the site at +site_url+ is written in.
  #
  # site_url - absolute URL of the page to inspect.
  #
  # Raises RuntimeError when no URL is given, when a page or feed cannot be
  # downloaded, when no language hint is found, or when the hint is not a
  # recognised language code.
  def guess_site_language(site_url)
    raise 'Please specify a URL.' if site_url.nil? || site_url.to_s.strip.empty?

    site_body = get_site_source(site_url).to_s

    # Check to see if the site itself gives away the answer either through
    # meta data, or the xml namespace.
    hint = first_capture(site_body, SITE_LANGUAGE_PATTERNS)
    return guess_language(hint) if hint

    # Check to make sure this is a blog or news site.
    urls = feed_urls(site_body, site_url)
    raise 'This site doesn\'t give any good clues.' if urls.empty?

    # Grab each feed and look for a clue as to what language it's written in.
    urls.each do |url|
      hint = first_capture(get_site_source(url).to_s, FEED_LANGUAGE_PATTERNS)
      return guess_language(hint) if hint
    end

    raise 'Can\'t figure this one out.'
  end

  private

  # Returns the first capture group of the first pattern that matches +text+,
  # or nil when none of the patterns match.
  def first_capture(text, patterns)
    patterns.each do |pattern|
      hint = text[pattern, 1]
      return hint if hint
    end
    nil
  end

  # Collects absolute feed URLs referenced by the page.
  #
  # <link> tags marked "alternate" are preferred. If there are none, any href
  # containing "rss" or "atom" is used instead; that is a guess, but a
  # reasonable one.
  def feed_urls(site_body, site_url)
    hrefs = site_body.scan(/<link[^>]*>/i)
                     .select { |tag| tag =~ /["']alternate["']/i }
                     .map { |tag| tag[/href=["']([^"']*)["']/i, 1] }
                     .compact

    if hrefs.empty?
      hrefs = site_body.scan(/href="([^"]*(?:rss|atom)[^"]*)"/i).flatten
    end

    site_uri = URI.parse(site_url)
    hrefs.map { |href| resolve_feed_url(site_uri, href) }.uniq
  end

  # Turns a feed reference found in the page into an absolute URL.
  def resolve_feed_url(site_uri, feed)
    # "feed:" is a pseudo-scheme; it either prefixes a real URL or stands in
    # for "http:".
    feed = feed.strip
               .sub(%r{\Afeed:(?=https?://)}i, '')
               .sub(/\Afeed:/i, 'http:')
    return feed if feed =~ %r{\Ahttps?://}i

    # Protocol-relative reference: only the scheme is missing.
    return "#{site_uri.scheme || 'http'}:#{feed}" if feed.start_with?('//')

    root = site_root(site_uri)
    return "#{root}#{feed}" if feed.start_with?('/')

    # Relative reference. A last path segment containing a dot is taken to be
    # a file name and dropped; otherwise the whole path is treated as a
    # directory. The path is copied so the parsed URI is never modified.
    dir = site_uri.path.to_s
    dir = dir.sub(%r{/[^/]+\z}, '/') if dir =~ %r{/[^/]+\..+\z}
    dir += '/' unless dir.end_with?('/')

    # Collapse "segment/.." pairs within the path only, never into the host.
    root + "#{dir}#{feed}".gsub(%r{/[^/]+/\.\.}, '')
  end

  # Returns "scheme://host[:port]" for the site, omitting the default port.
  def site_root(site_uri)
    root = "#{site_uri.scheme || 'http'}://#{site_uri.host}"
    port = site_uri.port
    root += ":#{port}" if port && port != site_uri.default_port
    root
  end

  # Get a site's source.
  #
  # url            - absolute URL to download.
  # redirects_left - how many more redirects may be followed.
  #
  # Returns the response body of the first 200 response.
  # Raises RuntimeError on network failure, on a non-200 response, on a
  # redirect without a Location header, or after too many redirects.
  def get_site_source(url, redirects_left = MAX_REDIRECTS)
    resp = fetch_response(url)

    # Status handling lives outside the network rescue so these specific
    # messages reach the caller.
    unless REDIRECT_CODES.include?(resp.code)
      raise "Invalid request. Response code was #{resp.code}." unless resp.code == '200'

      return resp.body
    end

    loc = resp['location']
    if loc.nil? || loc.empty?
      raise 'Invalid headers. A redirect was sent, but no new location was provided'
    end
    raise 'Too many redirects. Try another URL.' if redirects_left <= 0

    get_site_source(absolute_location(url, loc), redirects_left - 1)
  end

  # Performs a single GET request and returns the raw response object.
  # Network-level failures are translated into RuntimeError messages.
  def fetch_response(url)
    uri = URI.parse(url)
    http = Net::HTTP.new(uri.host, uri.port || 80)
    http.use_ssl = true if uri.scheme == 'https'
    http.read_timeout = 10

    path = uri.path.to_s
    path = '/' if path.empty?
    http.get(uri.query ? "#{path}?#{uri.query}" : path)
  rescue Timeout::Error
    raise 'Request timed out. Try another URL.'
  rescue Errno::ECONNREFUSED
    raise 'Connection refused for some reason. Try another URL.'
  rescue StandardError
    raise 'An unknown network error occurred.'
  end

  # Resolves a possibly relative Location header against the requested URL.
  # Falls back to the header value unchanged if it cannot be resolved.
  def absolute_location(base_url, location)
    URI.join(base_url, location).to_s
  rescue URI::Error
    location
  end

  # Maps a language clue such as "en-US" to a language name.
  #
  # The clue is stripped and lower-cased on a copy; the argument itself is
  # left untouched.
  #
  # Raises RuntimeError when the clue starts with no known prefix.
  def guess_language(clue)
    code = clue.to_s.strip.downcase
    _prefix, language = LANGUAGE_PREFIXES.find { |prefix, _name| code.start_with?(prefix) }
    raise "I don't recognize the language '#{code}.'" unless language

    language
  end
end