Skip to content

Commit

Permalink
Fetching
Browse files Browse the repository at this point in the history
  • Loading branch information
samvasko committed Jan 24, 2014
0 parents commit ba8f929
Show file tree
Hide file tree
Showing 4 changed files with 86 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
cache
8 changes: 8 additions & 0 deletions app.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
require 'pp'

# Resolve lib/ relative to this script instead of the current working
# directory, so the app can be started from any directory.
$LOAD_PATH.unshift(File.expand_path('../lib', __FILE__))

require 'fetch'
require 'convert'

# Write whatever Fetch.run returns to out.html in the current working
# directory (File.write converts the value with #to_s).
File.write('out.html', Fetch.run)
1 change: 1 addition & 0 deletions lib/convert.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
require 'eeepub'
76 changes: 76 additions & 0 deletions lib/fetch.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
require 'nokogiri'
require 'sanitize'
require 'open-uri'
require 'digest/md5'

# Downloads the Vala tutorial page from the GNOME wiki and rewrites its
# markup in place: wiki-only elements are dropped, highlighted code becomes
# plain <pre>/<code>, site-relative links become absolute, and empty
# paragraphs are removed.
#
# All methods are class methods; the `content` arguments are expected to
# respond to `css(selector)` the way a Nokogiri node set does.
class Fetch
  # Page that is fetched and cleaned up.
  TUTORIAL_URL = 'https://wiki.gnome.org/Projects/Vala/Tutorial'.freeze
  # Prefix prepended to site-relative links.
  WIKI_ROOT = 'https://wiki.gnome.org'.freeze
  # Directory (relative to the working directory) holding cached downloads.
  CACHE_DIR = 'cache'.freeze
  # Selectors whose matches are deleted from the content.
  REMOVED_SELECTORS = ['span.anchor', '.table-of-contents', 'div.comment'].freeze

  # Fetches the tutorial and applies every clean-up step.
  #
  # @return the cleaned-up `#content` node set
  def self.run
    content = get_content

    remove_elements(content)
    transform_code(content)
    absolute_links(content)
    cleanup_paragraphs(content)

    content
  end

  # Downloads (or reads from cache) the tutorial page and parses it.
  #
  # @return the nodes matching `#content` in the parsed page
  def self.get_content
    Nokogiri::HTML(download(TUTORIAL_URL)).css('#content')
  end

  # Selects the nested chapter lists of the wiki's table of contents.
  # Must be called before remove_elements, which deletes the table.
  #
  # @return the nodes matching `.table-of-contents > ol > li > ol`
  def self.parse_toc(content)
    content.css('.table-of-contents > ol > li > ol')
  end

  # Rewrites site-relative links ("/Foo") to absolute wiki URLs.
  #
  # Anchors without an href, fragment/relative links, already-absolute
  # links and protocol-relative links ("//host/...") are left untouched.
  def self.absolute_links(content)
    content.css('a').each do |a|
      href = a.attr('href')
      next unless href && href.start_with?('/') && !href.start_with?('//')

      a.set_attribute('href', WIKI_ROOT + href)
    end
  end

  # Strips the class attribute from every paragraph and removes paragraphs
  # whose text is empty or whitespace-only.
  def self.cleanup_paragraphs(content)
    content.css('p').each do |para|
      para.remove_attribute('class')
      para.remove if para.content.strip.empty?
    end
  end

  # Removes individual or groups of elements
  def self.remove_elements(content)
    REMOVED_SELECTORS.each { |selector| content.css(selector).remove }
  end

  # Replaces syntax-highlighted blocks with plain <pre> elements and inline
  # `tt.backtick` spans with <code> elements, keeping only their text.
  def self.transform_code(content)
    replace_with_text_node(content, '.highlight', 'pre')
    replace_with_text_node(content, 'tt.backtick', 'code')
  end

  # Simple caching: returns the body of `url`, reading it from
  # CACHE_DIR/<md5 of url> when present and downloading + storing it
  # otherwise.
  #
  # The URL is fetched through open-uri's URI#read rather than Kernel#open,
  # so a string can never be interpreted as a local path or a "|command".
  #
  # @param url [String] URL to fetch (a scheme open-uri can read)
  # @return [String] the response body
  def self.download(url)
    Dir.mkdir(CACHE_DIR) unless Dir.exist?(CACHE_DIR)
    cache_path = File.join(CACHE_DIR, Digest::MD5.hexdigest(url))
    return File.read(cache_path) if File.exist?(cache_path)

    data = URI.parse(url).read
    File.write(cache_path, data)
    data
  end

  # Swaps every element matching `selector` for a fresh `tag` element that
  # carries only the original's text. The new node is created in the
  # matched element's own document, so no global document is needed.
  def self.replace_with_text_node(content, selector, tag)
    content.css(selector).each do |el|
      replacement = Nokogiri::XML::Node.new(tag, el.document)
      replacement.content = el.text
      el.replace(replacement)
    end
  end
  private_class_method :replace_with_text_node
end

0 comments on commit ba8f929

Please sign in to comment.