Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updated to make this work in 2016, added run_all script #4

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
data
*.sqlite3
*.sqlite3
movies.tsv
Empty file modified 00-download.sh
100644 → 100755
Empty file.
45 changes: 27 additions & 18 deletions 02-import.rb
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
require 'rubygems'
require 'sqlite3'
$db = SQLite3::Database.new( "movies.sqlite3" )
$title = "[a-z,&-;0-9$#+=\/!?. ]+"
$title = "[a-z,&-;0-9$#+=\/!?. ]+|\"[a-z,&-;0-9$#+=\/!?. ]+\""

# Strips at most one double-quote character from each end of +str+.
#
# The title pattern in this script also matches titles wrapped in double
# quotes; this helper returns the bare title. A quote is removed from
# either end independently, so a string with only a leading or only a
# trailing quote is trimmed too. Quotes inside the string are kept.
#
# @param str [String] the possibly quoted text
# @return [String] a new string without the outer quote characters
def remove_quotations(str)
  # chomp always hands back a fresh string, with one trailing quote dropped.
  trimmed = str.chomp('"')
  # Drop a single leading quote, if there is one.
  trimmed.start_with?('"') ? trimmed[1..-1] : trimmed
end

def import_movies
#$100,000 Pyramid, The (2001) (VG) 2001
Expand All @@ -14,7 +18,7 @@ def import_movies

File.new("data/movies.list").each_line do |l|
print "." if (i = i + 1) % 5000 == 0; STDOUT.flush
if match = title_re.match(l)
if match = title_re.match(l.chars.select(&:valid_encoding?).join)
stmt.execute!(match[1], match[2].to_i)
end
end
Expand All @@ -32,9 +36,11 @@ def import_times
$db.transaction do
File.new("data/running-times.list").each_line do |l|
print "." if (i = i + 1) % 5000 == 0; STDOUT.flush
l = l.chars.select(&:valid_encoding?).join
if match = time_re.match(l)
stmt.execute!(match[3].to_i, match[1], match[2].to_i)
stmt.execute!(match[3].to_i,
remove_quotations(match[1]),
match[2].to_i)
end
end
end
Expand All @@ -45,14 +51,17 @@ def import_times

def import_budgets
dashes = "-------------------------------------------------------------------------------"
title_re = /MV:\s+(#{$title}?) \s \(([0-9]+)\)/ix
title_re = /MV:\s+(#{$title}?)\s+\(([0-9]+)\)/ix
budget_re = /BT:\s+USD\s+([0-9,.]+)/ix

stmt = $db.prepare("UPDATE Movies set budget=? WHERE title=? AND year=?;")
$db.transaction do
File.new("data/business.list").each(dashes) do |l|
l=l.chars.select(&:valid_encoding?).join
if match = title_re.match(l.to_s) and bt = budget_re.match(l.to_s)
stmt.execute!(bt[1].gsub!(",","").to_i, match[1], match[2].to_i)
stmt.execute!(bt[1].gsub!(",","").to_i,
remove_quotations(match[1]),
match[2].to_i)
end
end
end
Expand All @@ -66,8 +75,11 @@ def import_mpaa_ratings
stmt = $db.prepare("UPDATE Movies set mpaa_rating=? WHERE title=? AND year=?;")
$db.transaction do
File.new("data/mpaa-ratings-reasons.list").each(dashes) do |l|
l = l.chars.select(&:valid_encoding?).join
if match = title_re.match(l.to_s) and rt = rating_re.match(l.to_s)
stmt.execute!(rt[1], match[1], match[2].to_i)
stmt.execute!(rt[1],
remove_quotations(match[1]),
match[2].to_i)
end
end
end
Expand All @@ -85,8 +97,11 @@ def import_genres

File.new("data/genres.list").each_line do |l|
print "." if (i = i + 1) % 1000 == 0; STDOUT.flush
l = l.chars.select(&:valid_encoding?).join
if match = genre_re.match(l)
stmt.execute!(match[3], match[1], match[2].to_i)
stmt.execute!(match[3],
remove_quotations(match[1]),
match[2].to_i)
end
end
puts
Expand All @@ -102,17 +117,18 @@ def import_ratings
$db.transaction

File.new("data/ratings.list").each_line do |l|
l = l.chars.select(&:valid_encoding?).join
if match = ratings_re.match(l)
rating, votes, outof10, title, year = match[1], match[2], match[3], match[4], match[5]
rating, votes, outof10, title, year = match[1], match[2], match[3], remove_quotations(match[4]), match[5]
stmt.execute!(votes, outof10, rating, title, year)
end
end
$db.commit

end

# puts "Importing movies"
# import_movies
puts "Importing movies"
import_movies
puts "Importing times"
import_times
puts "Importing budgets"
Expand All @@ -123,10 +139,3 @@ def import_ratings
import_ratings
puts "Importing genres"
import_genres


#puts Movie.count( "budget > 0")
#puts Movie.count( "length > 0")
#puts Movie.count( "budget > 0 and length > 0")
#puts Movie.count( "imdb_votes > 0 and length > 0")
#puts Movie.count( "budget > 0 and length > 0 and imdb_votes > 0")
14 changes: 7 additions & 7 deletions 03-export.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,16 @@


def genres_binary(id, db)
genres = db.execute("SELECT genre FROM Genres where movie_id = #{id};").flatten.to_set
$genres_of_interest.map { |genre| (genres.include? genre) ? 1 : 0}
genres = db.execute("SELECT genre FROM Genres where movie_id = #{id};").map{|g| g['genre']}.to_s
$genres_of_interest.map { |genre| (genres.include? genre) ? 1 : 0}
end

def ratings_breakdown(ratings)
ratings[0..ratings.length].to_s.split(//).map{|s| $ratings_map[s]}
ratings[0..ratings.length].to_s.split(//).map{|s| $ratings_map[s]} rescue nil
end

db = SQLite3::Database.new( "movies.sqlite3" )
db.results_as_hash= true
sql = "
SELECT Movies.*
FROM Movies
Expand All @@ -25,13 +26,12 @@ def ratings_breakdown(ratings)

i = 0

File.open("movies.tab", "w") do |out|
File.open("movies.tsv", "w") do |out|
out << [
'title', 'year', 'length', 'budget',
'rating', 'votes', (1..10).map{|i| "r" + i.to_s},
'mpaa', $genres_of_interest
].flatten.join("\t") + "\n"

db.execute(sql) do |row|
puts i if (i = i + 1) % 5000 == 0

Expand All @@ -42,6 +42,6 @@ def ratings_breakdown(ratings)
row["budget"],
row["imdb_rating"], row["imdb_votes"], ratings_breakdown(row["imdb_rating_votes"]),
row["mpaa_rating"], genres_binary(row['id'], db)
].flatten.join("\t") + "\n" rescue nil
].flatten.join("\t") + "\n"
end
end
end
5 changes: 5 additions & 0 deletions run_all.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
./00-download.sh
gunzip ./data/*
sqlite3 movies.sqlite3 ".read 01-movies.sql"
ruby 02-import.rb
ruby 03-export.rb