From 058f0f2d8675b526726f7bf7f38ce592ce843345 Mon Sep 17 00:00:00 2001 From: Andrey Kurenkov Date: Sun, 10 Jul 2016 20:07:45 -0700 Subject: [PATCH 1/3] Updated to make this work in 2016, added run_all script --- 00-download.sh | 0 02-import.rb | 45 +++++++++++++++++++++++++++------------------ 03-export.rb | 10 +++++----- run_all.sh | 5 +++++ 4 files changed, 37 insertions(+), 23 deletions(-) mode change 100644 => 100755 00-download.sh create mode 100755 run_all.sh diff --git a/00-download.sh b/00-download.sh old mode 100644 new mode 100755 diff --git a/02-import.rb b/02-import.rb index a32d907..a53e5c0 100644 --- a/02-import.rb +++ b/02-import.rb @@ -1,7 +1,11 @@ require 'rubygems' require 'sqlite3' $db = SQLite3::Database.new( "movies.sqlite3" ) -$title = "[a-z,&-;0-9$#+=\/!?. ]+" +$title = "[a-z,&-;0-9$#+=\/!?. ]+|\"[a-z,&-;0-9$#+=\/!?. ]+\"" + +def remove_quotations(str) + str.chomp('"').reverse.chomp('"').reverse +end def import_movies #$100,000 Pyramid, The (2001) (VG) 2001 @@ -14,7 +18,7 @@ def import_movies File.new("data/movies.list").each_line do |l| print "." if (i = i + 1) % 5000 == 0; STDOUT.flush - if match = title_re.match(l) + if match = title_re.match(l.chars.select(&:valid_encoding?).join) stmt.execute!(match[1], match[2].to_i) end end @@ -32,9 +36,11 @@ def import_times $db.transaction do File.new("data/running-times.list").each_line do |l| print "." if (i = i + 1) % 5000 == 0; STDOUT.flush - + l = l.chars.select(&:valid_encoding?).join if match = time_re.match(l) - stmt.execute!(match[3].to_i, match[1], match[2].to_i) + stmt.execute!(match[3].to_i, + remove_quotations(match[1]), + match[2].to_i) end end end @@ -45,14 +51,17 @@ def import_times def import_budgets dashes = "-------------------------------------------------------------------------------" - title_re = /MV:\s+(#{$title}?) \s \(([0-9]+)\)/ix + title_re = /MV:\s+(#{$title}?)\s+\(([0-9]+)\)/ix budget_re = /BT:\s+USD\s+([0-9,.]+)/ix stmt = $db.prepare("UPDATE Movies set budget=? WHERE title=? AND year=?;") $db.transaction do File.new("data/business.list").each(dashes) do |l| + l=l.chars.select(&:valid_encoding?).join if match = title_re.match(l.to_s) and bt = budget_re.match(l.to_s) - stmt.execute!(bt[1].gsub!(",","").to_i, match[1], match[2].to_i) + stmt.execute!(bt[1].gsub!(",","").to_i, + remove_quotations(match[1]), + match[2].to_i) end end end @@ -66,8 +75,11 @@ def import_mpaa_ratings stmt = $db.prepare("UPDATE Movies set mpaa_rating=? WHERE title=? AND year=?;") $db.transaction do File.new("data/mpaa-ratings-reasons.list").each(dashes) do |l| + l = l.chars.select(&:valid_encoding?).join if match = title_re.match(l.to_s) and rt = rating_re.match(l.to_s) - stmt.execute!(rt[1], match[1], match[2].to_i) + stmt.execute!(rt[1], + remove_quotations(match[1]), + match[2].to_i) end end end @@ -85,8 +97,11 @@ def import_genres File.new("data/genres.list").each_line do |l| print "." if (i = i + 1) % 1000 == 0; STDOUT.flush + l = l.chars.select(&:valid_encoding?).join if match = genre_re.match(l) - stmt.execute!(match[3], match[1], match[2].to_i) + stmt.execute!(match[3], + remove_quotations(match[1]), + match[2].to_i) end end puts @@ -102,8 +117,9 @@ def import_ratings $db.transaction File.new("data/ratings.list").each_line do |l| + l = l.chars.select(&:valid_encoding?).join if match = ratings_re.match(l) - rating, votes, outof10, title, year = match[1], match[2], match[3], match[4], match[5] + rating, votes, outof10, title, year = match[1], match[2], match[3], remove_quotations(match[4]), match[5] stmt.execute!(votes, outof10, rating, title, year) end end @@ -111,8 +127,8 @@ def import_ratings end -# puts "Importing movies" -# import_movies +puts "Importing movies" +import_movies puts "Importing times" import_times puts "Importing budgets" @@ -123,10 +139,3 @@ def import_ratings import_ratings puts "Importing genres" import_genres - - -#puts Movie.count( "budget > 0") -#puts Movie.count( "length > 0") -#puts Movie.count( "budget > 0 and length > 0") -#puts Movie.count( "imdb_votes > 0 and length > 0") -#puts Movie.count( "budget > 0 and length > 0 and imdb_votes > 0") \ No newline at end of file diff --git a/03-export.rb b/03-export.rb index 642d938..e35be53 100644 --- a/03-export.rb +++ b/03-export.rb @@ -13,10 +13,11 @@ def genres_binary(id, db) end def ratings_breakdown(ratings) - ratings[0..ratings.length].to_s.split(//).map{|s| $ratings_map[s]} + ratings[0..ratings.length].to_s.split(//).map{|s| $ratings_map[s]} rescue nil end db = SQLite3::Database.new( "movies.sqlite3" ) +db.results_as_hash= true sql = " SELECT Movies.* FROM Movies @@ -25,13 +26,12 @@ def ratings_breakdown(ratings) i = 0 -File.open("movies.tab", "w") do |out| +File.open("movies.csv", "w") do |out| out << [ 'title', 'year', 'length', 'budget', 'rating', 'votes', (1..10).map{|i| "r" + i.to_s}, 'mpaa', $genres_of_interest ].flatten.join("\t") + "\n" - db.execute(sql) do |row| puts i if (i = i + 1) % 5000 == 0 @@ -42,6 +42,6 @@ def ratings_breakdown(ratings) row["budget"], row["imdb_rating"], row["imdb_votes"], ratings_breakdown(row["imdb_rating_votes"]), row["mpaa_rating"], genres_binary(row['id'], db) - ].flatten.join("\t") + "\n" rescue nil + ].flatten.join("\t") + "\n" end -end \ No newline at end of file +end diff --git a/run_all.sh b/run_all.sh new file mode 100755 index 0000000..2997fb4 --- /dev/null +++ b/run_all.sh @@ -0,0 +1,5 @@ +./00-download.sh +gunzip ./data/* +sqlite3 movies.sqlite3 ".read 01-movies.sql" +ruby 02-import.rb +ruby 03-export.rb From dcc01d2196d2e29a90149f0ea56f3e22ee952c1e Mon Sep 17 00:00:00 2001 From: Andrey Kurenkov Date: Mon, 11 Jul 2016 01:20:29 -0700 Subject: [PATCH 2/3] Makes genres work with update --- 03-export.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/03-export.rb b/03-export.rb index e35be53..91605f1 100644 --- a/03-export.rb +++ b/03-export.rb @@ -8,8 +8,8 @@ def genres_binary(id, db) - genres = db.execute("SELECT genre FROM Genres where movie_id = #{id};").flatten.to_set - $genres_of_interest.map { |genre| (genres.include? genre) ? 1 : 0} + genres = db.execute("SELECT genre FROM Genres where movie_id = #{id};").map{|g| g['genre']}.to_s + $genres_of_interest.map { |genre| (genres.include? genre) ? 1 : 0} end def ratings_breakdown(ratings) From 2362ace8b731aa543d963999ebf16b3841d9e33b Mon Sep 17 00:00:00 2001 From: Andrey Kurenkov Date: Wed, 13 Jul 2016 11:32:05 -0700 Subject: [PATCH 3/3] Rename to .tsv --- .gitignore | 3 ++- 03-export.rb | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 4ff90cc..659b1a7 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ data -*.sqlite3 \ No newline at end of file +*.sqlite3 +movies.tsv diff --git a/03-export.rb b/03-export.rb index 91605f1..787a394 100644 --- a/03-export.rb +++ b/03-export.rb @@ -26,7 +26,7 @@ def ratings_breakdown(ratings) i = 0 -File.open("movies.csv", "w") do |out| +File.open("movies.tsv", "w") do |out| out << [ 'title', 'year', 'length', 'budget', 'rating', 'votes', (1..10).map{|i| "r" + i.to_s},