forked from hadley/data-movies
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path02-import.rb
132 lines (107 loc) · 3.59 KB
/
02-import.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
require 'rubygems'
require 'sqlite3'
$db = SQLite3::Database.new( "movies.sqlite3" )
$title = "[a-z,&-;0-9$#+=\/!?. ]+"
def import_movies
#$100,000 Pyramid, The (2001) (VG) 2001
title_re = /^(#{$title}) \s+ \([0-9]+\) \s+ ([0-9]+)$/ix
i = 0
stmt = $db.prepare("INSERT INTO Movies (title, year) VALUES (?, ?);")
$db.transaction do
$db.execute "DELETE FROM Movies;"
File.new("data/movies.list").each_line do |l|
print "." if (i = i + 1) % 5000 == 0; STDOUT.flush
if match = title_re.match(l)
stmt.execute!(match[1], match[2].to_i)
end
end
end
puts
end
def import_times
#"Ballyskillen Opera House, The" (1980) 30 (6 episodes)
time_re = /^(#{$title}) \s+ \(([0-9]+)\) \s+ (?:[a-z]+:)?([0-9]+)/ix
i = 0
stmt = $db.prepare("UPDATE Movies set length=? WHERE title=? AND year=?;")
$db.transaction do
File.new("data/running-times.list").each_line do |l|
print "." if (i = i + 1) % 5000 == 0; STDOUT.flush
if match = time_re.match(l)
stmt.execute!(match[3].to_i, match[1], match[2].to_i)
end
end
end
puts
end
def import_budgets
dashes = "-------------------------------------------------------------------------------"
title_re = /MV:\s+(#{$title}?) \s \(([0-9]+)\)/ix
budget_re = /BT:\s+USD\s+([0-9,.]+)/ix
stmt = $db.prepare("UPDATE Movies set budget=? WHERE title=? AND year=?;")
$db.transaction do
File.new("data/business.list").each(dashes) do |l|
if match = title_re.match(l.to_s) and bt = budget_re.match(l.to_s)
stmt.execute!(bt[1].gsub!(",","").to_i, match[1], match[2].to_i)
end
end
end
end
def import_mpaa_ratings
dashes = "-------------------------------------------------------------------------------"
title_re = /MV:\s+(#{$title}?) \s \(([0-9]+)\)/ix
rating_re = /RE: Rated (.*?) /i
stmt = $db.prepare("UPDATE Movies set mpaa_rating=? WHERE title=? AND year=?;")
$db.transaction do
File.new("data/mpaa-ratings-reasons.list").each(dashes) do |l|
if match = title_re.match(l.to_s) and rt = rating_re.match(l.to_s)
stmt.execute!(rt[1], match[1], match[2].to_i)
end
end
end
end
def import_genres
#D2: The Mighty Ducks (1994) Family
genre_re = /^(#{$title}?) \s+ \(([0-9]+)\) (?:\s*[({].*[})])* \s+(.*?)$/ix
i = 0
stmt = $db.prepare("INSERT INTO Genres (genre, movie_id) VALUES (?, (SELECT id FROM Movies WHERE title=? AND year=?));")
$db.transaction do
$db.execute "DELETE FROM Genres;"
File.new("data/genres.list").each_line do |l|
print "." if (i = i + 1) % 1000 == 0; STDOUT.flush
if match = genre_re.match(l)
stmt.execute!(match[3], match[1], match[2].to_i)
end
end
puts
end
end
def import_ratings
#.0.1112000 14 5.9 365 Nights in Hollywood (1934)
ratings_re = /([0-9.\*]+) \s+ ([0-9]+) \s+ ([0-9.]+) \s+ (#{$title}?) \s+ \(([0-9]+)\)/ix
stmt = $db.prepare("UPDATE Movies set imdb_votes=?, imdb_rating=?, imdb_rating_votes=? WHERE title=? AND year=?;")
$db.transaction
File.new("data/ratings.list").each_line do |l|
if match = ratings_re.match(l)
rating, votes, outof10, title, year = match[1], match[2], match[3], match[4], match[5]
stmt.execute!(votes, outof10, rating, title, year)
end
end
$db.commit
end
# puts "Importing movies"
# import_movies
puts "Importing times"
import_times
puts "Importing budgets"
import_budgets
puts "Importing ratings"
import_mpaa_ratings
puts "Importing votes"
import_ratings
puts "Importing genres"
import_genres
#puts Movie.count( "budget > 0")
#puts Movie.count( "length > 0")
#puts Movie.count( "budget > 0 and length > 0")
#puts Movie.count( "imdb_votes > 0 and length > 0")
#puts Movie.count( "budget > 0 and length > 0 and imdb_votes > 0")