Skip to content

Commit

Permalink
Better levenshtein distances
Browse files Browse the repository at this point in the history
  • Loading branch information
delucas committed Oct 25, 2019
1 parent 57ae3db commit bce4708
Show file tree
Hide file tree
Showing 6 changed files with 45 additions and 34 deletions.
3 changes: 2 additions & 1 deletion Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -59,4 +59,5 @@ gem "aws-sdk"
gem "bcrypt"

# Levenshtein Distance Analyzer
gem 'edits'
gem 'edits'
gem "table_print"
2 changes: 2 additions & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -886,6 +886,7 @@ GEM
activesupport (>= 4.0)
sprockets (>= 3.0.0)
sqlite3 (1.3.13)
table_print (1.5.6)
thor (0.20.3)
thread_safe (0.3.6)
tilt (2.0.9)
Expand Down Expand Up @@ -944,6 +945,7 @@ DEPENDENCIES
spring (~> 2.0)
spring-watcher-listen (~> 2.0)
sqlite3 (~> 1.3.6)
table_print
turbolinks (~> 5)
tzinfo-data
uglifier (>= 1.3.0)
Expand Down
11 changes: 1 addition & 10 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -45,22 +45,13 @@ production_deploy:
heroku restart --app confy-wecodeio
heroku maintenance:off --app confy-wecodeio

capture_production_db:
# Ask Heroku to capture a new Postgres backup for the confy-wecodeio app, then
# run the download_production_db target through a recursive $(MAKE).
use_production_db:
heroku pg:backups capture --app confy-wecodeio
$(MAKE) download_production_db

# Fetch the URL printed by `heroku pg:backups public-url` for the
# confy-wecodeio app with curl and save the response as tmp/latest.dump.
download_production_db:
curl -o tmp/latest.dump `heroku pg:backups public-url --app confy-wecodeio`

# Rebuild the local development database from tmp/latest.dump: drop and
# recreate the database through rake, copy the dump into the confy_db_1
# container, restore it into confy_development with pg_restore, then run the
# migrations.
restore_production_db:
	docker-compose run web bundle exec rake db:drop db:create DISABLE_DATABASE_ENVIRONMENT_CHECK=1
	docker cp tmp/latest.dump confy_db_1:/latest.dump
# The leading `-` tells make to ignore pg_restore's exit status and carry on.
# The previous `!` prefix inverted the status instead, so a restore that
# finished without errors aborted the target before db:migrate could run.
	-docker exec confy_db_1 pg_restore --verbose --clean --no-acl --no-owner -h localhost -d confy_development -U confy /latest.dump
	docker-compose run web bundle exec rake db:migrate

# Convenience target: run capture_production_db and then restore_production_db,
# each through a recursive $(MAKE) invocation, in that order.
get_current_production_db:
$(MAKE) capture_production_db
$(MAKE) restore_production_db

# Run utilities/data_analysis/distance.rb with `rails runner` inside the
# docker-compose `web` service.
analyze_levenshtein:
docker-compose run web rails runner utilities/data_analysis/distance.rb
32 changes: 17 additions & 15 deletions app/controllers/admin/speakers_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,25 +10,27 @@ def levenshtein
field = params[:field] || "name"

by_size = items.group_by { |s| s.name.length }
@pairs = []
pairs = []
(0..(by_size.keys.size - 1)).each do |k|
interesting_values = []
((-[k, THRESHOLD - 1].min)..0).each do |p|
interesting_values << by_size[by_size.keys.sort[k + p]]
end
(0..interesting_values.length - 1).each do |o|
(0..(interesting_values[o].length - 1)).each do |i|
d = 0
((i+1)..(interesting_values[interesting_values.length - 1].length - 1)).each do |j|
p1 = interesting_values[o][i]
p2 = interesting_values[interesting_values.length - 1][j]
d = Edits::Levenshtein.distance_with_max(p1[field], p2[field], THRESHOLD + 1)
@pairs << OpenStruct.new(item1: p1, item2: p2, distance: d) if d < THRESHOLD
interesting_values = []
((-[k, THRESHOLD - 1].min)..0).each do |p|
interesting_values << by_size[by_size.keys.sort[k + p]]
end
(0..interesting_values.length - 1).each do |o|
(0..(interesting_values[o].length - 1)).each do |i|
d = 0
((i+1)..(interesting_values[interesting_values.length - 1].length - 1)).each do |j|
p1 = interesting_values[o][i]
p2 = interesting_values[interesting_values.length - 1][j]
d = Edits::Levenshtein.distance_with_max(p1.levenshtein_name, p2.levenshtein_name, THRESHOLD + 1)
pairs << OpenStruct.new(item1: p1, item2: p2, distance: d) if d < THRESHOLD
end
end
end
end
end
@pairs.sort_by!(&:distance)
pairs.sort_by!(&:distance)

@pairs = pairs
end

def merge
Expand Down
5 changes: 5 additions & 0 deletions app/models/speaker.rb
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,9 @@ def merge_with(disposable)
talks << disposable.talks
disposable.delete
end

# Normalized form of +name+ used when comparing speakers by Levenshtein
# distance.
#
# Punctuation and digits are dropped, the text is lower-cased, and the
# remaining words are sorted alphabetically, so "Smith, John" and
# "John Smith" both normalize to "john smith".
#
# Returns a String of lower-case words joined by single spaces.
def levenshtein_name
  # Remove digits and every character that is neither a word character nor
  # whitespace. Whitespace must survive this step: stripping it (as a bare \W
  # does) fuses the whole name into a single token and turns the split/sort
  # below into a no-op.
  # NOTE(review): \w is ASCII-only in Ruby regexps, so accented letters are
  # removed here too - confirm whether that is intended for speaker names.
  cleaned = name.gsub(/[^\w\s]|\d/, "")
  # Downcase before sorting so word order does not depend on capitalization.
  cleaned.downcase.split.sort.join(" ")
end

end
26 changes: 18 additions & 8 deletions utilities/data_analysis/distance.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,32 @@
clazz = Speaker
field = "name"

THRESHOLD = 5

items = clazz.all
total = items.size
steps = total / 20.0

by_size = items.group_by { |s| s.name.length }
pairs = []
(0..(total - 1)).each do |i|
((i+1)..(total - 1)).each do |j|
p1 = items[i]
p2 = items[j]
d = Edits::Levenshtein.distance(p1[field], p2[field])
pairs << OpenStruct.new(item1: p1, item2: p2, distance: d) if d < 3
(0..(by_size.keys.size - 1)).each do |k|
interesting_values = []
((-[k, THRESHOLD - 1].min)..0).each do |p|
interesting_values << by_size[by_size.keys.sort[k + p]]
end
(0..interesting_values.length - 1).each do |o|
(0..(interesting_values[o].length - 1)).each do |i|
d = 0
((i+1)..(interesting_values[interesting_values.length - 1].length - 1)).each do |j|
p1 = interesting_values[o][i]
p2 = interesting_values[interesting_values.length - 1][j]
d = Edits::Levenshtein.distance_with_max(p1.levenshtein_name, p2.levenshtein_name, THRESHOLD + 1)
pairs << OpenStruct.new(item1: p1, item2: p2, distance: d) if d < THRESHOLD
end
end
end
# printf("\rProgress: [%-20s]", "=" * (i/steps))
# puts
end
pairs.sort_by!(&:distance)


puts "Informe para #{clazz} por #{field}"
Expand Down

0 comments on commit bce4708

Please sign in to comment.