From bce4708f5941406665168224ebdf91e265b5b2f8 Mon Sep 17 00:00:00 2001 From: Lucas Videla Date: Fri, 25 Oct 2019 16:54:36 -0300 Subject: [PATCH] Better levenshtein distances --- Gemfile | 3 +- Gemfile.lock | 2 ++ Makefile | 11 +------ app/controllers/admin/speakers_controller.rb | 32 +++++++++++--------- app/models/speaker.rb | 5 +++ utilities/data_analysis/distance.rb | 26 +++++++++++----- 6 files changed, 45 insertions(+), 34 deletions(-) diff --git a/Gemfile b/Gemfile index 9c96332..21de501 100644 --- a/Gemfile +++ b/Gemfile @@ -59,4 +59,5 @@ gem "aws-sdk" gem "bcrypt" # Levenshtein Distance Analyzer -gem 'edits' \ No newline at end of file +gem 'edits' +gem "table_print" \ No newline at end of file diff --git a/Gemfile.lock b/Gemfile.lock index 43574b9..41a344c 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -886,6 +886,7 @@ GEM activesupport (>= 4.0) sprockets (>= 3.0.0) sqlite3 (1.3.13) + table_print (1.5.6) thor (0.20.3) thread_safe (0.3.6) tilt (2.0.9) @@ -944,6 +945,7 @@ DEPENDENCIES spring (~> 2.0) spring-watcher-listen (~> 2.0) sqlite3 (~> 1.3.6) + table_print turbolinks (~> 5) tzinfo-data uglifier (>= 1.3.0) diff --git a/Makefile b/Makefile index 5c401f8..36418e0 100644 --- a/Makefile +++ b/Makefile @@ -45,22 +45,13 @@ production_deploy: heroku restart --app confy-wecodeio heroku maintenance:off --app confy-wecodeio -capture_production_db: +use_production_db: heroku pg:backups capture --app confy-wecodeio - $(MAKE) download_production_db - -download_production_db: curl -o tmp/latest.dump `heroku pg:backups public-url --app confy-wecodeio` - -restore_production_db: docker-compose run web bundle exec rake db:drop db:create DISABLE_DATABASE_ENVIRONMENT_CHECK=1 docker cp tmp/latest.dump confy_db_1:/latest.dump ! docker exec confy_db_1 pg_restore --verbose --clean --no-acl --no-owner -h localhost -d confy_development -U confy /latest.dump docker-compose run web bundle exec rake db:migrate -get_current_production_db: - $(MAKE) capture_production_db - $(MAKE) restore_production_db - analyze_levenshtein: docker-compose run web rails runner utilities/data_analysis/distance.rb \ No newline at end of file diff --git a/app/controllers/admin/speakers_controller.rb b/app/controllers/admin/speakers_controller.rb index e557236..fd4fda5 100644 --- a/app/controllers/admin/speakers_controller.rb +++ b/app/controllers/admin/speakers_controller.rb @@ -10,25 +10,27 @@ def levenshtein field = params[:field] || "name" by_size = items.group_by { |s| s.name.length } - @pairs = [] + pairs = [] (0..(by_size.keys.size - 1)).each do |k| - interesting_values = [] - ((-[k, THRESHOLD - 1].min)..0).each do |p| - interesting_values << by_size[by_size.keys.sort[k + p]] - end - (0..interesting_values.length - 1).each do |o| - (0..(interesting_values[o].length - 1)).each do |i| - d = 0 - ((i+1)..(interesting_values[interesting_values.length - 1].length - 1)).each do |j| - p1 = interesting_values[o][i] - p2 = interesting_values[interesting_values.length - 1][j] - d = Edits::Levenshtein.distance_with_max(p1[field], p2[field], THRESHOLD + 1) - @pairs << OpenStruct.new(item1: p1, item2: p2, distance: d) if d < THRESHOLD + interesting_values = [] + ((-[k, THRESHOLD - 1].min)..0).each do |p| + interesting_values << by_size[by_size.keys.sort[k + p]] + end + (0..interesting_values.length - 1).each do |o| + (0..(interesting_values[o].length - 1)).each do |i| + d = 0 + ((i+1)..(interesting_values[interesting_values.length - 1].length - 1)).each do |j| + p1 = interesting_values[o][i] + p2 = interesting_values[interesting_values.length - 1][j] + d = Edits::Levenshtein.distance_with_max(p1.levenshtein_name, p2.levenshtein_name, THRESHOLD + 1) + pairs << OpenStruct.new(item1: p1, item2: p2, distance: d) if d < THRESHOLD + end end end - end end - @pairs.sort_by!(&:distance) + pairs.sort_by!(&:distance) + + @pairs = pairs end def merge diff --git a/app/models/speaker.rb b/app/models/speaker.rb index db2a472..7cf8ef7 100644 --- a/app/models/speaker.rb +++ b/app/models/speaker.rb @@ -14,4 +14,9 @@ def merge_with(disposable) talks << disposable.talks disposable.delete end + + def levenshtein_name + name.gsub(/(\W|\d)/, "").split.sort.join(" ").downcase + end + end diff --git a/utilities/data_analysis/distance.rb b/utilities/data_analysis/distance.rb index 2c0d86a..fce594e 100644 --- a/utilities/data_analysis/distance.rb +++ b/utilities/data_analysis/distance.rb @@ -3,22 +3,32 @@ clazz = Speaker field = "name" +THRESHOLD = 5 items = clazz.all total = items.size steps = total / 20.0 +by_size = items.group_by { |s| s.name.length } pairs = [] -(0..(total - 1)).each do |i| - ((i+1)..(total - 1)).each do |j| - p1 = items[i] - p2 = items[j] - d = Edits::Levenshtein.distance(p1[field], p2[field]) - pairs << OpenStruct.new(item1: p1, item2: p2, distance: d) if d < 3 +(0..(by_size.keys.size - 1)).each do |k| + interesting_values = [] + ((-[k, THRESHOLD - 1].min)..0).each do |p| + interesting_values << by_size[by_size.keys.sort[k + p]] + end + (0..interesting_values.length - 1).each do |o| + (0..(interesting_values[o].length - 1)).each do |i| + d = 0 + ((i+1)..(interesting_values[interesting_values.length - 1].length - 1)).each do |j| + p1 = interesting_values[o][i] + p2 = interesting_values[interesting_values.length - 1][j] + d = Edits::Levenshtein.distance_with_max(p1.levenshtein_name, p2.levenshtein_name, THRESHOLD + 1) + pairs << OpenStruct.new(item1: p1, item2: p2, distance: d) if d < THRESHOLD + end + end end - # printf("\rProgress: [%-20s]", "=" * (i/steps)) - # puts end +pairs.sort_by!(&:distance) puts "Informe para #{clazz} por #{field}"