diff --git a/.circleci/config.yml b/.circleci/config.yml index e60cb218..5b613c9b 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -44,7 +44,7 @@ jobs: test-odm: <<: *shared docker: - - image: circleci/ruby:2.6.3 + - image: circleci/ruby:2.6.5 environment: BUNDLE_JOBS: 3 BUNDLE_RETRY: 3 diff --git a/.ruby-version b/.ruby-version index bd4053bf..57cf282e 100644 --- a/.ruby-version +++ b/.ruby-version @@ -1 +1 @@ -2.6.3 \ No newline at end of file +2.6.5 diff --git a/Gemfile b/Gemfile index 5b6b452f..1ad32fde 100644 --- a/Gemfile +++ b/Gemfile @@ -1,5 +1,5 @@ source 'https://rubygems.org' -ruby '2.6.3' +ruby '2.6.5' # Distribute your app as a gem # gemspec diff --git a/Gemfile.lock b/Gemfile.lock index fbb299f2..8bdcb25f 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -28,7 +28,7 @@ GEM cf-app-utils (0.6) coderay (1.1.2) concurrent-ruby (1.1.5) - crass (1.0.4) + crass (1.0.5) diff-lcs (1.3) docile (1.3.2) dotenv (2.7.5) @@ -57,7 +57,7 @@ GEM liquid (3.0.3) liquify (0.2.7) liquid (>= 2.2.2) - loofah (2.2.3) + loofah (2.3.1) crass (~> 1.0.2) nokogiri (>= 1.5.9) mail (2.7.1) @@ -213,7 +213,7 @@ DEPENDENCIES typhoeus RUBY VERSION - ruby 2.6.3p62 + ruby 2.6.5p114 BUNDLED WITH 1.17.2 diff --git a/app/controllers.rb b/app/controllers.rb index a3d13010..f503fa34 100644 --- a/app/controllers.rb +++ b/app/controllers.rb @@ -99,7 +99,7 @@ def set_content_type(options) # see comment in method body def get_search_args_from_params(params) options = {} - %w(metrics sort fields zip distance page per_page debug keys_nested all_programs).each do |opt| + %w(metrics sort fields zip distance page per_page debug keys_nested all_programs all_programs_nested).each do |opt| options[opt.to_sym] = params.delete("_#{opt}") # TODO: remove next line to end support for un-prefixed option parameters options[opt.to_sym] ||= params.delete(opt) @@ -113,8 +113,9 @@ def get_search_args_from_params(params) options[:fields] = check_fields_for_wildcards(options[:fields]) - options[:keys_nested] = check_for_valid_key_format_input(options[:keys_nested]) - options[:all_programs] = check_for_valid_key_format_input(options[:all_programs]) + options[:keys_nested] = check_for_valid_key_format_input(options[:keys_nested]) + options[:all_programs] = check_for_valid_key_format_input(options[:all_programs]) + options[:all_programs_nested] = check_for_valid_key_format_input(options[:all_programs_nested]) options[:metrics] = options[:metrics].split(/\s*,\s*/) if options[:metrics] options diff --git a/lib/data_magic.rb b/lib/data_magic.rb index b0db7479..50444ff7 100644 --- a/lib/data_magic.rb +++ b/lib/data_magic.rb @@ -97,7 +97,7 @@ def self.search(terms, options = {}) time_start = Time.now.to_f result = client.search full_query - + search_time = Time.now.to_f - time_start logger.info "ES query time (ms): #{result["took"]} ; Query fetch time (s): #{search_time} ; result: #{result.inspect[0..500]}" @@ -149,11 +149,12 @@ def self.process_result_from_es( hits, result_processing_info, query_body, optio # Collect list of nested fields that need to be filtered # This is neccessary because the standard ES fields filter creates arrays from nested data, which we don't want nested_fields_filter = result_processing_info[:nested_fields_filter] ? result_processing_info[:nested_fields_filter] : [] - - if query_body.dig(:_source).class == Hash + all_programs_nested = options[:all_programs_nested] + + if query_body.dig(:_source) == { exclude: ["_*"] } # we're getting the whole document and we can find in _source results = hits["hits"].map {|hit| hit["_source"]} - + # Tested - implementation of nested vs dotted option - when line below is exposed, # and &keys_nested=true is in query, I get Error: JSON::NestingError - nesting of 100 is too deep # results = options[:keys_nested] ? NestedHash.new(results) : results @@ -165,19 +166,12 @@ def self.process_result_from_es( hits, result_processing_info, query_body, optio from_source = hit.fetch("_source", {}) dotted_from_source = NestedHash.new.withdotkeys(from_source) found = found.merge(dotted_from_source) - - # When an inner query is submitted, the nested data_type fields are under inner_hits - inner = hit.fetch("inner_hits", {}) - delete_set = Set[] + delete_set = Set[] delete_set.each { |k| found.delete k } - - # each result looks like this: - # {"city"=>["Springfield"], "address"=>["742 Evergreen Terrace"], "children" => [{...}, {...}, {...}] } - found.keys.each { |key| found[key] = found[key].length > 1 ? found[key] : found[key][0] } - # now it should look like this: - # {"city"=>"Springfield", "address"=>"742 Evergreen Terrace, "children" => [{...}, {...}, {...}]} - + + found = transform_array_values(found) + # re-insert null fields that didn't get returned by ES if query_body[:fields] query_body[:fields].each do |field| @@ -187,39 +181,11 @@ def self.process_result_from_es( hits, result_processing_info, query_body, optio end end - # Collect inner hits - nested_details_hash = {} - if !inner.empty? - inner.keys.each do |inn_key| - inner_details = inner[inn_key]["hits"]["hits"].map do |nested_obj| - details = nested_obj.fetch("_source", {}) - n_hash = NestedHash.new - - details.keys.each do |key| - n_hash[key] = details[key] - end - # Convert to dotted keys - n_hash = n_hash.withdotkeys - - # If there is a fields filter for nested datatypes, apply it here - if !nested_fields_filter.empty? - keys_to_keep = nested_fields_filter.select { |f| f.start_with? inn_key }.map do |n| - n.gsub(inn_key + ".","") - end - n_hash_filtered = n_hash.select { |k| keys_to_keep.include?(k) } - end - - !n_hash_filtered.nil? ? n_hash_filtered : n_hash - end + # When an inner query is submitted, the nested data_type fields are under inner_hits + inner = hit.fetch("inner_hits", {}) - # Set the nested data type string as the key and the array of inner hits as the value - nested_details_hash[inn_key] = inner_details - end - end - - # If nested hits, combine with other fields in found hash - if !nested_details_hash.empty? - found = found.merge(nested_details_hash) + if !all_programs_nested && !inner.empty? + found = collect_inner_hits(inner, found, nested_fields_filter) end # If keys_nested option passed in params, then return result keys in nested format @@ -233,6 +199,78 @@ def self.process_result_from_es( hits, result_processing_info, query_body, optio results end + def self.field_type_nested?(field_name) + nested_datatypes = DataMagic.config.es_data_types["nested"] + + if nested_datatypes + nested_datatypes.any? {|nested| field_name.start_with? nested } + end + end + + def self.transform_array_values(found) + # each result looks like this: + # { + # "city"=>["Springfield"], + # "address"=>["742 Evergreen Terrace"], + # "children" => [{...}, {...}, {...}] + # } + found.keys.each do |key| + nested_data_type = field_type_nested?(key) + + # Keep nested datatypes in an array, even when there is just one program + if !nested_data_type && found[key].length <= 1 + found[key] = found[key][0] + else + found[key] = found[key] + end + end + # Now, it looks like the following.... + # { + # "city"=>"Springfield", + # "address"=>"742 Evergreen Terrace", + # "children" => [{...}, {...}, {...}] + # } + + found + end + + def self.collect_inner_hits(inner, found, nested_fields_filter) + nested_details_hash = {} + + inner.keys.each do |inn_key| + inner_details = inner[inn_key]["hits"]["hits"].map do |nested_obj| + details = nested_obj.fetch("_source", {}) + n_hash = NestedHash.new + + details.keys.each do |key| + n_hash[key] = details[key] + end + # Convert to dotted keys + n_hash = n_hash.withdotkeys + + # If there is a fields filter for nested datatypes, apply it here + if !nested_fields_filter.empty? + keys_to_keep = nested_fields_filter.select { |f| f.start_with? inn_key }.map do |n| + n.gsub(inn_key + ".","") + end + n_hash_filtered = n_hash.select { |k| keys_to_keep.include?(k) } + end + + !n_hash_filtered.nil? ? n_hash_filtered : n_hash + end + + # Set the nested data type string as the key and the array of inner hits as the value + nested_details_hash[inn_key] = inner_details + end + + # If nested hits, combine with other fields in found hash + if !nested_details_hash.empty? + found = found.merge(nested_details_hash) + end + + found + end + def self.document_data_type(hash, root='') hash.each do |key, value| if value.is_a?(Hash) && value[:type].nil? # things are nested under this diff --git a/lib/data_magic/query_builder.rb b/lib/data_magic/query_builder.rb index bd3972d4..e3a5b18f 100644 --- a/lib/data_magic/query_builder.rb +++ b/lib/data_magic/query_builder.rb @@ -1,8 +1,15 @@ module DataMagic module QueryBuilder class << self + @@dictionary ||= {} + + def set_dictionary(config) + @@dictionary = config.dictionary + end + # Creates query from parameters passed into endpoint and returns a Hash def from_params(params, options, config) + set_dictionary(config) per_page = (options[:per_page] || config.page_size || DataMagic::DEFAULT_PAGE_SIZE).to_i page = options[:page].to_i || 0 per_page = DataMagic::MAX_PAGE_SIZE if per_page > DataMagic::MAX_PAGE_SIZE @@ -24,7 +31,11 @@ def from_params(params, options, config) nested_query_pairs = term_pairs[:nested_query_pairs] query_pairs = term_pairs[:query_pairs] - all_programs = options[:all_programs] + all_programs_nested = options[:all_programs_nested] + if !all_programs_nested && options[:all_programs] + all_programs = options[:all_programs] + end + # Use stretchy to build query if all_programs # Treat all query fields as standard data types, rather than nested datatypes @@ -38,7 +49,7 @@ def from_params(params, options, config) nested_query = false if !all_programs && !nested_query_pairs.empty? nested_query = true - + if query_pairs.empty? build_query_from_nested_datatypes(nested_query_pairs, query_hash) else @@ -61,7 +72,7 @@ def from_params(params, options, config) query_hash.merge! add_aggregations(params, options, config) end - query_hash = set_query_source(query_hash, nested_query, nested_fields, query_fields) + query_hash = set_query_source(query_hash, nested_query, nested_fields, query_fields, all_programs_nested) query_hash[:sort] = get_sort_order(options[:sort], config) if options[:sort] && !options[:sort].empty? @@ -114,9 +125,23 @@ def nested_fields(submitted_fields) def determine_query_term_datatypes(params) nested_terms = params.keys.select { |key| field_type_nested?(key) } - nested_query_pairs = {} - nested_terms.each { |key| nested_query_pairs[key] = params[key] } + + nested_terms.each do |key| + split_key_terms = key.split(".") + nested, *standard_fields = split_key_terms + dotted_field = standard_fields.join(".") + + field_type = @@dictionary[dotted_field]["type"] + value = params[key] + + if field_type == "integer" && value.is_a?(String) && /,/.match(value) # list of integers + value = value.split(',').map do |str| + str.tr("[]","").to_i + end + end + nested_query_pairs[key] = value + end if !nested_terms.empty? nested_terms.each do |key| @@ -169,40 +194,99 @@ def sort_nested_query_paths_and_terms(nested_query_pairs) if nested_data_types.any? {|nested| key.start_with? nested } path = nested_data_types.select {|nested| key.start_with? nested }.join("") end - range_query = key.include?("__range") + or_query = value.is_a? Array + + use_filter_key = false if range_query query_term = get_nested_range_query(key, value) + elsif or_query + query_term = { terms: { key => value }} + use_filter_key = true else query_term = { match: { key => value }} end paths_and_terms.push({ path: path, - term: query_term + term: query_term, + use_filter_key: use_filter_key }) end - paths_and_terms + + build_filter_query = paths_and_terms.any? do |item| + item[:use_filter_key] + end + + paths_and_terms_cleaned_up = paths_and_terms.map do |p_and_t| + { + path: p_and_t[:path], + term: p_and_t[:term] + } + end + + query_info = { + paths_and_terms: paths_and_terms_cleaned_up, + build_filter_query: build_filter_query + } + + query_info end def build_nested_query(nested_query_pairs) - paths_and_terms = sort_nested_query_paths_and_terms(nested_query_pairs) + query_info = sort_nested_query_paths_and_terms(nested_query_pairs) + paths_and_terms = query_info[:paths_and_terms] + build_filter_query = query_info[:build_filter_query] paths = Set[] paths_and_terms.each { |hash| paths.add(hash[:path]) } + term_keys = Set[] + paths_and_terms.each { |hash| term_keys.add(hash[:term].keys.first) } + if paths.length == 1 - path = paths.to_a[0] - terms = paths_and_terms.map { |item| item[:term] } - nested_query = get_inner_nested_query(path, terms) + path = paths.to_a[0] + terms = paths_and_terms.map { |item| item[:term] } + + if term_keys.length > 1 + nested_query = get_nested_query_bool_filter_query(path, terms) + elsif term_keys.length == 1 && build_filter_query + nested_query = get_inner_nested_filter_query(path, terms) + else + nested_query = get_inner_nested_query(path, terms) + end end nested_query end + def get_nested_query_bool_filter_query(path, terms) + { + nested: { + path: path, + query: { + bool: { + filter: terms + } + }, + inner_hits: {} + } + } + end + def get_outer_nested_query(inner_queries) { must: inner_queries } end + def get_inner_nested_filter_query(path, terms) + { + nested: { + path: path, + filter: terms, + inner_hits: {} + } + } + end + def get_inner_nested_query(path, matches) { nested: { @@ -389,21 +473,23 @@ def search_location(squery, options) squery end - def set_query_source(query_hash, nested_query, nested_fields, query_fields) + def set_query_source(query_hash, nested_query, nested_fields, query_fields, all_programs_nested) # The distinction between nested datatype query vs non-nested datatype query refers # to the datatype of the field that must be matched. # The distinction between nested_fields vs query_fields refers to the fields returned in the response. The # response fields come from different sources depending on the query. - # if there is a nested_query OR if there are non-nested query_fields AND no nested fields - if nested_query || (!query_fields.empty? && nested_fields.empty?) + # if there is a nested_query && the all_programs_nested is not true + # OR if there are non-nested query_fields AND no nested fields + if nested_query && !all_programs_nested || (!query_fields.empty? && nested_fields.empty?) query_hash[:_source] = false # if this is NOT a nested_query AND there are nested fields, then filter source on those fields - elsif !nested_query && !nested_fields.empty? + # OR if the query includes a nested query AND the all_programs_nested option is passed + elsif !nested_query && !nested_fields.empty? || (nested_query && all_programs_nested) query_hash[:_source] = nested_fields - + # if neither fields, nor a source filter, then exclude fields from source beginning with underscores else query_hash[:_source] = { exclude: ["_*"] } diff --git a/spec/fixtures/nested_data_type/data.yml b/spec/fixtures/nested_data_type/data.yml index 6cec5360..29dbb320 100644 --- a/spec/fixtures/nested_data_type/data.yml +++ b/spec/fixtures/nested_data_type/data.yml @@ -20,6 +20,25 @@ dictionary: source: CITY type: autocomplete description: City + programs.cip_4_digit.code: + source: CIPCODE + type: integer + map: program + description: Classification of Instructional Programs (CIP) code for the field of study + programs.cip_4_digit.credential.level: + source: CREDLEV + type: integer + map: program + descripton: "Level of credential + Credentials are categorized into the following levels: + 1: Undergraduate Certificate or Diploma + 2: Associate's Degree + 3: Bachelor's Degree + 4: Post-baccalaureate Certificate + 5: Master's Degree + 6: Doctoral Degree + 7: First Professional Degree + 8: Graduate/Professional Certificate" shared_config: &shared_config contents: diff --git a/spec/lib/data_magic/nested_data_type_spec.rb b/spec/lib/data_magic/nested_data_type_spec.rb index 36e90c09..b7f156e8 100644 --- a/spec/lib/data_magic/nested_data_type_spec.rb +++ b/spec/lib/data_magic/nested_data_type_spec.rb @@ -32,9 +32,9 @@ end end - describe "builds queries based on nested datatype fields" do + describe "builds queries with on nested datatype fields depending on options passed" do context "in absence of all_programs param" do - subject { { "2016.programs.cip_4_digit" => "1312" } } + subject { { "2016.programs.cip_4_digit.code" => "1312" } } let(:expected_query) { { bool: { filter: { nested: { @@ -43,7 +43,7 @@ query: { bool: { must: [{ - match: { "2016.programs.cip_4_digit" => "1312" } + match: { "2016.programs.cip_4_digit.code" => "1312" } }] } } @@ -54,14 +54,150 @@ end context "in presence of all_programs param" do - subject {{ "2016.programs.cip_4_digit" => "1312" }} + subject {{ "2016.programs.cip_4_digit.code" => "1312" }} let(:options) {{ :all_programs => true }} - let(:expected_query) {{ match: { "2016.programs.cip_4_digit" => "1312" }} } + let(:expected_query) {{ match: { "2016.programs.cip_4_digit.code" => "1312" }} } let(:nested_meta) {{ post_es_response: {}, from: 0, size: 20, _source: {:exclude=>["_*"]} } } it_correctly "builds a query" end + + context "in presence of all_programs_nested param" do + subject {{ "2016.programs.cip_4_digit.code" => "1312" }} + let(:options) {{ :all_programs_nested => true, :fields => ["2016.programs.cip_4_digit.code.earnings.median_earnings"] }} + + let(:expected_query) { + { bool: { filter: { + nested: { + inner_hits: {}, + path: "2016.programs.cip_4_digit", + query: { + bool: { + must: [{ + match: { "2016.programs.cip_4_digit.code" => "1312" } + }] + } + } + } + } } } + } + let(:nested_meta) {{ + post_es_response: {:nested_fields_filter=>["2016.programs.cip_4_digit.code.earnings.median_earnings"]}, + from: 0, + size: 20, + _source: ["2016.programs.cip_4_digit.code.earnings.median_earnings"] + }} + + it_correctly "builds a query" + end + end + + describe "builds correct nested query objects depending on terms passed" do + context "for a single nested datatype query that takes an array of values" do + subject { { "2016.programs.cip_4_digit.credential.level" => "[2,3,5]" } } + let(:expected_query) { + { bool: { filter: { + nested: { + inner_hits: {}, + path: "2016.programs.cip_4_digit", + filter: [ + { "terms": { "2016.programs.cip_4_digit.credential.level" => [2, 3, 5]} } + ] + } + } } } + } + it_correctly "builds a query" + end + + context "when more than one terms and each term has a single value" do + subject { { + "2016.programs.cip_4_digit.code" => "1312", + "2016.programs.cip_4_digit.credential.level" => "2", + } } + let(:expected_query) { + { bool: { filter: { + nested: { + inner_hits: {}, + path: "2016.programs.cip_4_digit", + query: { + bool: { + must: [ + { match: { "2016.programs.cip_4_digit.code" => "1312" }}, + { match: { "2016.programs.cip_4_digit.credential.level" => "2" }} + ] + } + } + } + } } } + } + it_correctly "builds a query" + + end + + context "when more than one term and each term takes an array of values" do + subject { { + "2016.programs.cip_4_digit.credential.level" => "[2,3,5]", + "2016.programs.cip_4_digit.code" => "[1312,4004]", + } } + let(:expected_query) { + { bool: { filter: { + nested: { + inner_hits: {}, + path: "2016.programs.cip_4_digit", + filter: [ + { "terms": { "2016.programs.cip_4_digit.credential.level" => [2, 3, 5]} }, + { "terms": { "2016.programs.cip_4_digit.code" => [1312,4004]} } + ] + } + } } } + } + it_correctly "builds a query" + end + + context "when one term has an array of values and the other has a single value" do + subject { { + "2016.programs.cip_4_digit.credential.level" => "[2,3,5]", + "2016.programs.cip_4_digit.code" => "1312" + } } + let(:expected_query) { + { bool: { filter: { + nested: { + inner_hits: {}, + path: "2016.programs.cip_4_digit", + query: { + bool: { + filter: [ + { terms: { "2016.programs.cip_4_digit.credential.level" => [2, 3, 5]} }, + { match: { "2016.programs.cip_4_digit.code" => "1312" }} + ] + } + } + } + } } } + } + it_correctly "builds a query" + + end + end + + + describe "builds nested filter queries for terms that accept an array of values" do + context "for a single nested datatype query term" do + subject { { "2016.programs.cip_4_digit.credential.level" => "[2,3,5]" } } + let(:expected_query) { + { bool: { filter: { + nested: { + inner_hits: {}, + path: "2016.programs.cip_4_digit", + filter: [ + { "terms": { "2016.programs.cip_4_digit.credential.level" => [2, 3, 5]} } + ] + } + } } } + } + it_correctly "builds a query" + end end describe "builds queries that correctly handle fields in params" do @@ -94,7 +230,7 @@ context "only nested datatype fields are passed in params" do context "the query is NOT a nested query type" do subject {{}} - let(:fields_in_params) { ["2016.programs.cip_4_digit.code"] } + let(:fields_in_params) { ["2016.programs.cip_4_digit.code.code"] } let(:options) {{ :fields => fields_in_params }} it "assigns the fields to _source" do @@ -107,7 +243,7 @@ end context "the query is a nested query type" do - subject {{ "2016.programs.cip_4_digit" => "1312" }} + subject {{ "2016.programs.cip_4_digit.code" => "1312" }} let(:fields_in_params) { ["2016.programs.cip_4_digit.code"] } let(:options) {{ :fields => fields_in_params }} let(:source_value) { false }