diff --git a/deploy/nl/catalog.yaml b/deploy/nl/catalog.yaml index af3a474e90..bc4c3a638d 100644 --- a/deploy/nl/catalog.yaml +++ b/deploy/nl/catalog.yaml @@ -90,7 +90,7 @@ indexes: base_uae_mem: store_type: MEMORY source_path: ../../tools/nl/embeddings/input/base - embeddings_path: gs://datcom-nl-models/base_uae_mem_2024_07_11_08_35_57/embeddings.csv + embeddings_path: gs://datcom-nl-models/base_uae_mem_2024_07_12_09_03_25/embeddings.csv model: uae-large-v1-model healthcheck_query: "Life expectancy" base_mistral_mem: diff --git a/run_test.sh b/run_test.sh index bb6c4aa754..6d37dc3b8b 100755 --- a/run_test.sh +++ b/run_test.sh @@ -64,7 +64,7 @@ function run_lint_fix { pip3 install isort -q fi yapf -r -i -p --style='{based_on_style: google, indent_width: 2}' server/ nl_server/ shared/ tools/ -e=*pb2.py -e=**/.env/** - isort server/ nl_server/ shared/ tools/ --skip-glob *pb2.py --skip-glob **/.env/** --profile google + isort server/ nl_server/ shared/ tools/ --skip-glob=*pb2.py --skip-glob=**/.env/** --profile=google deactivate } diff --git a/server/integration_tests/test_data/demo_fallback/query_2/chart_config.json b/server/integration_tests/test_data/demo_fallback/query_2/chart_config.json index 4debf0ea5a..e604f84cda 100644 --- a/server/integration_tests/test_data/demo_fallback/query_2/chart_config.json +++ b/server/integration_tests/test_data/demo_fallback/query_2/chart_config.json @@ -78,16 +78,16 @@ "geoId/12" ], "statVarKey": [ - "Count_CriminalIncidents_IsHateCrime_multiple_place_bar_block" + "Count_CriminalActivities_MurderAndNonNegligentManslaughter_multiple_place_bar_block" ], - "title": "Hate Crime Incidents (${date})", + "title": "Murder and Non Negligent Manslaughter Cases (${date})", "type": "BAR" } ] } ], "denom": "Count_Person", - "title": "Hate Crime Incidents" + "title": "Murder and Non Negligent Manslaughter Cases" }, { "columns": [ @@ -104,17 +104,16 @@ "geoId/12" ], "statVarKey": [ - 
"Count_CriminalIncidents_BiasMotivationDisabilityStatus_AggravatedAssault_IsHateCrime_multiple_place_bar_block", - "Count_CriminalIncidents_BiasMotivationDisabilityStatus_Robbery_IsHateCrime_multiple_place_bar_block" + "Count_CriminalIncidents_IsHateCrime_multiple_place_bar_block" ], - "title": "Hate Crime Cases by Type (${date})", + "title": "Hate Crime Incidents (${date})", "type": "BAR" } ] } ], "denom": "Count_Person", - "title": "Hate Crime Cases by Type" + "title": "Hate Crime Incidents" }, { "columns": [ @@ -131,16 +130,17 @@ "geoId/12" ], "statVarKey": [ - "Count_CriminalActivities_MurderAndNonNegligentManslaughter_multiple_place_bar_block" + "Count_CriminalIncidents_BiasMotivationDisabilityStatus_AggravatedAssault_IsHateCrime_multiple_place_bar_block", + "Count_CriminalIncidents_BiasMotivationDisabilityStatus_Robbery_IsHateCrime_multiple_place_bar_block" ], - "title": "Murder and Non Negligent Manslaughter Cases (${date})", + "title": "Hate Crime Cases by Type (${date})", "type": "BAR" } ] } ], "denom": "Count_Person", - "title": "Murder and Non Negligent Manslaughter Cases" + "title": "Hate Crime Cases by Type" }, { "columns": [ diff --git a/server/integration_tests/test_data/demo_fallback/query_3/chart_config.json b/server/integration_tests/test_data/demo_fallback/query_3/chart_config.json index 80fa38be66..5480ad727c 100644 --- a/server/integration_tests/test_data/demo_fallback/query_3/chart_config.json +++ b/server/integration_tests/test_data/demo_fallback/query_3/chart_config.json @@ -71,9 +71,9 @@ "tiles": [ { "statVarKey": [ - "Count_CriminalIncidents_IsHateCrime" + "Count_CriminalActivities_MurderAndNonNegligentManslaughter" ], - "title": "Hate Crime Incidents in California", + "title": "Murder and Non Negligent Manslaughter Cases in California", "type": "LINE" } ] @@ -81,18 +81,18 @@ { "tiles": [ { - "description": "Hate Crime Incidents in California", + "description": "Murder and Non Negligent Manslaughter Cases in California", "statVarKey": [ - 
"Count_CriminalIncidents_IsHateCrime" + "Count_CriminalActivities_MurderAndNonNegligentManslaughter" ], - "title": "Hate Crime Incidents in California", + "title": "Murder and Non Negligent Manslaughter Cases in California", "type": "HIGHLIGHT" } ] } ], "denom": "Count_Person", - "title": "Hate Crime Incidents" + "title": "Murder and Non Negligent Manslaughter Cases" }, { "columns": [ @@ -100,47 +100,47 @@ "tiles": [ { "statVarKey": [ - "Count_CriminalIncidents_BiasMotivationDisabilityStatus_AggravatedAssault_IsHateCrime", - "Count_CriminalIncidents_BiasMotivationDisabilityStatus_Intimidation_IsHateCrime", - "Count_CriminalIncidents_BiasMotivationDisabilityStatus_Robbery_IsHateCrime" + "Count_CriminalIncidents_IsHateCrime" ], - "title": "Hate Crime Cases by Type in California", + "title": "Hate Crime Incidents in California", "type": "LINE" } ] - } - ], - "denom": "Count_Person", - "title": "Hate Crime Cases by Type" - }, - { - "columns": [ + }, { "tiles": [ { + "description": "Hate Crime Incidents in California", "statVarKey": [ - "Count_CriminalActivities_MurderAndNonNegligentManslaughter" + "Count_CriminalIncidents_IsHateCrime" ], - "title": "Murder and Non Negligent Manslaughter Cases in California", - "type": "LINE" + "title": "Hate Crime Incidents in California", + "type": "HIGHLIGHT" } ] - }, + } + ], + "denom": "Count_Person", + "title": "Hate Crime Incidents" + }, + { + "columns": [ { "tiles": [ { - "description": "Murder and Non Negligent Manslaughter Cases in California", "statVarKey": [ - "Count_CriminalActivities_MurderAndNonNegligentManslaughter" + "Count_CriminalIncidents_BiasMotivationDisabilityStatus_AggravatedAssault_IsHateCrime", + "Count_CriminalIncidents_BiasMotivationDisabilityStatus_Intimidation_IsHateCrime", + "Count_CriminalIncidents_BiasMotivationDisabilityStatus_Robbery_IsHateCrime" ], - "title": "Murder and Non Negligent Manslaughter Cases in California", - "type": "HIGHLIGHT" + "title": "Hate Crime Cases by Type in California", + 
"type": "LINE" } ] } ], "denom": "Count_Person", - "title": "Murder and Non Negligent Manslaughter Cases" + "title": "Hate Crime Cases by Type" }, { "columns": [ diff --git a/server/integration_tests/test_data/detection_api_multivar/comparemalepopulationwithfemalepopulation/debug_info.json b/server/integration_tests/test_data/detection_api_multivar/comparemalepopulationwithfemalepopulation/debug_info.json index c6f926b5fc..4e8170d3c4 100644 --- a/server/integration_tests/test_data/detection_api_multivar/comparemalepopulationwithfemalepopulation/debug_info.json +++ b/server/integration_tests/test_data/detection_api_multivar/comparemalepopulationwithfemalepopulation/debug_info.json @@ -54,7 +54,7 @@ "Parts": [ { "CosineScore": [ - 0.9257857799530029 + 0.9257858991622925 ], "QueryPart": "male population", "SV": [ @@ -63,8 +63,8 @@ }, { "CosineScore": [ - 0.9296980500221252, - 0.8847433924674988 + 0.9296978712081909, + 0.884743332862854 ], "QueryPart": "female population", "SV": [ @@ -80,7 +80,7 @@ "Parts": [ { "CosineScore": [ - 0.9117770791053772 + 0.911777138710022 ], "QueryPart": "male population female", "SV": [ @@ -89,8 +89,8 @@ }, { "CosineScore": [ - 0.8982378840446472, - 0.8723467588424683 + 0.8982377648353577, + 0.8723466396331787 ], "QueryPart": "population", "SV": [ @@ -106,9 +106,9 @@ "Parts": [ { "CosineScore": [ - 0.8034241199493408, - 0.8017654418945312, - 0.775155782699585, + 0.8034241795539856, + 0.8017653822898865, + 0.7751558423042297, 0.7698014974594116, 0.7586972713470459 ], diff --git a/server/integration_tests/test_data/detection_api_multivar/howarefactorslikeobesity,bloodpressureandasthmaimpactedbyclimatechange/debug_info.json b/server/integration_tests/test_data/detection_api_multivar/howarefactorslikeobesity,bloodpressureandasthmaimpactedbyclimatechange/debug_info.json index 0c125dcdfd..4f89c0a024 100644 --- 
a/server/integration_tests/test_data/detection_api_multivar/howarefactorslikeobesity,bloodpressureandasthmaimpactedbyclimatechange/debug_info.json +++ b/server/integration_tests/test_data/detection_api_multivar/howarefactorslikeobesity,bloodpressureandasthmaimpactedbyclimatechange/debug_info.json @@ -113,10 +113,10 @@ "Parts": [ { "CosineScore": [ - 0.7768431901931763, - 0.7495046257972717, - 0.7342654466629028, - 0.7325114011764526 + 0.7768430709838867, + 0.749504566192627, + 0.7342653274536133, + 0.7325113415718079 ], "QueryPart": "factors like obesity blood pressure asthma", "SV": [ @@ -128,7 +128,7 @@ }, { "CosineScore": [ - 0.9056920409202576 + 0.9056921005249023 ], "QueryPart": "impacted climate change", "SV": [ diff --git a/server/integration_tests/test_data/detection_translate_chinese/chart_config.json b/server/integration_tests/test_data/detection_translate_chinese/chart_config.json index 9e7293495c..b2885090eb 100644 --- a/server/integration_tests/test_data/detection_translate_chinese/chart_config.json +++ b/server/integration_tests/test_data/detection_translate_chinese/chart_config.json @@ -11,9 +11,6 @@ "contained_in_place_type": "City", "had_default_type": false, "type": 4 - }, - { - "type": 14 } ], "client": "test_detect", diff --git a/server/integration_tests/test_data/e2e_edge_cases2/povertyvs.unemploymentrateindistrictsoftamilnadu/chart_config.json b/server/integration_tests/test_data/e2e_edge_cases2/povertyvs.unemploymentrateindistrictsoftamilnadu/chart_config.json index b053c625e5..9af6fd1c7c 100644 --- a/server/integration_tests/test_data/e2e_edge_cases2/povertyvs.unemploymentrateindistrictsoftamilnadu/chart_config.json +++ b/server/integration_tests/test_data/e2e_edge_cases2/povertyvs.unemploymentrateindistrictsoftamilnadu/chart_config.json @@ -31,7 +31,6 @@ } ], "denom": "Count_Person", - "startWithDenom": true, "title": "Population Below Poverty Line" }, { diff --git a/server/integration_tests/test_data/e2e_high_sv_threshold/chart_config.json 
b/server/integration_tests/test_data/e2e_high_sv_threshold/chart_config.json index a78e115aac..fc7fd99909 100644 --- a/server/integration_tests/test_data/e2e_high_sv_threshold/chart_config.json +++ b/server/integration_tests/test_data/e2e_high_sv_threshold/chart_config.json @@ -62,7 +62,6 @@ } ], "denom": "Count_Person", - "startWithDenom": true, "title": "Count of Mortality Event: 1 Years or Less in Counties of Massachusetts" }, { @@ -83,7 +82,6 @@ } ], "denom": "Count_Person", - "startWithDenom": true, "title": "Causes of Infant Mortality" }, { @@ -116,7 +114,6 @@ } ], "denom": "Count_Person", - "startWithDenom": true, "title": "Mortality Events (1 Years or Less): Symptoms, Signs and Abnormal Clinical and Laboratory Findings, Not Elsewhere Classified) in Counties of Massachusetts" }, { @@ -149,7 +146,6 @@ } ], "denom": "Count_Person", - "startWithDenom": true, "title": "Mortality Events (1 Years or Less): Certain Conditions Originating in the Perinatal Period) in Counties of Massachusetts" }, { diff --git a/server/integration_tests/test_data/e2e_india_demo/howdoesliteracyratecomparetopovertyinindia/chart_config.json b/server/integration_tests/test_data/e2e_india_demo/howdoesliteracyratecomparetopovertyinindia/chart_config.json index be3e6d0b80..9425dcb40d 100644 --- a/server/integration_tests/test_data/e2e_india_demo/howdoesliteracyratecomparetopovertyinindia/chart_config.json +++ b/server/integration_tests/test_data/e2e_india_demo/howdoesliteracyratecomparetopovertyinindia/chart_config.json @@ -23,7 +23,6 @@ } ], "denom": "Count_Person", - "startWithDenom": true, "title": "Population: Literate vs. 
Population Below poverty line" }, { @@ -56,7 +55,6 @@ } ], "denom": "Count_Person", - "startWithDenom": true, "title": "Population: Literate in Administrative Area 1 Places of India" }, { @@ -89,7 +87,6 @@ } ], "denom": "Count_Person", - "startWithDenom": true, "title": "Population Below Poverty Line in Administrative Area 1 Places of India" }, { @@ -111,7 +108,6 @@ } ], "denom": "Count_Person", - "startWithDenom": true, "title": "Population: Illiterate vs. Population Below poverty line (Per Capita)" }, { @@ -195,7 +191,6 @@ } ], "denom": "Count_Person", - "startWithDenom": true, "title": "Population: Literate vs. Population Below poverty line (Per Capita)" }, { @@ -217,7 +212,6 @@ } ], "denom": "Count_Person", - "startWithDenom": true, "title": "Population: Illiterate vs. Population Below poverty line" } ], diff --git a/server/integration_tests/test_data/e2e_india_demo/howdoestheliteracyratecompare/chart_config.json b/server/integration_tests/test_data/e2e_india_demo/howdoestheliteracyratecompare/chart_config.json index bbe01500ef..c0e50c2620 100644 --- a/server/integration_tests/test_data/e2e_india_demo/howdoestheliteracyratecompare/chart_config.json +++ b/server/integration_tests/test_data/e2e_india_demo/howdoestheliteracyratecompare/chart_config.json @@ -36,7 +36,6 @@ } ], "denom": "Count_Person", - "startWithDenom": true, "title": "Population: Literate" }, { @@ -71,7 +70,6 @@ } ], "denom": "Count_Person", - "startWithDenom": true, "title": "Population: Illiterate" }, { @@ -139,7 +137,6 @@ } ], "denom": "Count_Person", - "startWithDenom": true, "title": "Population: Literate, Urban" }, { @@ -174,7 +171,6 @@ } ], "denom": "Count_Person", - "startWithDenom": true, "title": "Population: Male, Literate" }, { @@ -209,7 +205,6 @@ } ], "denom": "Count_Person", - "startWithDenom": true, "title": "Population: Literate, Rural" }, { @@ -244,7 +239,6 @@ } ], "denom": "Count_Person", - "startWithDenom": true, "title": "Population: Illiterate, Urban" }, { @@ -279,7 +273,6 @@ 
} ], "denom": "Count_Person", - "startWithDenom": true, "title": "Population: Female, Literate" }, { @@ -314,7 +307,6 @@ } ], "denom": "Count_Person", - "startWithDenom": true, "title": "Population: Male, Illiterate" }, { @@ -349,7 +341,6 @@ } ], "denom": "Count_Person", - "startWithDenom": true, "title": "Population: Illiterate, Rural" }, { @@ -384,7 +375,6 @@ } ], "denom": "Count_Person", - "startWithDenom": true, "title": "Population: Female, Illiterate" }, { @@ -419,7 +409,6 @@ } ], "denom": "Count_Person", - "startWithDenom": true, "title": "Population: Houseless, Literate" }, { @@ -454,7 +443,6 @@ } ], "denom": "Count_Person", - "startWithDenom": true, "title": "Population: Houseless, Illiterate" }, { @@ -489,7 +477,6 @@ } ], "denom": "Count_Person", - "startWithDenom": true, "title": "Population: Houseless, Literate, Urban" }, { @@ -524,7 +511,6 @@ } ], "denom": "Count_Person", - "startWithDenom": true, "title": "Population: Houseless, Illiterate, Urban" }, { @@ -559,7 +545,6 @@ } ], "denom": "Count_Person", - "startWithDenom": true, "title": "Population: Houseless, Literate, Rural" } ], diff --git a/server/integration_tests/test_data/e2e_toolformer_rig_mode/whatistheinfantmortalityrateinmassachusetts/chart_config.json b/server/integration_tests/test_data/e2e_toolformer_rig_mode/whatistheinfantmortalityrateinmassachusetts/chart_config.json index d15f3e6c0a..5ef8a767ca 100644 --- a/server/integration_tests/test_data/e2e_toolformer_rig_mode/whatistheinfantmortalityrateinmassachusetts/chart_config.json +++ b/server/integration_tests/test_data/e2e_toolformer_rig_mode/whatistheinfantmortalityrateinmassachusetts/chart_config.json @@ -115,7 +115,6 @@ } ], "denom": "Count_Person", - "startWithDenom": true, "title": "Mortality Events (1 Years or Less): Certain Conditions Originating in the Perinatal Period)" }, { @@ -148,7 +147,6 @@ } ], "denom": "Count_Person", - "startWithDenom": true, "title": "Mortality Events (1 Years or Less): Certain Conditions Originating in 
the Perinatal Period) in Counties of Massachusetts" }, { @@ -206,7 +204,6 @@ } ], "denom": "Count_Person", - "startWithDenom": true, "title": "Mortality Events (1 Years or Less): Diseases of the Circulatory System" }, { @@ -236,7 +233,6 @@ } ], "denom": "Count_Person", - "startWithDenom": true, "title": "Mortality Events (1 Years or Less): Q00-Q99 ( Congenital Malformations, Deformations and Chromosomal Abnormalities)" }, { @@ -269,7 +265,6 @@ } ], "denom": "Count_Person", - "startWithDenom": true, "title": "Mortality Events (1 Years or Less): Q00-Q99 ( Congenital Malformations, Deformations and Chromosomal Abnormalities) in Counties of Massachusetts" } ], diff --git a/server/integration_tests/test_data/e2e_toolformer_rig_mode/whatisthepovertyrateinseattle/chart_config.json b/server/integration_tests/test_data/e2e_toolformer_rig_mode/whatisthepovertyrateinseattle/chart_config.json index e37e40b845..84f31e7a88 100644 --- a/server/integration_tests/test_data/e2e_toolformer_rig_mode/whatisthepovertyrateinseattle/chart_config.json +++ b/server/integration_tests/test_data/e2e_toolformer_rig_mode/whatisthepovertyrateinseattle/chart_config.json @@ -31,7 +31,6 @@ } ], "denom": "Count_Person", - "startWithDenom": true, "title": "Population Below Poverty Line" }, { @@ -61,7 +60,6 @@ } ], "denom": "Count_Person", - "startWithDenom": true, "title": "Below Poverty Line (Two or More Races)" }, { @@ -91,7 +89,6 @@ } ], "denom": "Count_Person", - "startWithDenom": true, "title": "Males in Poverty (In Last Year)" }, { @@ -121,7 +118,6 @@ } ], "denom": "Count_Person", - "startWithDenom": true, "title": "Women Below Poverty Line" } ], diff --git a/server/integration_tests/test_data/e2e_us_demo/howdoobesityratescomparewithratesofdiabetesinusacounties/chart_config.json b/server/integration_tests/test_data/e2e_us_demo/howdoobesityratescomparewithratesofdiabetesinusacounties/chart_config.json index 459d5a0bc9..bedf01a74e 100644 --- 
a/server/integration_tests/test_data/e2e_us_demo/howdoobesityratescomparewithratesofdiabetesinusacounties/chart_config.json +++ b/server/integration_tests/test_data/e2e_us_demo/howdoobesityratescomparewithratesofdiabetesinusacounties/chart_config.json @@ -173,7 +173,6 @@ } ], "denom": "Count_Person", - "startWithDenom": true, "title": "Prevalence: Male, Obesity vs. Diabetes" }, { diff --git a/server/lib/nl/detection/query_util.py b/server/lib/nl/detection/query_util.py index 0d3972ff12..29067c404b 100644 --- a/server/lib/nl/detection/query_util.py +++ b/server/lib/nl/detection/query_util.py @@ -20,12 +20,6 @@ from shared.lib import utils -# TODO: decouple words removal from detected attributes. Today, the removal -# blanket removes anything that matches, including the various attribute/ -# classification triggers and contained_in place types (and their plurals). -# This may not always be the best thing to do. -ALL_STOP_WORDS = utils.combine_stop_words() - # Use comma, "vs.", semi-colon, "and", ampersand as delimiters. _REGEX_DELIMITERS = r',|vs|;|and|&' # Regex to extract out substrings within double quotes. @@ -100,14 +94,14 @@ def get_parts_via_delimiters(query): return parts -def _prepare_queryset_via_delimiters(query: str, - querysets: List[QuerySet]) -> int: +def _prepare_queryset_via_delimiters(query: str, querysets: List[QuerySet], + stop_words: List[str]) -> int: parts = get_parts_via_delimiters(query) if len(parts) == 1: return 0 cleaned_parts = [] for p in parts: - p = utils.remove_stop_words(utils.remove_punctuations(p), ALL_STOP_WORDS) + p = utils.remove_stop_words(utils.remove_punctuations(p), stop_words) if p: cleaned_parts.append(p) if not cleaned_parts: @@ -123,13 +117,14 @@ def _prepare_queryset_via_delimiters(query: str, # # Returns combinations of |query| string parts of upto _MAX_SVS splits. 
# -def prepare_multivar_querysets(query: str, max_svs: int) -> List[QuerySet]: +def prepare_multivar_querysets(query: str, max_svs: int, + stop_words: List[str]) -> List[QuerySet]: querysets: List[QuerySet] = [] - delim_nsplits = _prepare_queryset_via_delimiters(query, querysets) + delim_nsplits = _prepare_queryset_via_delimiters(query, querysets, stop_words) query = utils.remove_punctuations(query) - query = utils.remove_stop_words(query, ALL_STOP_WORDS) + query = utils.remove_stop_words(query, stop_words) query_parts = [x.strip() for x in query.split(' ') if x.strip()] max_splits = min(max_svs, len(query_parts)) diff --git a/server/lib/nl/detection/variable.py b/server/lib/nl/detection/variable.py index 81e19f6e8a..800add777d 100644 --- a/server/lib/nl/detection/variable.py +++ b/server/lib/nl/detection/variable.py @@ -17,7 +17,6 @@ from typing import Dict, List -import server.lib.nl.common.counters as ctr from server.lib.nl.detection import query_util from server.lib.nl.detection.types import DetectionArgs import server.lib.nl.detection.utils as dutils @@ -27,12 +26,6 @@ import shared.lib.detected_variables as vars import shared.lib.utils as shared_utils -# TODO: decouple words removal from detected attributes. Today, the removal -# blanket removes anything that matches, including the various attribute/ -# classification triggers and contained_in place types (and their plurals). -# This may not always be the best thing to do. -ALL_STOP_WORDS = shared_utils.combine_stop_words() - # A value higher than the highest score. _HIGHEST_SCORE = 1.0 _INIT_SCORE = (_HIGHEST_SCORE + 0.1) @@ -49,6 +42,15 @@ # def detect_vars(orig_query: str, debug_logs: Dict, dargs: DetectionArgs) -> vars.VarDetectionResult: + + # Get the list of stop words to use depending on if this is toolformer mode + # or not. 
+ if params.is_toolformer_mode(dargs.mode): + stop_words = shared_utils.combine_stop_words( + constants.HEURISTIC_TYPES_IN_VARIABLES_TOOLFORMER) + else: + stop_words = shared_utils.combine_stop_words() + # # 1. Prepare all the queries for embeddings lookup, both mono-var and multi-var. # @@ -60,8 +62,7 @@ def detect_vars(orig_query: str, debug_logs: Dict, if dargs.include_stop_words: query_monovar = orig_query else: - query_monovar = shared_utils.remove_stop_words(orig_query, - query_util.ALL_STOP_WORDS) + query_monovar = shared_utils.remove_stop_words(orig_query, stop_words) if not query_monovar.strip(): # Empty user query! Return empty results return dutils.empty_var_detection_result() @@ -70,7 +71,8 @@ def detect_vars(orig_query: str, debug_logs: Dict, # Try to detect multiple SVs. Use the original query so that # the logic can rely on stop-words like `vs`, `and`, etc as hints # for SV delimiters. - multi_querysets, multi_queries = _prepare_multivar_queries(orig_query) + multi_querysets, multi_queries = _prepare_multivar_queries( + orig_query, stop_words) all_queries.extend(multi_queries) # @@ -108,12 +110,14 @@ def detect_vars(orig_query: str, debug_logs: Dict, # TODO: Fix the query upstream to ensure the punctuations aren't stripped. # def _prepare_multivar_queries( - query: str) -> tuple[List[query_util.QuerySet], List[str]]: + query: str, + stop_words: List[str]) -> tuple[List[query_util.QuerySet], List[str]]: # # Prepare a combination of query-sets. 
# querysets = query_util.prepare_multivar_querysets(query, - max_svs=_MAX_MULTIVAR_PARTS) + max_svs=_MAX_MULTIVAR_PARTS, + stop_words=stop_words) # Make a unique list of query strings all_queries = set() diff --git a/server/tests/lib/nl/detection/query_util_test.py b/server/tests/lib/nl/detection/query_util_test.py index 3e95c69907..bef03b2772 100644 --- a/server/tests/lib/nl/detection/query_util_test.py +++ b/server/tests/lib/nl/detection/query_util_test.py @@ -21,6 +21,7 @@ from server.lib.nl.detection.query_util import prepare_multivar_querysets from server.lib.nl.detection.query_util import QuerySet from server.lib.nl.detection.query_util import QuerySplit +from shared.lib.utils import combine_stop_words class TestGetPartsViaDelimiters(unittest.TestCase): @@ -181,4 +182,7 @@ class TestPrepareMultivarQuerysets(unittest.TestCase): ]]) def test_prepare_multivar_querysets(self, query, expected): self.maxDiff = None - self.assertEqual(prepare_multivar_querysets(query, max_svs=4), expected) + self.assertEqual( + prepare_multivar_querysets(query, + max_svs=4, + stop_words=combine_stop_words()), expected) diff --git a/shared/lib/constants.py b/shared/lib/constants.py index 81711eef1c..0efb69859e 100644 --- a/shared/lib/constants.py +++ b/shared/lib/constants.py @@ -15,6 +15,8 @@ from typing import Dict, FrozenSet, List, Set, Union +_RATE_WORDS_TO_SKIP = "(birth|change|death|exchange|fertility|literacy|mortality|participation|unemployment|withdrawal)" + STOP_WORDS: Set[str] = { 'ourselves', 'hers', @@ -272,8 +274,14 @@ # together with ContainedInPlace.
"AnswerPlacesReference": ["these", "those"], "PerCapita": [ - "fraction", "percent", "percentage", "per capita", "percapita", - "per person", "rate", "rates" + "fraction", + "percent", + "percentage", + "per capita", + "percapita", + "per person", + # Regex: match "rate"/"rates" unless followed by "<skip word> rate(s)"; must be a raw string so \b stays a regex word boundary (not a backspace). + rf"\brate(s)?\b(?!\s*{_RATE_WORDS_TO_SKIP}\s+rate(s)?)", ], "Temporal": [ # Day of week @@ -309,10 +317,17 @@ ] } -# We do not want to strip words from events / superlatives / temporal -# since we want those to match SVs too! +# By default, we do not want to strip words from these heuristics because they +# can match SVs too: events / superlatives / temporal / percapita. +# We want to keep per capita because queries like "theft rates" without the +# per capita stop words will become "theft" which has trouble matching plurals +# and stat based descriptions. HEURISTIC_TYPES_IN_VARIABLES = frozenset( ["Event", "Superlative", "Temporal", "PerCapita"]) +# For toolformer, we do want to strip words from PerCapita heuristics because +# we care about top matches being more accurate. +HEURISTIC_TYPES_IN_VARIABLES_TOOLFORMER = frozenset( + ["Event", "Superlative", "Temporal"]) PLACE_TYPE_TO_PLURALS: Dict[str, str] = { "place": "places", diff --git a/shared/lib/utils.py b/shared/lib/utils.py index a01b1d84f9..7b706857f0 100644 --- a/shared/lib/utils.py +++ b/shared/lib/utils.py @@ -38,14 +38,15 @@ def _add_to_set_from_list(set_strings: Set[str], set_strings.add(v_str.lower()) -def _add_classification_heuristics(set_strings: Set[str]) -> None: +def _add_classification_heuristics(set_strings: Set[str], + heuristics_to_skip: Set[str]) -> None: """Adds (in place) relevant stop words in QUERY_CLASSIFICATION_HEURISTICS. Args: set_strings: the set of Strings to add to. 
""" for (ctype, v) in constants.QUERY_CLASSIFICATION_HEURISTICS.items(): - if ctype in constants.HEURISTIC_TYPES_IN_VARIABLES: + if ctype in heuristics_to_skip: continue if isinstance(v, list): # If 'v' is a list, add all the words. @@ -131,13 +132,19 @@ def list_place_type_stopwords() -> List[str]: return place_type_stop_words -def combine_stop_words() -> Set[str]: +# TODO: decouple words removal from detected attributes. Today, the removal +# blanket removes anything that matches, including the various attribute/ +# classification triggers and contained_in place types (and their plurals). +# This may not always be the best thing to do. +def combine_stop_words( + heuristics_to_skip: Set[str] = constants.HEURISTIC_TYPES_IN_VARIABLES +) -> List[str]: """Returns all the combined stop words from the various constants.""" # Make a copy. stop_words = copy.deepcopy(constants.STOP_WORDS) # Now add the words in the classification heuristics. - _add_classification_heuristics(stop_words) + _add_classification_heuristics(stop_words, heuristics_to_skip) _add_to_set_from_list(stop_words, list_place_type_stopwords()) diff --git a/tools/nl/embeddings/input/base/_preindex.csv b/tools/nl/embeddings/input/base/_preindex.csv index 29252d54bb..303ef92200 100644 --- a/tools/nl/embeddings/input/base/_preindex.csv +++ b/tools/nl/embeddings/input/base/_preindex.csv @@ -1802,7 +1802,7 @@ Number of Merchant Wholesalers establishments,Count_Establishment_USC_MerchantWh Number of Milk Cows in Farm Inventory,Count_FarmInventory_MilkCows "Number of Mining, Quarrying, And Oil And Gas Extraction establishments",Count_Establishment_NAICSMiningQuarryingOilGasExtraction;dc/br6elkd593zs1 Number of Mumps Cases,Count_MedicalConditionIncident_ConditionMumps -Number of Murder and Non Negligent Manslaughter per capita,Count_CriminalActivities_MurderAndNonNegligentManslaughter_AsFractionOf_Count_Person +Number of Murder and non-negligent manslaughter per 
capita,Count_CriminalActivities_MurderAndNonNegligentManslaughter_AsFractionOf_Count_Person Number of Muslim boys,indianCensus/Count_Person_Religion_Muslim_YearsUpto6_Male Number of Muslim children,indianCensus/Count_Person_Religion_Muslim_YearsUpto6 Number of Muslim children in rural areas,indianCensus/Count_Person_Religion_Muslim_YearsUpto6_Rural @@ -2510,7 +2510,7 @@ Number of married couple households with a householder with some college or an a Number of married females,Count_Person_Female_MarriedAndNotSeparated Number of married foreign born people,Count_Person_15OrMoreYears_MarriedAndNotSeparated_ForeignBorn Number of married native people,Count_Person_15OrMoreYears_MarriedAndNotSeparated_Native -Number of married people,Count_Person_MarriedAndNotSeparated;Count_Person_NowMarried +Number of married people,Count_Person_MarriedAndNotSeparated Number of married people of multi-races,Count_Person_15OrMoreYears_NeverMarried_TwoOrMoreRaces Number of married people of one race,Count_Person_15OrMoreYears_MarriedAndNotSeparated_OneRace Number of married people of two or more races,Count_Person_15OrMoreYears_MarriedAndNotSeparated_TwoOrMoreRaces @@ -2520,6 +2520,7 @@ Number of married people who reside in group quarters,Count_Person_MarriedAndNot Number of married people who reside in institutionalized group quarters,Count_Person_MarriedAndNotSeparated_ResidesInInstitutionalizedGroupQuarters Number of married people who reside in non-institutionalized group quarters,Count_Person_MarriedAndNotSeparated_ResidesInNoninstitutionalizedGroupQuarters Number of married people who reside in nursing facilities,Count_Person_MarriedAndNotSeparated_ResidesInNursingFacilities +"Number of married people, including separated",Count_Person_NowMarried Number of married white people,Count_Person_15OrMoreYears_MarriedAndNotSeparated_WhiteAlone Number of mobile phone subscriptions per person,Count_Product_MobileCellularSubscription_AsFractionOf_Count_Person Number of multi race people 
with disabilities,Count_Person_WithDisability_TwoOrMoreRaces @@ -2882,7 +2883,6 @@ People with health insurance split by race,dc/topic/WithHealthInsuranceByRace People with high blood pressure,dc/topic/BloodPressure People with high cholestrol,dc/topic/Cholesterol Percent of area that are burned in land fire,Percent_BurnedArea_FireEvent -Percent of females commiting intentional homicide,Count_CriminalActivities_MurderAndNonNegligentManslaughter_Female_AsFractionOf_Count_Person_Female Percent of forest area that burned,Percent_BurnedArea_FireEvent_Forest Percent of forest area that experienced high-severity fire,Percent_BurnedArea_FireEvent_Forest_HighSeverity Percent of illiterate people older than 60,Count_Person_60OrMoreYears_Illiterate_AsAFractionOf_Count_Person_60OrMoreYears @@ -2896,7 +2896,6 @@ Percent of land covered by Permanent Water,LandCoverFraction_PermanentWater Percent of land covered by Seasonal Water,LandCoverFraction_SeasonalWater Percent of land covered by Shrubland,LandCoverFraction_Shrubland Percent of land covered by Snow Ice,LandCoverFraction_SnowIce -Percent of males commiting intentional homicide,Count_CriminalActivities_MurderAndNonNegligentManslaughter_Male_AsFractionOf_Count_Person_Male Percentage of Bcg Immunization Coverage Among 1 Year Old female,WHO/bcgv_Female Percentage of Bcg Immunization Coverage Among 1 Year Old in Rural Areas,WHO/bcgv_Rural Percentage of Bcg Immunization Coverage Among 1 Year Old in Urban areas,WHO/bcgv_Urban @@ -4181,7 +4180,6 @@ health in the world,dc/topic/GlobalHealth health insurance private type,dc/topic/PrivateHealthInsuranceType health insurance public type,dc/topic/PublicHealthInsuranceType health of mothers,dc/topic/MaternalHealth -heart attack,dc/topic/Stroke high cholestrol condition,dc/topic/Cholesterol high humidity and high heat,dc/topic/WetBulbEvent high rainfall,dc/topic/Precipitation @@ -4419,6 +4417,8 @@ number of males without health insurance,Count_Person_Male_NoHealthInsurance number of 
multi race females living Above Poverty Level in the past 12 months,Count_Person_Female_AbovePovertyLevelInThePast12Months_TwoOrMoreRaces number of multi races females living below poverty level in the past 12 months,Count_Person_Female_BelowPovertyLevelInThePast12Months_TwoOrMoreRaces number of multi-race males,Count_Person_Male_TwoOrMoreRaces +number of murders,Count_CriminalActivities_MurderAndNonNegligentManslaughter +number of murders per capita,Count_CriminalActivities_MurderAndNonNegligentManslaughter_AsFractionOf_Count_Person number of native hawaiian or other pacific islander females living Above Poverty Level in the Past 12 months,Count_Person_Female_AbovePovertyLevelInThePast12Months_NativeHawaiianOrOtherPacificIslanderAlone number of native hawaiian or other pacific islander females living below poverty level in the past 12 months,Count_Person_Female_BelowPovertyLevelInThePast12Months_NativeHawaiianOrOtherPacificIslanderAlone number of never married males aged 15 years or older,Count_Person_Male_NeverMarried @@ -4632,10 +4632,14 @@ proportion of children aged 5 and under who are overweight,WHO/NUTOVERWEIGHTPREV proportion of children aged 6-59 Months with anaemia,WHO/NUTRITION_ANAEMIA_CHILDREN_PREV proportion of children with asthma,Percent_Person_Children_WithAsthma proportion of females aged 15+ who smoke,Count_Person_15OrMoreYears_Female_Smoking_AsFractionOf_Count_Person_15OrMoreYears_Female +proportion of females commiting murder,Count_CriminalActivities_MurderAndNonNegligentManslaughter_Female_AsFractionOf_Count_Person_Female +proportion of females commiting murder and non-negligent manslaughter,Count_CriminalActivities_MurderAndNonNegligentManslaughter_Female_AsFractionOf_Count_Person_Female proportion of females who use tobacco,Percent_TobaccoUsing_In_Count_Person_Female proportion of females with obesity,dc/4lvmzr1h1ylk1 proportion of low birth weight,WHO/LBW_PREVALENCE proportion of males aged 15+ who 
smoke,Count_Person_15OrMoreYears_Male_Smoking_AsFractionOf_Count_Person_15OrMoreYears_Male +proportion of males commiting murder,Count_CriminalActivities_MurderAndNonNegligentManslaughter_Male_AsFractionOf_Count_Person_Male +proportion of males commiting murder and non-negligent manslaughter,Count_CriminalActivities_MurderAndNonNegligentManslaughter_Male_AsFractionOf_Count_Person_Male proportion of males who use tobacco,Percent_TobaccoUsing_In_Count_Person_Male proportion of men online,dc/topic/sdg_17.8.1 proportion of people Who Are Physically Inactive,Percent_Person_PhysicalInactivity diff --git a/tools/nl/embeddings/input/base/main_topics.csv b/tools/nl/embeddings/input/base/main_topics.csv index 1745a7402d..2ac9a7bf7f 100644 --- a/tools/nl/embeddings/input/base/main_topics.csv +++ b/tools/nl/embeddings/input/base/main_topics.csv @@ -297,7 +297,7 @@ dc/topic/SocialSupport,Social Support;Social Support dc/topic/SolarConsumption,Solar Consumption;Solar Consumption dc/topic/SolarPotential,Solar Potential;Solar Potential;Solar potential dc/topic/Storm,Storm;Storm -dc/topic/Stroke,Stroke;Stroke;stroke attack;heart attack +dc/topic/Stroke,Stroke;Stroke;stroke attack dc/topic/StrokeFemalePopulationByAge,Female Population With Stroke By Age;Female Population With Stroke By Age dc/topic/StrokeMalePopulationByAge,Male Population With Stroke By Age;Male Population With Stroke By Age dc/topic/StudentEnrollmentLevels,Student Enrollment Levels;Student Enrollment Levels;enrollement levels of students;number of students enrolled;students currently enrolled diff --git a/tools/nl/embeddings/input/base/sheets_svs.csv b/tools/nl/embeddings/input/base/sheets_svs.csv index a28c323a84..88759416b6 100644 --- a/tools/nl/embeddings/input/base/sheets_svs.csv +++ b/tools/nl/embeddings/input/base/sheets_svs.csv @@ -1142,10 +1142,10 @@ Count_CriminalActivities_CombinedCrime,Number of crimes Count_CriminalActivities_ForcibleRape,Number of Rapes Count_CriminalActivities_LarcenyTheft,Number of 
larceny theft Count_CriminalActivities_MotorVehicleTheft,Number of car thefts -Count_CriminalActivities_MurderAndNonNegligentManslaughter,Number of murders and non-negligent manslaughters -Count_CriminalActivities_MurderAndNonNegligentManslaughter_AsFractionOf_Count_Person,Number of Murder and Non Negligent Manslaughter per capita -Count_CriminalActivities_MurderAndNonNegligentManslaughter_Female_AsFractionOf_Count_Person_Female,Percent of females commiting intentional homicide -Count_CriminalActivities_MurderAndNonNegligentManslaughter_Male_AsFractionOf_Count_Person_Male,Percent of males commiting intentional homicide +Count_CriminalActivities_MurderAndNonNegligentManslaughter,Number of murders and non-negligent manslaughters;number of murders +Count_CriminalActivities_MurderAndNonNegligentManslaughter_AsFractionOf_Count_Person,Number of Murder and non-negligent manslaughter per capita;number of murders per capita +Count_CriminalActivities_MurderAndNonNegligentManslaughter_Female_AsFractionOf_Count_Person_Female,proportion of females commiting murder and non-negligent manslaughter;proportion of females commiting murder +Count_CriminalActivities_MurderAndNonNegligentManslaughter_Male_AsFractionOf_Count_Person_Male,proportion of males commiting murder and non-negligent manslaughter;proportion of males commiting murder Count_CriminalActivities_PropertyCrime,Number of property crime incidents Count_CriminalActivities_Robbery,Number of robberies Count_CriminalActivities_ViolentCrime,Number of violent crimes