From 0b69cfc5b37f248716ff454b3315fd006486386c Mon Sep 17 00:00:00 2001
From: Ajai Tirumali
Date: Mon, 9 Dec 2024 13:54:29 +0530
Subject: [PATCH 1/3] Utils for statvar processor for string lookup

---
 scripts/statvar/mcf_file_util.py             | 334 ++++++----
 scripts/statvar/ngram_matcher.py             | 225 +++++++
 scripts/statvar/ngram_matcher_test.py        |  42 ++
 scripts/statvar/property_value_cache.py      | 447 +++++++++++++
 scripts/statvar/property_value_cache_test.py | 127 ++++
 scripts/statvar/property_value_mapper.py     | 627 +++++++++++++++++++
 scripts/statvar/property_value_utils.py      | 154 +++++
 7 files changed, 1831 insertions(+), 125 deletions(-)
 create mode 100644 scripts/statvar/ngram_matcher.py
 create mode 100644 scripts/statvar/ngram_matcher_test.py
 create mode 100644 scripts/statvar/property_value_cache.py
 create mode 100644 scripts/statvar/property_value_cache_test.py
 create mode 100644 scripts/statvar/property_value_mapper.py
 create mode 100644 scripts/statvar/property_value_utils.py

diff --git a/scripts/statvar/mcf_file_util.py b/scripts/statvar/mcf_file_util.py
index 14dabe0685..95c83ce3a4 100644
--- a/scripts/statvar/mcf_file_util.py
+++ b/scripts/statvar/mcf_file_util.py
@@ -147,15 +147,15 @@ def strip_namespace(value: str) -> str:
 def strip_value(value: str) -> str:
     """Returns the string value with leading/trailing spaces stripped.
 
-    Args:
-      value: string to be cleaned.
+    Args:
+        value: string to be cleaned.
 
-    Returns:
-      string without extra leading and trailing spaces.
-    """
+    Returns:
+        string without extra leading and trailing spaces.
+    """
     if value and isinstance(value, str):
         value = value.strip()
-        if value[0] == '"' and value[-1] == '"':
+        if value and value[0] == '"' and value[-1] == '"':
             value_str = value[1:-1]
             value_str.strip()
             value = '"' + value_str + '"'
@@ -213,21 +213,31 @@ def add_pv_to_node(
     if value and ',' in value:
         # Split the comma separated value into a list.
         value = normalize_list(value, False)
-    if not value:
-        return node
+    # Allow empty value
+    # if not value:
+    #     return node
+    value_list = []
     if isinstance(value, list):
-        # Add each value recursively.
-        for v in value:
+        value_list = value
+    elif isinstance(value, str) and ',' in value:
+        value_list = get_value_list(value)
+    if value_list:
+        if len(value_list) == 1:
+            value = value_list[0]
+        else:
+            # Add each value recursively.
+            for v in value_list:
                 add_pv_to_node(prop, v, node, append_value, strip_namespaces,
                                normalize)
-        return node
-    if not value:
-        return node
+            return node
+    # allow empty values
+    # if not value:
+    #     return node
     existing_value = node.get(prop)
-    if existing_value and prop != 'Node' and prop != 'dcid':
+    if existing_value is not None and prop != 'Node' and prop != 'dcid':
         # Property already exists. Add value to a list if not present.
-        if value and value != existing_value and value not in existing_value.split(
-                ','):
+        if (value is not None and value != existing_value and
+                value not in existing_value.split(',')):
             if append_value:
                 # Append value to a list of existing values
                 node[prop] = f'{node[prop]},{value}'
@@ -254,8 +264,15 @@ def add_comment_to_node(comment: str, node: dict) -> dict:
         for example, '# comment1', '# comment2'.
     """
     # Count the existing comments in the node.
-    comments = [c for c in node.keys() if c and c[0] == '#']
-    next_comment_index = len(comments) + 1
+    num_comments = 0
+    for c, v in node.items():
+        if not c or c[0] != '#':
+            continue
+        if v == comment:
+            # skip existing comment
+            return node
+        num_comments += 1
+    next_comment_index = num_comments + 1
     # Add the new comment with the next index.
     node[f'# comment{next_comment_index}'] = comment
     return node
@@ -289,14 +306,16 @@ def add_mcf_node(
     normalize: bool = True,
 ) -> dict:
     """Add a node with property values into the nodes dict
+
+    If the node exists, the PVs are added to the existing node.
 
     Args:
       pvs: dictionary of property: values of the new node to be added.
       nodes: dictionary of existing nodes with property:value dict for each node.
       strip_namespaces: if True, strip namespace from the dcid key and values.
-      append_values: if True, append new value for an exsting property, else replace
-        with new value.
+      append_values: if True, append new value for an existing property, else
+        replace with new value.
+
     Returns
       nodes dictionary to which the new node is added.
     """
@@ -315,11 +334,47 @@ def add_mcf_node(
     for prop, value in pvs.items():
         add_pv_to_node(prop, value, node, append_values, strip_namespaces,
                        normalize)
-    logging.level_debug() and logging.debug(
-        f'Added node {dcid} with properties: {pvs.keys()}')
+    logging.level_debug() and logging.log(
+        2, f'Added node {dcid} with properties: {pvs.keys()}')
     return nodes
 
 
+def update_mcf_nodes(
+    nodes: dict,
+    output_nodes: dict,
+    strip_namespaces: bool = False,
+    append_values: bool = True,
+    normalize: bool = True,
+) -> dict:
+    """Returns output_nodes with property:values from nodes added.
+
+    Args:
+      nodes: dictionary of MCF nodes in the form:
+        { '<dcid>': { '<prop>': '<value>', ... }, ... }
+      output_nodes: Nodes to be updated
+      strip_namespaces: if True, strip namespace from the dcid key and values.
+      append_values: if True, append new value for an existing property, else
+        replace with new value.
+      normalize: if True, values are normalized.
+
+    Returns:
+      dictionary of output_nodes updated with property:values from nodes.
+    """
+    index = 0
+    for key, node in nodes.items():
+        # Set the node dcid if not present.
+        dcid = get_node_dcid(node)
+        if not dcid:
+            dcid = key
+        if not key:
+            dcid = str(index)
+        node['Node'] = add_namespace(dcid)
+        index += 1
+        # Add PVs from node to output_nodes
+        add_mcf_node(node, output_nodes, strip_namespaces, append_values,
+                     normalize)
+    return output_nodes
+
+
 def load_mcf_nodes(
     filenames: Union[str, list],
     nodes: dict = None,
     strip_namespaces: bool = False,
     append_values: bool = True,
     normalize: bool = True,
 ) -> dict:
     """Return a dict of nodes from the MCF file with the key as the dcid
+
     and a dict of property:value for each node.
 
     Args:
       filenames: comma separated string or a list of MCF filenames
-      nodes: dictonary to which new nodes are added.
-        If a node with dcid exists, the new properties are added to the existing node.
-      strip_namespace: if True, strips namespace from the value for node properties
-        as well as the dcid key for the nodes dict.
-      append_values: if True, appends new values for existing properties
-        into a comma seperated list, else replaces existing value.
+      nodes: dictionary to which new nodes are added. If a node with dcid exists,
+        the new properties are added to the existing node.
+      strip_namespace: if True, strips namespace from the value for node
+        properties as well as the dcid key for the nodes dict.
+      append_values: if True, appends new values for existing properties into a
+        comma separated list, else replaces existing value.
     Returns:
       dictionary with dcid as the key and values as a dict of property:values
       {
        '<dcid1>': {
          '<prop1>': '<value1>',
          ...
        },
        ...
       }
     """
+    if nodes is None:
+        nodes = {}
     if not filenames:
-        return {}
+        return nodes
     # Load files in order of input
     files = []
     if isinstance(filenames, str):
         files = filenames.split(',')
     else:
         files = filenames
     if nodes is None:
         nodes = _get_new_node(normalize)
     for file in files:
-        if file:
-            num_nodes = 0
-            num_props = 0
+        if not file:
+            continue
+        num_nodes = 0
+        num_props = 0
+        if file.endswith('.csv'):
+            # Load nodes from CSV
+            file_nodes = file_util.file_load_csv_dict(file)
+            for key, pvs in file_nodes.items():
+                if 'Node' not in pvs:
+                    pvs['Node'] = key
+                num_props += len(pvs)
+                add_mcf_node(pvs, nodes, strip_namespaces, append_values,
+                             normalize)
+            num_nodes = len(file_nodes)
+        else:
+            # Load nodes from MCF file.
             with file_util.FileIO(file, 'r', errors='ignore') as input_f:
                 pvs = _get_new_node(normalize)
                 for line in input_f:
                     add_mcf_node(pvs, nodes, strip_namespaces, append_values,
                                  normalize)
                     num_nodes += 1
-        logging.info(
-            f'Loaded {num_nodes} nodes with {num_props} properties from file'
-            f' {file}')
+        logging.info(
+            f'Loaded {num_nodes} nodes with {num_props} properties from file {file}'
+        )
     return nodes
 
 
 def filter_mcf_nodes(
     nodes: dict,
     allow_dcids: list = None,
     allow_nodes_with_pv: list = None,
     ignore_nodes_with_pv: list = None,
 ) -> dict:
     """Filter dictionary of Nodes to a subset of allowed dcids.
 
-    Args:
-      nodes: dictionary of nodes keyed by dcid.
-      allow_dcids: list of dcids to be returned.
-      allow_nodes_with_pv: list of properties
-        nodes with any of the properties in the list are returned.
-      ignore_nodes_with_pv: list of properties to be ignored.
-        nodes with any of the properties in the list are dropped.
-
-    Returns:
-      dictionary with the filtered nodes.
-    """
+    Args:
+      nodes: dictionary of nodes keyed by dcid.
+      allow_dcids: list of dcids to be returned.
+      allow_nodes_with_pv: list of properties; nodes with any of the properties
+        in the list are returned.
+      ignore_nodes_with_pv: list of properties to be ignored; nodes with any of
+        the properties in the list are dropped.
+
+    Returns:
+      dictionary with the filtered nodes.
+    """
     # Normalize ignored PVs.
     ignored_pvs = set()
     ignored_pvs = _pv_list_to_dict(ignore_nodes_with_pv)
     compared_pvs = _pv_list_to_dict(allow_nodes_with_pv)
     for k, v in nodes.items():
         # Drop nodes with dcid not in allowed list.
         if allow_dcids and strip_namespace(k) not in allow_dcids:
-            logging.debug(f'Dropping dcid not in compare_dcid: {k}, {v}')
+            logging.log(2, f'Dropping dcid not in compare_dcid: {k}, {v}')
             continue
         # Drop nodes containing any ignored property value.
         drop_node = False
         for prop, value in v.items():
             if prop and prop[0] != '#':
                 if _is_pv_in_dict(prop, value, ignored_pvs):
-                    logging.debug(
+                    logging.log(
+                        2,
                         f'Dropping dcid with ignored pv {prop}:{value}: {k}, {v}'
                     )
                     drop_node = True
                     break
                 if compared_pvs and not _is_pv_in_dict(prop, value,
                                                        compared_pvs):
-                    logging.debug(
-                        f'Dropping dcid without any compared pv {prop}:{value}: {k}, {v}'
+                    logging.log(
+                        2,
+                        f'Dropping dcid without any compared pv {prop}:{value}: {k}, {v}',
                     )
                     drop_node = True
                     break
 
 
 def get_numeric_value(value: str,
                       separator_chars: str = ' ,$%') -> Union[int, float, None]:
     """Returns the float value from string or None.
 
-    Args:
-      value: string to be converted into a number.
-        It can have comma separted digits with decimal points, for eg: NN,NNN.NNN
-      decimal_char: character used for decimal place seperator, default: '.'
-      seperator_char: seperator characters for 1000s or 100s
-        for example: NNN,NNN,NNN
-
-    Returns:
-      number as a float or int if the value is a number, None otherwise
-    """
+    Args:
+      value: string to be converted into a number. It can have comma separated
+        digits with decimal points, for eg: NN,NNN.NNN
+      decimal_char: character used for decimal place separator, default: '.'
+      separator_char: separator characters for 1000s or 100s, for example:
+        NNN,NNN,NNN
+
+    Returns:
+      number as a float or int if the value is a number, None otherwise
+    """
     if isinstance(value, int) or isinstance(value, float):
         return value
     if value and isinstance(value, str):
 
 
 def get_quoted_value(value: str, is_quoted: bool = None) -> str:
     """Returns a quoted string if there are spaces and special characters.
 
-    Args:
-      value: string value to be quoted if necessary.
-      is_quoted: if True, returns values as quotes strings.
-
-    Returns:
-      value with optional double quotes.
-    """
+    Args:
+      value: string value to be quoted if necessary.
+      is_quoted: if True, returns values as quoted strings.
+
+    Returns:
+      value with optional double quotes.
+    """
     if not value or not isinstance(value, str):
         return value
 
 
 def get_value_list(value: str) -> list:
     """Returns the value as a list.
 
     Args:
       value: string with a single value or comma separated list of values
+
     Returns:
       value as a list.
     """
 
 
 def normalize_list(value: str, sort: bool = True) -> str:
     """Normalize a comma separated list of strings.
 
-    Args:
-      value: string value to be normalized.
-        Can be a comma separated list or a sequence of characters.
-      sort: if True, lists are sorted alphabetically.
-
-    Returns:
-      string that is a normalized version of value with duplicates removed.
-    """
+    Args:
+      value: string value to be normalized. Can be a comma separated list or a
+        sequence of characters.
+      sort: if True, lists are sorted alphabetically.
+
+    Returns:
+      string that is a normalized version of value with duplicates removed.
+    """
     if ',' in value:
         has_quotes = False
         if '"' in value:
         value_list = sorted(value_list)
         for v in value_list:
             if v not in values:
-                normalized_v = normalize_value(v,
-                                               quantity_range_to_dcid=False,
-                                               maybe_list=False,
-                                               is_quoted=has_quotes)
+                normalized_v = normalize_value(
+                    v,
+                    quantity_range_to_dcid=False,
+                    maybe_list=False,
+                    is_quoted=has_quotes,
+                )
                 normalized_v = str(normalized_v)
                 values.append(normalized_v)
         return ','.join(values)
 
 
 def normalize_range(value: str, quantity_range_to_dcid: bool = False) -> str:
     """Normalize a quantity range into [<start> <end> <unit>].
 
-    Args:
-      value: quantity or quantity range as a string.
-      quantity_range_to_dcid: if True, converts quantity range to a dcid
-        [<start> <end> <unit>] is converted to dcid:<unit><start>To<end>
-        if False, the quantity range is returned with unit at the end.
-
-    Retruns:
-      string with quantity range of the form '[<start> <end> <unit>]'
-      or dcid:<unit><start>To<end> if quantity_range_to_dcid is True.
-    """
+    Args:
+      value: quantity or quantity range as a string.
+      quantity_range_to_dcid: if True, converts quantity range to a dcid:
+        [<start> <end> <unit>] is converted to dcid:<unit><start>To<end>;
+        if False, the quantity range is returned with unit at the end.
+
+    Returns:
+      string with quantity range of the form '[<start> <end> <unit>]'
+      or dcid:<unit><start>To<end> if quantity_range_to_dcid is True.
+ """ # Check if value is a quantity range quantity_pat = ( r'\[ *(?P[A-Z][A-Za-z0-9_/]*)? *(?P[0-9\.]+|-)?' @@ -606,7 +682,7 @@ def normalize_range(value: str, quantity_range_to_dcid: bool = False) -> str: if not match_dict: return value - logging.debug(f'Matched range: {match_dict}') + logging.log(2, f'Matched range: {match_dict}') # Value is a quantity range. Get the start, end and unit. start = match_dict.get('start', '') @@ -642,24 +718,28 @@ def normalize_range(value: str, quantity_range_to_dcid: bool = False) -> str: return normalized_range -def normalize_value(value, - quantity_range_to_dcid: bool = False, - maybe_list: bool = True, - is_quoted: bool = False) -> str: +def normalize_value( + value, + quantity_range_to_dcid: bool = False, + maybe_list: bool = True, + is_quoted: bool = False, +) -> str: """Normalize a property value adding a standard namespace prefix 'dcid:'. - Args: - value: string as a value of a property to be normalized. - quantity_range_to_dcid: if True, convert quantity range to a dcid. - maybe_list: if True, values with ',' are converted to a normalized list. + Args: + value: string as a value of a property to be normalized. + quantity_range_to_dcid: if True, convert quantity range to a dcid. + maybe_list: if True, values with ',' are converted to a normalized list. - Returns: - normalized value with namespace 'dcid' for dcid values - sorted list for comma separated values. - """ + Returns: + normalized value with namespace 'dcid' for dcid values + sorted list for comma separated values. + """ if value: if isinstance(value, str): value = value.strip() + if not value: + return '' if value[0] == '"' and value[-1] == '"' and len(value) > 100: # Retain very long strings, such as geoJsonCoordinates, as is. return value @@ -674,6 +754,10 @@ def normalize_value(value, if ' ' in value or ',' in value or is_quoted: return get_quoted_value(value, is_quoted) # Normalize string with a standardized namespace prefix. + if '__' in value: + # For concatenated sequence of dcids, keep them sorted. + values = strip_namespace(value).split('__') + value = '__'.join(sorted(values)) return add_namespace(strip_namespace(value)) elif isinstance(value, float): # Return a fixed precision float string. @@ -691,13 +775,13 @@ def normalize_value(value, def normalize_pv(prop: str, value: str) -> str: """Returns a normalized property:value string. - Args: - prop: property name as a string - value: property value as a string + Args: + prop: property name as a string + value: property value as a string - Returns: - string of the form ':' where value is normalized. - """ + Returns: + string of the form ':' where value is normalized. + """ return ':'.join([prop.strip(), normalize_value(value)]) @@ -707,6 +791,7 @@ def normalize_mcf_node( quantity_range_to_dcid: bool = False, ) -> dict: """Returns a normalized MCF node with all PVs in alphabetical order, + a common namespace of 'dcid' and comma separated lists also sorted. Args: @@ -743,14 +828,14 @@ def normalize_mcf_node( def node_dict_to_text(node: dict, default_pvs: dict = _DEFAULT_NODE_PVS) -> str: """Convert a dictionary node of PVs into text. - Args: - node: dictionary of property: values. - default_pvs: dictionary with default property:values. - These properties are added to the node if not present. + Args: + node: dictionary of property: values. + default_pvs: dictionary with default property:values. These properties are + added to the node if not present. 
-    Returns:
-      node as a text string with a property:value per line
-    """
+
+    Returns:
+      node as a text string with a property:value per line
+    """
     props = list(node.keys())
     pvs = []
     # Add any initial comments
 
 
 def write_mcf_nodes(
     node_dicts: dict,
     filename: str,
     mode: str = 'w',
     default_pvs: dict = _DEFAULT_NODE_PVS,
     header: str = None,
     ignore_comments: bool = True,
     sort: bool = False,
 ):
     """Write the nodes to an MCF file.
 
-    Args:
-      node_dicts: dictionary of nodes keyed by dcid and
-        each node as a dictionary of property:value.
-      filename: output MCF file to be written
-      mode: if 'a', nodes are appended to existing file.
-        else file is overwritten with the nodes.
-      default_pvs: dictionary of default property:value to be
-        added to all nodes.
-      header: string written as a comment at the begining of the file.
-      ignore_comments: if True, drop comments that begin with '#' in the property.
-      sort: if True, nodes in the output file are sorted by dcid.
-        the properties in the node are also sorted.
-    """
+    Args:
+      node_dicts: dictionary of nodes keyed by dcid and each node as a dictionary
+        of property:value.
+      filename: output MCF file to be written
+      mode: if 'a', nodes are appended to existing file, else file is overwritten
+        with the nodes.
+      default_pvs: dictionary of default property:value to be added to all nodes.
+      header: string written as a comment at the beginning of the file.
+      ignore_comments: if True, drop comments that begin with '#' in the property.
+      sort: if True, nodes in the output file are sorted by dcid; the properties
+        in the node are also sorted.
+    """
     if not node_dicts:
         return
     if isinstance(node_dicts, dict):
diff --git a/scripts/statvar/ngram_matcher.py b/scripts/statvar/ngram_matcher.py
new file mode 100644
index 0000000000..4f375aea33
--- /dev/null
+++ b/scripts/statvar/ngram_matcher.py
@@ -0,0 +1,225 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Class to match sub-strings using ngrams.
+
+Example:
+    # Load the matcher with search-key: values
+    matcher = NgramMatcher({'ngram_size': 4})
+    matcher.add_key_value('California', 'dcid:geoId/06')
+    matcher.add_key_value('San Jose California', 'dcid:geoId/0668000')
+    matcher.add_key_value('San Jose Costa Rica', 'dcid:wikidataId/Q647808')
+
+    # Look for matching keys
+    results = matcher.lookup('SanJose')
+    # returns a ranked list of (key, value) tuples:
+    # [('San Jose California', 'dcid:geoId/0668000'),
+    #  ('San Jose Costa Rica', 'dcid:wikidataId/Q647808')]
+
+    # To get top 10 results with match details:
+    results = matcher.lookup('SanJose', 10, True)
+    # Returns a list of tuples with (key, <match-info>):
+    # [(<key>, { 'value': <value>, 'info': {'score': 1.2, 'ngram_matches': 3} }),
+    #  ...]
+"""
+
+import unicodedata
+
+from absl import logging
+
+# Default configuration settings for NgramMatcher
+_DEFAULT_CONFIG = {
+    'ngram_size': 4,
+    'ignore_non_alphanum': True,
+    'min_match_fraction': 0.8,
+}
+
+
+class NgramMatcher:
+
+    def __init__(self, config: dict = {}):
+        self._config = dict(_DEFAULT_CONFIG)
+        if config:
+            self._config.update(config)
+        self._ngram_size = self._config.get('ngram_size', 4)
+        # List of (key, value) tuples.
+        self._key_values = list()
+        # Dictionary of ngram to set of string ids that contain the ngram.
+        # { '<ngram>': { (id1, pos1), (id2, pos2), ...}, ...}
+        self._ngram_dict = {}
+
+    def get_tuples_count(self):
+        return len(self._key_values)
+
+    def get_key_values(self):
+        return dict(self._key_values)
+
+    def add_keys_values(self, kvs: dict[str, any]) -> None:
+        for key, value in kvs.items():
+            self.add_key_value(key, value)
+
+    def add_key_value(self, key: str, value):
+        """Add a key and value.
+
+        When the key matches a lookup string, the key and corresponding value
+        is returned.
+
+        Args:
+            key: string to be looked up
+            value: value to be returned on key match.
+        """
+        self._key_values.append((key, value))
+        key_index = len(self._key_values) - 1
+        self._add_key_index(key, key_index)
+
+    def get_ngrams_count(self) -> int:
+        """Returns the number of ngrams in the index."""
+        return len(self._ngram_dict)
+
+    def lookup(
+        self,
+        key: str,
+        num_results: int = None,
+        return_score: bool = False,
+        config: dict = None,
+    ) -> list:
+        """Lookup a key string.
+
+        Returns an ordered list of (key, value) tuples matching the key.
+        """
+        normalized_key = self._normalize_string(key)
+        ngrams = self._get_ngrams(normalized_key)
+        logging.level_debug() and logging.log(
+            2, f'looking up ngrams {ngrams} for {key}')
+        lookup_config = self._config
+        if config:
+            # Use the match config passed in.
+            lookup_config = dict(self._config)
+            lookup_config.update(config)
+        # Get the matching key indices for all ngrams.
+        matches = dict()
+        for ngram in ngrams:
+            ngram_matches = self._ngram_dict.get(ngram, {})
+            if ngram_matches:
+                # Use IDF score for each ngram
+                ngram_score = 1 / len(ngram_matches)
+                for key_index, ngram_pos in ngram_matches:
+                    # Collect matches and update score for each ngram
+                    if key_index not in matches:
+                        matches[key_index] = {
+                            'score': ngram_score,
+                            'ngram_matches': 1,
+                            'ngram_pos': ngram_pos,
+                        }
+                    else:
+                        key_match = matches[key_index]
+                        key_match['score'] = key_match['score'] + ngram_score
+                        key_match['ngram_matches'] = key_match['ngram_matches'] + 1
+                        key_match['ngram_pos'] = min(key_match['ngram_pos'],
+                                                     ngram_pos)
+
+        logging.level_debug() and logging.log(2, f'Matches for {key}: {matches}')
+        # Collect all key indices that matched, with counts.
+        match_indices = list()
+        min_matches = max(
+            1,
+            len(ngrams) * lookup_config.get('min_match_fraction', 0.8))
+        for key_index, result in matches.items():
+            if result['ngram_matches'] >= min_matches:
+                match_indices.append((key_index, result))
+
+        # Order key_index by decreasing number of matches.
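+        # Candidates are ranked by _get_ngram_match_score() below: a match
+        # closer to the start of the key dominates the score, followed by the
+        # number of matching ngrams, then the accumulated IDF score.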
+        key_len = len(normalized_key)
+        match_indices.sort(
+            key=lambda x: self._get_ngram_match_score(x[1], key_len),
+            reverse=True)
+        logging.level_debug() and logging.log(
+            2, f'Sorted matches for {key}: {match_indices}')
+
+        # Collect results in sorted order
+        results = list()
+        for match in match_indices:
+            result_key, result_value = self._key_values[match[0]]
+            if return_score:
+                results.append((result_key, {
+                    'value': result_value,
+                    'info': match[1]
+                }))
+            else:
+                results.append((result_key, result_value))
+            if num_results and len(results) >= num_results:
+                # There are enough results. Return these.
+                break
+        return results
+
+    def _get_ngrams(self, key: str) -> list:
+        """Returns a list of ngrams for the key."""
+        normalized_key = self._normalize_string(key)
+        ngrams = normalized_key.split(' ')
+        max_index = max(len(normalized_key) - self._ngram_size, 0) + 1
+        for pos in range(max_index):
+            ngram = normalized_key[pos:pos + self._ngram_size]
+            if ngram not in ngrams:
+                ngrams.append(ngram)
+        return ngrams
+
+    def _add_key_index(self, key: str, key_index: int):
+        """Adds the key into the ngrams index."""
+        # Remove extra characters and convert to lower case.
+        normalized_key = self._normalize_string(key)
+        # index by all unique ngrams in the key
+        ngrams = self._get_ngrams(normalized_key)
+        for ngram in ngrams:
+            if ngram not in self._ngram_dict:
+                self._ngram_dict[ngram] = set()
+            ngram_pos = normalized_key.find(ngram)
+            self._ngram_dict[ngram].add((key_index, ngram_pos))
+            logging.level_debug() and logging.log(
+                3, f'Added ngram "{ngram}" for {key}:{key_index}')
+
+    def _normalize_string(self, key: str) -> str:
+        """Returns a normalized string removing special characters."""
+        return normalized_string(key,
                                 self._config.get('ignore_non_alphanum', True))
+
+    def _get_ngram_match_score(self, match: dict, key_len: int) -> float:
+        """Returns a score for the ngram match components."""
+        # IDF score
+        score = match['score']
+        # Boost for match at the beginning of the key.
+        score += (key_len - match['ngram_pos']) * 10000
+        # DF score
+        score += match['ngram_matches'] * 100
+        return score
+
+
+def normalized_string(key: str, ignore_non_alnum: bool = True) -> str:
+    """Returns a normalized string for match.
+
+    Args:
+        key: string to be normalized.
+        ignore_non_alnum: if True, non alpha numeric characters are removed.
+
+    Returns:
+        normalized string
+    """
+    normalized_key = unicodedata.normalize('NFKD', key)
+    normalized_key = normalized_key.lower()
+    # Remove extra spaces
+    normalized_key = ' '.join([w for w in normalized_key.split(' ') if w])
+    # Remove extra punctuation.
+    if ignore_non_alnum:
+        normalized_key = ''.join(
+            [c for c in normalized_key if c.isalnum() or c == ' '])
+    return normalized_key
diff --git a/scripts/statvar/ngram_matcher_test.py b/scripts/statvar/ngram_matcher_test.py
new file mode 100644
index 0000000000..557624a4c9
--- /dev/null
+++ b/scripts/statvar/ngram_matcher_test.py
@@ -0,0 +1,42 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Unit tests for NgramMatcher.""" + +import unittest + +from absl import app +from absl import logging +import ngram_matcher + + +class NgramMatcherTest(unittest.TestCase): + + def setUp(self): + # logging.set_verbosity(2) + return + + def test_lookup_string(self): + matcher = ngram_matcher.NgramMatcher(config={'ngram_size': 4}) + matcher.add_key_value('Test Key 1', 1) + matcher.add_key_value('TESTKey Two', 'two') + matches = matcher.lookup('Test') + self.assertEqual([('TESTKey Two', 'two'), ('Test Key 1', 1)], matches) + self.assertTrue( + matcher.lookup('Tester', config={'min_match_fraction': 0.1})) + self.assertFalse(matcher.lookup('ABCDEF')) + + +if __name__ == '__main__': + app.run() + unittest.main() diff --git a/scripts/statvar/property_value_cache.py b/scripts/statvar/property_value_cache.py new file mode 100644 index 0000000000..8c1d9592f6 --- /dev/null +++ b/scripts/statvar/property_value_cache.py @@ -0,0 +1,447 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Class to store set of property:value for multiple keys. + +The values are stored as a dict with any selected property such as dcid as the +key. The cache is persisted in a file. +""" + +import csv +import os +import sys +import unicodedata + +from absl import app +from absl import flags +from absl import logging + +_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(_SCRIPT_DIR) +sys.path.append(os.path.dirname(_SCRIPT_DIR)) +sys.path.append( + os.path.join(os.path.dirname(os.path.dirname(_SCRIPT_DIR)), 'util')) + +import file_util +from mcf_file_util import add_pv_to_node +from counters import Counters + +# Indexed properties in order for lookup. +_DEFAULT_KEY_PROPS = [ + 'key', + 'dcid', + 'placeId', + 'wikidataId', + 'name', + 'place_name', +] + + +class PropertyValueCache: + """Class to store property:values for a key. + + It allows lookup for an entry by any of the values for a set of key + properties. The entries are loaded from a file and persisted in the file after + updates. + + Example usage: + pv_cache = PropertyValueCache('/tmp/pv-cache.csv', + key_props=['name', 'dcid', 'isoCode'], + normalize_key=True) + + # Add an entry to cache + pv_cache.add( { + 'dcid': 'country/IND', + 'typeOf': 'Country', + 'name': 'India', + 'isoCode': 'IND' + }) + + # Lookup above entry by any value of a property + india_entry = pv_cache.get_entry(prop='isoCode', value='IND') + # Lookup by value of any key property + india_entry = pv_cache.get_entry('india') + """ + + def __init__( + self, + filename: str = '', + key_props: list = _DEFAULT_KEY_PROPS, + props: list = [], + normalize_key: bool = True, + counters: Counters = None, + ): + """Initialize the PropertyValueCache. + + Args: + filename: CSV file with one row per cache entry with + properties as columns. + The entries in the file are loaded on init and + saved periodically and on exit. + key_props: list of properties that can be used for lookup. 
+              The values of these properties are assumed to be unique and
+              values are stored in an index per property for lookup by value.
+            props: List of properties across entries.
+            normalize_key: if True, values are normalized (lower case)
+              before lookup in the per-property index.
+            counters: Counters object for cache hits and misses.
+        """
+        self._filename = filename
+        self._normalize_key = normalize_key
+
+        # List of properties that can be used as keys.
+        # The values for the key properties are assumed to be unique across
+        # entries.
+        self._key_props = []
+
+        # Index per key_property.
+        # Mapping from a key property to an entry in the _entries list:
+        # { '<prop1>': { '<value1>': { <entry1> },
+        #                '<value2>': { <entry2> },
+        #                ...},
+        #   '<prop2>': { '<value>': { <entry> }, ... }
+        # }
+        self._prop_index = {}
+
+        if not self._key_props:
+            self._key_props = []
+        self._counters = counters
+        if counters is None:
+            self._counters = Counters()
+
+        # List of cache entries, each with a dict of property:values.
+        # The property indexes have references to the entry.
+        self._entries = {}
+
+        # List of properties across all entries
+        self._props = []
+        self._add_props(key_props=key_props, props=props)
+
+        # Load entries from file.
+        self.load_cache_file(filename)
+        # Flag to indicate cache has been updated and has changed from file.
+        self._is_modified = False
+
+    def __del__(self):
+        self.save_cache_file()
+
+    def load_cache_file(self, filename: str):
+        """Load entries of property:value dicts from files.
+
+        Args:
+            filename: CSV file(s) from which property:values are loaded
+              with one row per entry.
+        """
+        for file in file_util.file_get_matching(filename):
+            with file_util.FileIO(file) as csv_file:
+                csv_reader = csv.DictReader(csv_file)
+                # Add columns as properties in order of input.
+                self._add_props(props=csv_reader.fieldnames)
+
+                # Add an entry for each row in the file.
+                num_rows = 0
+                for row in csv_reader:
+                    num_rows += 1
+                    self.add(row)
+                logging.info(
+                    f'Loaded {num_rows} rows with columns: {self._props} from'
+                    f' {file} into cache')
+
+    def get_entry(self, value: str, prop: str = '') -> dict:
+        """Returns a dict entry that contains the prop:value.
+
+        Args:
+            value: value to be looked up in the property index.
+              If normalize_key was set in init(),
+              value is converted to lower case string.
+            prop: One of the key-properties in which the value is looked up.
+              If not set, value is looked up in all key properties in order.
+
+        Returns:
+            dict entry that contains the prop:value if it exists.
+        """
+        if isinstance(value, list):
+            logging.error(f'Cannot lookup {value} for {prop}')
+            return {}
+        key = self.get_lookup_key(prop=prop, value=value)
+        if not prop or prop not in self._key_props:
+            # Property is not a key property.
+            # Lookup value in map for all key properties.
+            for prop in self._key_props:
+                entry = self._get_prop_key_entry(prop, key)
+                if entry:
+                    return entry
+        return self._get_prop_key_entry(prop, key)
+
+    def get_entry_for_dict(self, pvs: dict) -> dict:
+        """Return the entry for the pvs in the dict.
+
+        Args:
+            pvs: dictionary with partial set of property:values.
+              The value of any of the key properties is used to lookup.
+
+        Returns:
+            dict of cache entry that matches the first prop:value in pvs.
+        """
+        for prop in self._key_props:
+            value = pvs.get(prop, None)
+            if value is not None:
+                cached_entry = self.get_entry(prop=prop, value=value)
+                if cached_entry:
+                    return cached_entry
+        return {}
+
+    def add(self, entry: dict) -> dict:
+        """Add a dict of property:values into the cache.
+
+        If the entry already exists for an existing key,
+        the entry is merged with the new values.
+
+        Args:
+            entry: dict of property:values.
+              The entry is cached and is also indexed by each value
+              of every key-property.
+
+        Returns:
+            dict that was added or merged into.
+        """
+        # Add any new properties
+        self._add_props(props=entry.keys())
+
+        # Check if an entry exists, matching any of the key prop:value.
+        cached_entry = self.get_entry_for_dict(entry)
+        if cached_entry:
+            # Merge new PVs into the existing entry
+            self.update_entry(entry, cached_entry)
+            entry = cached_entry
+        else:
+            # Add a new entry
+            cached_entry = dict(entry)
+            self._entries[len(self._entries)] = cached_entry
+            self._counters.add_counter('pv-cache-entries', 1)
+
+        # Add entry to the lookup index for all key properties.
+        for prop in self._key_props:
+            values = entry.get(prop, None)
+            if values is not None:
+                # Add the entry to the lookup index with each of the values
+                # for the key property.
+                if not isinstance(values, list):
+                    values = [values]
+                for value in values:
+                    self._add_prop_key_entry(prop, value, cached_entry)
+        self._is_modified = True
+        logging.level_debug() and logging.log(
+            2, f'Added cache entry {cached_entry}')
+        return cached_entry
+
+    def update_entry(self, src: dict, dst: dict):
+        """Add PVs from src to dst.
+
+        If a property exists with a value, add new values to a list.
+        """
+        # for prop, values in src.items():
+        #     add_pv_to_node(prop,
+        #                    values,
+        #                    dst,
+        #                    append_value=True,
+        #                    normalize=self._normalize_key)
+        # return dst
+        for prop, values in src.items():
+            # Add new values to list of existing values.
+            dst_value = dst.get(prop, None)
+            if dst_value:
+                value_added = False
+                dst_value = _get_value_list(dst_value)
+                values = _get_value_list(values)
+                for value in values:
+                    if value not in dst_value:
+                        dst_value.append(value)
+                        value_added = True
+                if value_added:
+                    # New values were added.
+                    dst[prop] = dst_value
+            else:
+                # Add the new prop:value to dst dict
+                dst[prop] = values
+        self._is_modified = True
+        logging.level_debug() and logging.debug(f'Merged {src} into {dst}')
+        return dst
+
+    def save_cache_file(self):
+        """Save the cache entries into the CSV file.
+
+        The file is only written if the cache has been modified
+        by adding a new entry since the last write.
+        """
+        if not self.is_dirty():
+            # No change in cache. Skip writing to file.
+            return
+        # Get the cache filename.
+        # Save cache to the last file loaded in case of multiple files.
+        filename = file_util.file_get_matching(self._filename)
+        if filename:
+            filename = filename[-1]
+        else:
+            filename = self._filename
+
+        if not filename:
+            return
+
+        logging.info(f'Writing {len(self._entries)} cache entries with columns'
+                     f' {self._props} into file {filename}')
+        logging.debug(f'Writing cache entries: {self._entries}')
+        with file_util.FileIO(filename, mode='w') as cache_file:
+            csv_writer = csv.DictWriter(
+                cache_file,
+                fieldnames=self._props,
+                escapechar='\\',
+                quotechar='"',
+                quoting=csv.QUOTE_NONNUMERIC,
+                extrasaction='ignore',
+            )
+            csv_writer.writeheader()
+            for entry in self._entries.values():
+                # Flatten key properties with multiple values to
+                # rows with one value per property.
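+                # For example, an entry such as
+                #   {'dcid': 'geoId/06', 'name': ['California', 'CA']}
+                # is written as two rows, one per 'name' value; list values of
+                # non-key properties are joined into a comma-separated string.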
+                for pvs in flatten_dict(entry, self._key_props):
+                    logging.debug(f'Saving cache entry: {pvs}')
+                    csv_writer.writerow(pvs)
+        self._is_modified = False
+
+    def is_dirty(self):
+        """Returns True if the cache has been modified since the last write."""
+        return self._is_modified
+
+    def normalize_string(self, key: str) -> str:
+        """Returns a normalized string for lookup.
+
+        The key has special characters removed and is converted to lower case.
+
+        Args:
+            key: string to be normalized for lookup.
+
+        Returns:
+            normalized key
+        """
+        if not isinstance(key, str):
+            key = str(key)
+        normalized_key = unicodedata.normalize('NFKD', key)
+        normalized_key = normalized_key.lower()
+        # Remove extra spaces
+        normalized_key = ' '.join([w for w in normalized_key.split(' ') if w])
+        # Remove extra punctuation.
+        normalized_key = ''.join(
+            [c for c in normalized_key if c.isalnum() or c == ' '])
+        return normalized_key
+
+    def get_lookup_key(self, value: str, prop: str = '') -> str:
+        """Returns key for lookup, normalizing if needed.
+
+        Args:
+            value: string value to be looked up in the index.
+              The value is normalized if needed.
+            prop: (optional) property for the value.
+
+        Returns:
+            string to be looked up in the property index,
+            which is the value normalized if needed.
+        """
+        if isinstance(value, list):
+            value = value[0]
+        if self._normalize_key:
+            return self.normalize_string(value)
+        return value
+
+    def _add_props(self, key_props: list = [], props: list = []):
+        # Add any new key property.
+        if key_props:
+            for prop in key_props:
+                if prop not in self._key_props:
+                    self._key_props.append(prop)
+                    self._prop_index[prop] = dict()
+                if prop not in self._props:
+                    self._props.append(prop)
+
+        # Add remaining properties across entries.
+        if props:
+            for prop in props:
+                if prop not in self._props:
+                    self._props.append(prop)
+        if not self._key_props and self._props:
+            # No key properties set. Use the first property as key.
+            self._key_props.append(self._props[0])
+            self._prop_index[self._props[0]] = dict()
+
+    def _add_prop_key_entry(self, prop: str, value: str, entry: dict) -> bool:
+        """Adds the entry to the lookup map for property with the key."""
+        if not value:
+            return False
+        key = self.get_lookup_key(prop=prop, value=value)
+        prop_index = self._prop_index.get(prop)
+        if prop_index is None:
+            logging.error(f'Invalid key prop {prop}:{key} for {entry}')
+            return False
+        existing_entry = prop_index.get(key)
+        if existing_entry and existing_entry.get(prop) != value:
+            logging.error(
+                f'Conflicting {prop}:{key} old:{existing_entry} new:{entry}')
+        prop_index[key] = entry
+        return True
+
+    def _get_prop_key_entry(self, prop: str, key: str) -> dict:
+        """Returns the entry for the key in the lookup map for prop."""
+        entry = self._prop_index.get(prop, {}).get(key, {})
+        if entry:
+            self._counters.add_counter(f'pv-cache-hits-{prop}', 1)
+        else:
+            self._counters.add_counter(f'pv-cache-misses-{prop}', 1)
+        return entry
+
+
+def flatten_dict(pvs: dict, props: list) -> list:
+    """Returns a list of dicts, flattening out props with multiple values."""
+    # Get a dictionary with the prop:values not to be flattened.
+    base_pvs = {}
+    for prop, value in pvs.items():
+        if prop not in props:
+            if isinstance(value, list) or isinstance(value, set):
+                base_pvs[prop] = ','.join([str(v) for v in value])
+            else:
+                base_pvs[prop] = value
+
+    # List of dicts with expanded prop:values
+    pvs_list = [base_pvs]
+    for prop in props:
+        values = pvs.get(prop, '')
+        if not values:
+            continue
+        if not isinstance(values, list) and not isinstance(values, set):
+            values = [values]
+        list_with_prop = []
+        for value in values:
+            for item in pvs_list:
+                pvs_with_prop = {prop: value}
+                pvs_with_prop.update(item)
+                list_with_prop.append(pvs_with_prop)
+        pvs_list = list_with_prop
+    return pvs_list
+
+
+def _get_value_list(values: str) -> list:
+    """Returns a list of unique values from a comma separated string."""
+    if not values:
+        return []
+    values_list = []
+    if isinstance(values, str):
+        values = values.split(',')
+    if not isinstance(values, list) and not isinstance(values, set):
+        values = [values]
+    for value in values:
+        if value not in values_list:
+            values_list.append(value)
+    return values_list
diff --git a/scripts/statvar/property_value_cache_test.py b/scripts/statvar/property_value_cache_test.py
new file mode 100644
index 0000000000..c2e54f824e
--- /dev/null
+++ b/scripts/statvar/property_value_cache_test.py
@@ -0,0 +1,127 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Unit tests for property_value_cache.py.""" + +import unittest + +from absl import app +from absl import logging +from property_value_cache import PropertyValueCache, flatten_dict + + +class PropertyValueCacheTest(unittest.TestCase): + + def test_add_entry(self): + pv_cache = PropertyValueCache() + + # Add an entry with name and dcid + pv_cache.add({'name': 'California', 'dcid': 'geoId/06'}) + pv_cache.add({'name': 'India', 'dcid': 'country/IND'}) + + # Add entry with additional properties + pv_cache.add({'dcid': 'geoId/06', 'typeOf': 'AdministrativeArea1'}) + pv_cache.add({'dcid': 'geoId/06', 'typeOf': 'State', 'name': 'CA'}) + pv_cache.add({ + 'dcid': 'country/IND', + 'placeId': 'ChIJkbeSa_BfYzARphNChaFPjNc' + }) + + expected_entry1 = { + 'name': ['California', 'CA'], + 'dcid': 'geoId/06', + 'typeOf': ['AdministrativeArea1', 'State'], + } + self.assertEqual(expected_entry1, + pv_cache.get_entry(prop='name', value='California')) + self.assertEqual(expected_entry1, + pv_cache.get_entry('geoId/06', 'dcid')) + + expected_entry2 = { + 'name': 'India', + 'dcid': 'country/IND', + 'placeId': 'ChIJkbeSa_BfYzARphNChaFPjNc', + } + self.assertEqual(expected_entry2, pv_cache.get_entry('India', 'name')) + self.assertEqual(expected_entry2, + pv_cache.get_entry('country/IND', 'dcid')) + self.assertEqual(expected_entry2, pv_cache.get_entry('India')) + + # Lookup by dict with placeId + # Match of one property, placeId is sufficient. + self.assertEqual( + expected_entry2, + pv_cache.get_entry_for_dict({ + # Matching key + 'placeId': 'ChIJkbeSa_BfYzARphNChaFPjNc', + # Key not matching + 'name': 'IND', + }), + ) + self.assertFalse({}, pv_cache.get_entry_for_dict({'name': 'IND'})) + + def test_flatten_dict(self): + pvs = { + 'name': ['California', 'CA'], + 'dcid': 'geoId/06', + 'typeOf': ['AdministrativeArea1', 'State'], + } + flattened_pvs = flatten_dict(pvs, ['name']) + self.assertEqual( + [ + { + 'name': 'California', + 'dcid': 'geoId/06', + 'typeOf': 'AdministrativeArea1,State', + }, + { + 'name': 'CA', + 'dcid': 'geoId/06', + 'typeOf': 'AdministrativeArea1,State', + }, + ], + flattened_pvs, + ) + # expected pvs have lists joined with ',' + merged_pvs = {} + for p, v in pvs.items(): + if isinstance(v, list): + v = ','.join(v) + merged_pvs[p] = v + self.assertEqual([merged_pvs], flatten_dict(pvs, ['dcid'])) + name_type_pvs = flatten_dict(pvs, ['name', 'typeOf']) + self.assertEqual( + [ + { + 'name': 'California', + 'dcid': 'geoId/06', + 'typeOf': 'AdministrativeArea1', + }, + { + 'name': 'CA', + 'dcid': 'geoId/06', + 'typeOf': 'AdministrativeArea1' + }, + { + 'name': 'California', + 'dcid': 'geoId/06', + 'typeOf': 'State' + }, + { + 'name': 'CA', + 'dcid': 'geoId/06', + 'typeOf': 'State' + }, + ], + name_type_pvs, + ) diff --git a/scripts/statvar/property_value_mapper.py b/scripts/statvar/property_value_mapper.py new file mode 100644 index 0000000000..b58356fcf5 --- /dev/null +++ b/scripts/statvar/property_value_mapper.py @@ -0,0 +1,627 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Utility class to store property:value mappings for data strings."""
+
+import csv
+import os
+import re
+import sys
+
+from absl import app
+from absl import flags
+from absl import logging
+from collections import OrderedDict
+
+_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(_SCRIPT_DIR)
+sys.path.append(os.path.dirname(_SCRIPT_DIR))
+sys.path.append(os.path.dirname(os.path.dirname(_SCRIPT_DIR)))
+sys.path.append(
+    os.path.join(os.path.dirname(os.path.dirname(_SCRIPT_DIR)), 'util'))
+
+import config_flags
+import eval_functions
+import file_util
+
+import property_value_utils as pv_utils
+
+from config_map import ConfigMap, read_py_dict_from_file
+from counters import Counters, CounterOptions
+
+
+class PropertyValueMapper:
+    """Class to map strings to a set of property values.
+
+    Supports multiple maps with a namespace or context string. Stores string
+    to property:value maps as a dictionary:
+    _pv_map = {
+        'GLOBAL': {
+            '<key1>': {
+                '<prop1>': '<value1>',
+                '<prop2>': '<value2>',
+                ...
+            },
+            ...
+        },
+        '<namespace>': {
+            '<key2>': {
+                '<prop>': '<value>',
+                ...
+            },
+            ...
+        },
+    }
+
+    The first level keys in _pv_map are namespaces that are column-headers or
+    'GLOBAL'.
+    When looking up PVs for an input string, such as a column header or a cell
+    value, first the namespace column-header is tried.
+    If there are no values then other namespaces such as 'GLOBAL' are tried.
+
+    <value>s within the PVs can have a reference to another property.
+    Such references are replaced with that property's value after
+    all PVs for a data cell have been collected.
+
+    The references are indicated with the syntax '{Variable}' or '@Variable',
+    where 'Variable' is expected to be another property in the cell's PVs.
+
+    Internal properties that require special processing begin with '#', such as:
+    '#Regex': refers to a regular expression with named match groups
+        to be applied on a cell value
+    '#Format': a format string to be processed with other parameters
+    '#Eval': a python statement to be evaluated. It could have some computations
+        of the form '<variable>=<expr>' where the '<expr>' is evaluated and
+        assigned to the property or to 'Data'.
+
+    The cell value is mapped to the following default properties:
+    'Data': the string value in the cell
+    'Number': the numeric value if the cell is a number.
+    """
+
+    def __init__(
+        self,
+        pv_map_files: list = [],
+        config_dict: dict = None,
+        counters_dict: dict = None,
+    ):
+        self._config = ConfigMap(config_dict=config_dict)
+        self._counters = Counters(
+            counters_dict=counters_dict,
+            options=CounterOptions(debug=self._config.get('debug', False)),
+        )
+        # Map from a namespace to dictionary of string -> { p:v }
+        self._pv_map = OrderedDict({'GLOBAL': {}})
+        self._num_pv_map_keys = 0
+        self._max_words_in_keys = 0
+        for filename in pv_map_files:
+            namespace = 'GLOBAL'
+            if not file_util.file_get_matching(filename):
+                if ':' in filename:
+                    namespace, filename = filename.split(':', 1)
+            self.load_pvs_from_file(filename, namespace)
+        logging.level_debug() and logging.debug(
+            f'Loaded PV map {self._pv_map} with max words {self._max_words_in_keys}'
+        )
+
+    def load_pvs_from_file(self, filename: str, namespace: str = 'GLOBAL'):
+        """Loads a map of the form 'string -> { P: V }' from a file.
+
+        File is a python dictionary or a JSON file with python equivalents such
+        as True(true), False(false), None(null).
+
+        Args:
+            filename: file containing the dictionary of string to dictionary
+              of PVs.
+            namespace: the namespace key for the dictionary to be loaded
+              against. The namespace is the first level key in the _pv_map.
+        """
+        # Append new PVs to existing map.
+        pv_map_input = {}
+        if file_util.file_is_csv(filename):
+            # Load rows into a dict of prop,value.
+            # If the first col is a config key, the next column is its value.
+            logging.info(
+                f'Loading PV maps for {namespace} from csv file: {filename}')
+            with file_util.FileIO(filename) as csvfile:
+                csv_reader = csv.reader(csvfile,
+                                        skipinitialspace=True,
+                                        escapechar='\\')
+                for row in csv_reader:
+                    # Drop trailing empty columns in the row
+                    last_col = len(row) - 1
+                    while last_col >= 0 and row[last_col].strip() == '':
+                        last_col -= 1
+                    row = row[:last_col + 1]
+                    if not row:
+                        continue
+                    key = row[0].strip()
+                    if key in self._config.get_configs():
+                        # Add value to the config with same type as original.
+                        value = ','.join(row[1:])
+                        config_flags.set_config_value(key, value, self._config)
+                    else:
+                        # Row is a pv map entry.
+                        pvs_list = row[1:]
+                        if len(pvs_list) == 1:
+                            # PVs list has no property, just a value.
+                            # Use the namespace as the property.
+                            pvs_list = [namespace]
+                            pvs_list.append(row[1])
+                        if len(pvs_list) % 2 != 0:
+                            raise RuntimeError(
+                                f'Invalid list of property value: {row} in {filename}'
+                            )
+                        # Get property,values from the columns
+                        pvs = {}
+                        for i in range(0, len(pvs_list), 2):
+                            prop = pvs_list[i].strip()
+                            if not prop:
+                                continue
+                            value = pvs_list[i + 1].strip()
+                            if value == '""':
+                                value = ''
+                            # Remove extra quotes around schema values.
+                            # if value and value[0] == '"' and value[-1] == '"':
+                            #     value = value[1:-1].strip()
+                            if value and value[0] != '[' and prop[0] != '#':
+                                # Add quotes around text strings
+                                # with spaces without commas.
+                                # if re.search('[^,] +', value):
+                                #     value = f'"{value}"'
+                                if value[0] == "'" and value[-1] == "'":
+                                    # Replace single quotes with double quotes.
+                                    # To distinguish quote as delimiter vs value
+                                    # in CSVs, single quote is used instead of
+                                    # double quote in CSV values.
+                                    value = '"' + value[1:-1] + '"'
+                            # pvs[prop] = value
+                            normalize = True
+                            if '#' in prop or '=' in value:
+                                # Value is a formula. Keep the value as a string.
+                                normalize = False
+                            pv_utils.add_key_value(
+                                prop,
+                                value,
+                                pvs,
+                                self._config.get('multi_value_properties', {}),
+                                normalize=normalize)
+                        pv_map_input[key] = pvs
+        else:
+            logging.info(
+                f'Loading PV maps for {namespace} from dictionary file: {filename}'
+            )
+            pv_map_input = read_py_dict_from_file(filename)
+        self.load_pvs_dict(pv_map_input, namespace)
+
+    def load_pvs_dict(self, pv_map_input: dict, namespace: str = 'GLOBAL'):
+        if namespace not in self._pv_map:
+            self._pv_map[namespace] = {}
+        pv_map = self._pv_map[namespace]
+        word_delimiter = self._config.get('word_delimiter', ' ')
+        num_keys_added = 0
+        for key, pvs_input in pv_map_input.items():
+            if key not in pv_map:
+                pv_map[key] = {}
+            pvs_dict = pv_map[key]
+            if isinstance(pvs_input, str):
+                pvs_input = {namespace: pvs_input}
+            for p, v in pvs_input.items():
+                # A property has multiple values from different configs.
+                # Concatenate new value to existing one with '__'.
+                # if v not in pvs_dict[p]:
+                #     pvs_dict[p] = '__'.join(sorted([pvs_dict[p], v]))
+                #     logging.info(f'Joining values for {key}[{p}] into {pvs_dict[p]}')
+                # else:
+                #     pv_utils.add_key_value(
+                #         p,
+                #         v,
+                #         pvs_dict,
+                #         self._config.get('multi_value_properties', {}),
+                #     )
+                num_keys_added += 1
+                pv_utils.add_key_value(
+                    p,
+                    v,
+                    pvs_dict,
+                    self._config.get('multi_value_properties', {}),
+                )
+            # Track the max number of words in any of the keys.
+            # This is used when splitting input-string for lookups.
+            num_words_key = len(pv_utils.get_words(key, word_delimiter))
+            self._max_words_in_keys = max(self._max_words_in_keys,
+                                          num_words_key)
+            logging.level_debug() and logging.log(
+                2, f'Setting PVMap[{key}] = {pvs_dict}')
+
+        self._num_pv_map_keys += num_keys_added
+        logging.info(
+            f'Loaded {num_keys_added} property-value mappings for "{namespace}"'
+        )
+        logging.level_debug() and logging.debug(
+            f'Loaded pv map {namespace}:{pv_map_input}')
+
+    def get_pv_map(self) -> dict:
+        """Returns the dictionary mapping input-strings to property:values."""
+        return self._pv_map
+
+    def process_pvs_for_data(self, key: str, pvs: dict) -> bool:
+        """Returns true if property:values are processed successfully.
+
+        Processes values for actionable props such as '#Regex', '#Eval',
+        '#Format'.
+
+        Args:
+            pvs: (input/output) dictionary of property:values. Properties such
+              as '#Regex', '#Eval', '#Format' are processed and the resulting
+              properties are updated into pvs.
+
+        Returns:
+            True if any property:values were processed and the pvs dict was
+            updated.
+        """
+        logging.level_debug() and logging.log(
+            2, f'Processing data PVs:{key}:{pvs}')
+        data_key = self._config.get('data_key', 'Data')
+        data = pvs.get(data_key, key)
+        is_modified = False
+
+        # Process regular expression and add named group matches to the PVs.
+        # A regex PV is of the form:
+        #   '#Regex': '(?P<Start>[0-9]+) *- *(?P<End>[0-9]+)'
+        # It parses 'Data': '10 - 20' to generate the PVs:
+        #   { 'Start': '10', 'End': '20' }
+        regex_key = self._config.get('regex_key', '#Regex')
+        if regex_key in pvs and data:
+            re_pattern = pvs[regex_key]
+            re_matches = re.finditer(re_pattern, data)
+            regex_pvs = {}
+            for match in re_matches:
+                regex_pvs.update(match.groupdict())
+            logging.level_debug() and logging.log(
+                2,
+                f'Processed regex: {re_pattern} on {key}:{data} to get {regex_pvs}'
+            )
+            if regex_pvs:
+                self._counters.add_counter('processed-regex', 1, re_pattern)
+                pv_utils.pvs_update(
+                    regex_pvs, pvs,
+                    self._config.get('multi_value_properties', {}))
+            pvs.pop(regex_key)
+            is_modified = True
+
+        # Format the data substituting properties with values.
+        format_key = self._config.get('format_key', '#Format')
+        if format_key in pvs:
+            format_str = pvs[format_key]
+            (format_prop, strf) = _get_variable_expr(format_str, data_key)
+            try:
+                format_data = strf.format(**pvs)
+                logging.level_debug() and logging.log(
+                    2,
+                    f'Processed format {format_prop}={strf} on {key}:{data} to get'
+                    f' {format_data}')
+            except (KeyError, ValueError) as e:
+                format_data = format_str
+                self._counters.add_counter('error-process-format', 1,
+                                           format_str)
+                logging.level_debug() and logging.log(
+                    2,
+                    f'Failed to format {format_prop}={strf} on {key}:{data} with'
+                    f' {pvs}, {e}')
+            if format_prop != data_key and format_data != format_str:
+                pvs[format_prop] = format_data
+                self._counters.add_counter('processed-format', 1, format_str)
+            pvs.pop(format_key)
+            is_modified = True
+
+        # Evaluate the expression properties as local variables.
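+        # For example (illustrative), '#Eval': 'Number=Number * 1000'
+        # evaluates the expression with the current PVs as variables and
+        # stores the result back under the property 'Number'.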
+        eval_key = self._config.get('eval_key', '#Eval')
+        if eval_key in pvs:
+            eval_str = pvs[eval_key]
+            eval_prop, eval_data = eval_functions.evaluate_statement(
+                eval_str,
+                pvs,
+                self._config.get('eval_globals', eval_functions.EVAL_GLOBALS),
+            )
+            logging.level_debug() and logging.log(
+                2,
+                f'Processed eval {eval_str} with {pvs} to get {eval_prop}:{eval_data}'
+            )
+            if not eval_prop:
+                eval_prop = data_key
+            if eval_data and eval_data != eval_str:
+                pvs[eval_prop] = eval_data
+                self._counters.add_counter('processed-eval', 1, eval_str)
+            pvs.pop(eval_key)
+            is_modified = True
+        logging.level_debug() and logging.log(
+            2, f'Processed data PVs:{is_modified}:{key}:{pvs}')
+        return is_modified
+
+    def get_pvs_for_key(self, key: str, namespace: str = 'GLOBAL') -> dict:
+        """Return a dict of property-values that are mapped to the given key
+
+        within the dictionary for the namespace.
+
+        Args:
+            key: input string to be looked up.
+            namespace: the top level dictionary key for the map within which
+              the input-string is looked up.
+
+        Returns:
+            dictionary of property:values for the input string.
+        """
+        pvs = None
+        logging.level_debug() and logging.log(
+            3, f'Search PVs for {namespace}:{key}')
+        if namespace in self._pv_map:
+            pvs = self._pv_map[namespace].get(key, None)
+        else:
+            # Check if key is unique and exists in any other map.
+            dicts_with_key = []
+            pvs = {}
+            namespaces = self._config.get('default_pv_maps', ['GLOBAL'])
+            for namespace in namespaces:
+                logging.level_debug() and logging.log(
+                    3, f'Search PVs for {namespace}:{key}')
+                if namespace in self._pv_map.keys():
+                    pv_map = self._pv_map[namespace]
+                    if key in pv_map:
+                        dicts_with_key.append(namespace)
+                        pv_utils.pvs_update(
+                            pv_map[key], pvs,
+                            self._config.get('multi_value_properties', {}))
+            if len(dicts_with_key) > 1:
+                logging.warning(
+                    f'Duplicate key {key} in property maps: {dicts_with_key}')
+                self._counters.add_counter(
+                    'warning-multiple-property-key',
+                    1,
+                    f'{key}:' + ','.join(dicts_with_key),
+                )
+        if not pvs:
+            logging.level_debug() and logging.log(
+                3, f'Missing key {key} in property maps')
+            self._counters.add_counter('warning-missing-property-key', 1, key)
+            return pvs
+        logging.level_debug() and logging.debug(f'Got PVs for {key}:{pvs}')
+        return pvs
+
+    def get_pvs_for_key_variants(self,
+                                 key: str,
+                                 namespace: str = 'GLOBAL') -> list:
+        """Return a dict of property-values that are mapped to the given key
+
+        or its variants, such as the lower-cased key.
+
+        Args:
+            key: input string to be looked up.
+            namespace: the top level dictionary key for the map within which
+              the input-string is looked up.
+
+        Returns:
+            a list of dictionaries of property:values for the input string.
+        """
+        if not key:
+            return None
+        pvs = self.get_pvs_for_key(key, namespace)
+        if not pvs:
+            # Check if GLOBAL map has the key namespace:column-key.
+            pvs = self.get_pvs_for_key(f'{namespace}:{key}')
+        if not pvs:
+            pvs = self.get_pvs_for_key(key.lower(), namespace)
+        if pvs:
+            pvs_list = [pvs]
+            pvs_list.append({self._config.get('pv_lookup_key', 'Key'): key})
+            return pvs_list
+        # Check for keys with extra characters removed.
+        key_filtered = re.sub('[^A-Za-z0-9_%$-]+', ' ', key).strip()
+        if key_filtered != key:
+            return self.get_pvs_for_key_variants(key_filtered, namespace)
+        return None
+
+    def _is_key_in_value(self, key: str, value: str) -> bool:
+        """Returns True if key is a substring of the value string.
+
+        Only substrings separated by a word boundary are considered.
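+
+        For example, the key 'male' is a word-boundary match within the value
+        'male population' but not within 'female population'.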
+ """ + if self._config.get('match_substring_word_boundary', True): + # Match substring around word boundaries. + while value: + pos = value.find(key) + if pos < 0: + return False + if (pos == 0 or not value[pos - 1].isalpha()) and ( + pos + len(key) <= len(value) or + not value[pos + len(key)].isalpha()): + return True + value = value[pos:] + return False + # key_pat = f'\\b{key}\\b' + # try: + # if re.search(key_pat, value, flags=re.IGNORECASE): + # return True + # else: + # return False + # except re.error as e: + # logging.error( + # f'Failed re.search({key_pat}, {value}) with exception: {e}' + # ) + # return False + + # Simple substring without word boundary checks. + if key.lower() in value.lower(): + return True + return False + + def get_pvs_for_key_substring(self, + value: str, + namespace: str = 'GLOBAL') -> dict: + """Return a dict of property-values for any key is a substring of value + + Args: + value: input string to be mapped to property:values + namespace: column header or context for the value string used as the key + for the first level dictionary in the pv_map. + + Returns: + List of dictionary of property:values that apply to the input string + after collecting all PVs for any key that is a substring of the value. + """ + # Get a list of namespaces to lookup. + # If none given, lookup in all namespaces. + namespaces = [] + if namespace and namespace in self._pv_map: + namespaces.append(namespace) + else: + namespaces = list(self._pv_map.keys()) + pvs_list = [] + keys_list = [] + for n in namespaces: + # Lookup keys from shortest to longest. + # Caller will merge PVs in the reverse order. + pv_map = self._pv_map[n] + sorted_keys = sorted(pv_map.keys(), key=len, reverse=True) + for key in sorted_keys: + if self._is_key_in_value(key, value): + pvs_list.append(pv_map[key]) + keys_list.append(key) + logging.level_debug() and logging.log( + 3, f'Got PVs for {key} in {value}: {pvs_list}') + value = value.replace(key, ' ') + logging.level_debug() and logging.log( + 2, + f'Returning pvs for substrings of {value} from {keys_list}:{pvs_list}' + ) + return pvs_list + + def get_all_pvs_for_value(self, + value: str, + namespace: str = 'GLOBAL', + max_fragment_size: int = None) -> list: + """Return a list of property:value dictionaries for an input string. + + Args: + value: input string to be mapped to property:values + namespace: context for the input string such as the column header. + max_fragment_size: the maximum number of words into which value can be + fragmented when looking for matching keys in the pv_map. + + Returns: + a list of dictionary of property:values. + """ + logging.level_debug() and logging.log( + 1, f'Looking up PVs for {namespace}:{value}') + pvs = self.get_pvs_for_key_variants(value, namespace) + if pvs: + return pvs + # Split the value into n-grams and lookup PVs for each fragment. + word_delimiter = self._config.get('word_delimiter', ' ') + if not word_delimiter: + # Splitting of words is disabled. Don't match substrings. + return None + word_joiner = pv_utils.get_delimiter_char(word_delimiter) + words = pv_utils.get_words(value, word_delimiter) + if len(words) <= 1: + return None + max_fragment_words = len(words) - 1 + if not max_fragment_size: + max_fragment_size = self._max_words_in_keys + max_fragment_words = min(max_fragment_words, max_fragment_size) + + num_grams = (len(words) - max_fragment_size)**2 + if self._num_pv_map_keys < num_grams: + # Fewer keys than n-grams in input. + # Get PVs for keys in pv_map that are a substring of the input value. 
+ return self.get_pvs_for_key_substring(value, namespace)
+ # Fewer n-grams than number of keys in the map.
+ # Check if any input n-gram matches a key.
+ logging.level_debug() and logging.log(
+ 3, f'Looking up PVs for {max_fragment_words} words in {words}')
+ for num_words in range(max_fragment_words, 0, -1):
+ for start_index in range(0, len(words) - num_words + 1):
+ sub_value = word_joiner.join(words[start_index:start_index +
+ num_words])
+ sub_pvs = self.get_pvs_for_key_variants(sub_value, namespace)
+ if sub_pvs:
+ # Got PVs for a fragment.
+ # Also lookup the remaining fragments before and after this one.
+ pvs_list = []
+ before_value = word_joiner.join(words[0:start_index])
+ after_value = word_joiner.join(words[start_index +
+ num_words:])
+ logging.level_debug() and logging.log(
+ 3,
+ f'Got PVs for {start_index}:{num_words} in'
+ f' {words}:{sub_value}:{sub_pvs}, lookup pvs for {before_value},'
+ f' {after_value}',
+ )
+ before_pvs = self.get_all_pvs_for_value(
+ before_value,
+ namespace,
+ max_fragment_size=num_words,
+ )
+ after_pvs = self.get_all_pvs_for_value(
+ after_value,
+ namespace,
+ max_fragment_size=num_words,
+ )
+ if before_pvs:
+ pvs_list.extend(before_pvs)
+ pvs_list.extend(sub_pvs)
+ if after_pvs:
+ pvs_list.extend(after_pvs)
+ logging.level_debug() and logging.log(
+ 2, f'Got PVs for fragments {before_value}:{before_pvs},'
+ f' {sub_value}:{sub_pvs}, {after_value}:{after_pvs}')
+ return pvs_list
+ return None
+
+
+# Local utility functions
+def _get_variable_expr(stmt: str, default_var: str = 'Data') -> (str, str):
+ """Parses a statement of the form '<variable>=<expression>' and returns
+ the tuple (variable, expression)."""
+ if '=' in stmt:
+ (var, expr) = stmt.split('=', 1)
+ return (var.strip(), expr)
+ return (default_var, stmt)
+
+
+# PVMap utility functions
+def load_pv_map(file: str) -> dict:
+ """Returns a PV map loaded from a file."""
+ pvmap = PropertyValueMapper()
+ for filename in file_util.file_get_matching(file):
+ pvmap.load_pvs_from_file(filename)
+ pvs = pvmap.get_pv_map()
+ # Return the pvmap for the first namespace
+ if pvs:
+ return pvs[list(pvs.keys())[0]]
+ return {}
+
+
+def write_pv_map(pvmap: dict, file: str) -> None:
+ """Writes the PV map into a file."""
+ if file_util.file_is_csv(file):
+ # Write pvmap as a csv file with rows as: key,prop1,value1,prop2,value2
+ with file_util.FileIO(file, 'w') as csv_file:
+ csv_writer = csv.writer(csv_file)
+ # Set CSV header as 'key, property, value'
+ csv_writer.writerow(['key', 'property', 'value'])
+ # Write each pvmap node as a row.
+ for key, pvs in pvmap.items():
+ row = [key]
+ for prop, value in pvs.items():
+ row.append(prop)
+ row.append(value)
+ csv_writer.writerow(row)
+ else:
+ file_util.file_write_py_dict(pvmap, file)
+ logging.info(f'Wrote {len(pvmap)} rows of PVs into {file}')
diff --git a/scripts/statvar/property_value_utils.py b/scripts/statvar/property_value_utils.py
new file mode 100644
index 0000000000..b3863b7fb2
--- /dev/null
+++ b/scripts/statvar/property_value_utils.py
@@ -0,0 +1,154 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Utility functions for property:values."""
+
+import os
+import re
+import sys
+
+from typing import Union
+
+_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(_SCRIPT_DIR)
+sys.path.append(os.path.dirname(_SCRIPT_DIR))
+sys.path.append(os.path.dirname(os.path.dirname(_SCRIPT_DIR)))
+sys.path.append(
+ os.path.join(os.path.dirname(os.path.dirname(_SCRIPT_DIR)), 'util'))
+
+from mcf_file_util import get_value_list, add_pv_to_node, strip_namespace
+
+
+def is_valid_property(prop: str, schemaless: bool = False) -> bool:
+ """Returns True if the property begins with a lowercase letter.
+
+ If schemaless is True, the property can begin with an uppercase letter
+ as well.
+ """
+ if prop and isinstance(prop, str) and prop[0].isalpha():
+ if schemaless or prop[0].islower():
+ return True
+ return False
+
+
+def is_valid_value(value: str) -> bool:
+ """Returns True if the value is valid without any unresolved references."""
+ if value is None:
+ return False
+ if isinstance(value, str):
+ # Check there are no unresolved references.
+ if not value or value == '""':
+ return False
+ if '@' in value:
+ # Quoted strings can have an @<2-letter-lang> suffix.
+ if not re.search('@[a-z]{2}"$', value):
+ return False
+ if '{' in value and '}' in value:
+ return False
+ return True
+
+
+def is_schema_node(value: str) -> bool:
+ """Returns True if the value is a schema node reference."""
+ if not value or not isinstance(value, str):
+ return False
+ if not value[0].isalpha() and value[0] != '[':
+ # Numbers or quoted strings are not schema nodes.
+ return False
+ # Check if the string has any non-alphanumeric characters
+ # other than '_', '/', '[', ']' and '.'.
+ non_alnum_chars = [
+ c for c in strip_namespace(value)
+ if not c.isalnum() and c not in ['_', '/', '[', ']', '.']
+ ]
+ if non_alnum_chars:
+ return False
+ return True
+
+
+def has_namespace(value: str) -> bool:
+ """Returns True if the value has a namespace of letters followed by ':'."""
+ if not value or not isinstance(value, str):
+ return False
+ len_value = len(value)
+ pos = 0
+ while pos < len_value:
+ if not value[pos].isalpha():
+ break
+ pos += 1
+ if pos < len_value and value[pos] == ':':
+ return True
+ return False
+
+
+def add_key_value(
+ key: str,
+ value: str,
+ pvs: dict,
+ multi_value_keys: set = {},
+ overwrite: bool = True,
+ normalize: bool = True,
+) -> dict:
+ """Adds a key:value to the dict.
+
+ If the key already exists, adds the value to a list if the key is a
+ multi_value key, else replaces the value if overwrite is True.
+ """
+ append_value = False
+ if key in multi_value_keys:
+ append_value = True
+ if not append_value and not overwrite and key in pvs:
+ # Do not replace an existing value when append and overwrite are disabled.
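+ # For example, add_key_value('name', 'B', {'name': 'A'}, overwrite=False)
+ # returns {'name': 'A'} unchanged.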
+ return pvs + return add_pv_to_node(key, value, pvs, append_value=append_value, normalize=normalize) + + +def get_value_as_list(value: str) -> Union[str, list]: + """Returns the value as a list or string.""" + if isinstance(value, list): + return value + if isinstance(value, str) and value: + if "," in value: + # Get a list of unique values + values = set() + values.update(get_value_list(value)) + value_list = list(values) + if len(value_list) == 1: + return value_list[0] + return value_list + return value + + +def pvs_update(new_pvs: dict, pvs: dict, multi_value_keys: set = {}) -> dict: + """Add the key:value pairs from the new_pvs into the pvs dictionary.""" + for prop, value in new_pvs.items(): + add_key_value(prop, value, pvs, multi_value_keys) + return pvs + + +def get_words(value: str, word_delimiter: str) -> list: + """Returns the list of non-empty words separated by the delimiter.""" + return [w for w in re.split(word_delimiter, value) if w] + + +def get_delimiter_char(re_delimiter: str) -> str: + """Returns a single delimiter character that can be used to join words + + from the first character in the delimiter regex. + """ + if re_delimiter: + if '|' in re_delimiter: + return re_delimiter.split('|')[0] + if re_delimiter[0] == '[': + return re_delimiter[1] + return ' ' + + From 8abda2b736546191f2d5f82fe8be56ec8843299c Mon Sep 17 00:00:00 2001 From: Ajai Tirumali Date: Fri, 13 Dec 2024 15:18:41 +0530 Subject: [PATCH 2/3] move statvar files to tools/statvar_importer --- {scripts/statvar => tools/statvar_importer}/__init__.py | 0 {scripts/statvar => tools/statvar_importer}/mcf_diff.py | 0 {scripts/statvar => tools/statvar_importer}/mcf_diff_test.py | 0 {scripts/statvar => tools/statvar_importer}/mcf_file_util.py | 0 {scripts/statvar => tools/statvar_importer}/mcf_file_util_test.py | 0 {scripts/statvar => tools/statvar_importer}/mcf_filter.py | 0 {scripts/statvar => tools/statvar_importer}/mcf_filter_test.py | 0 {scripts/statvar => tools/statvar_importer}/ngram_matcher.py | 0 {scripts/statvar => tools/statvar_importer}/ngram_matcher_test.py | 0 .../statvar => tools/statvar_importer}/property_value_cache.py | 0 .../statvar_importer}/property_value_cache_test.py | 0 .../statvar => tools/statvar_importer}/property_value_mapper.py | 0 .../statvar => tools/statvar_importer}/property_value_utils.py | 0 .../test_data/india_census_sample_output_stat_vars.mcf | 0 .../statvar_importer}/test_data/sample_filtered.mcf | 0 .../statvar_importer}/test_data/sample_output_stat_vars.mcf | 0 .../test_data/us_census_B01001_output_stat_vars.mcf | 0 17 files changed, 0 insertions(+), 0 deletions(-) rename {scripts/statvar => tools/statvar_importer}/__init__.py (100%) rename {scripts/statvar => tools/statvar_importer}/mcf_diff.py (100%) rename {scripts/statvar => tools/statvar_importer}/mcf_diff_test.py (100%) rename {scripts/statvar => tools/statvar_importer}/mcf_file_util.py (100%) rename {scripts/statvar => tools/statvar_importer}/mcf_file_util_test.py (100%) rename {scripts/statvar => tools/statvar_importer}/mcf_filter.py (100%) rename {scripts/statvar => tools/statvar_importer}/mcf_filter_test.py (100%) rename {scripts/statvar => tools/statvar_importer}/ngram_matcher.py (100%) rename {scripts/statvar => tools/statvar_importer}/ngram_matcher_test.py (100%) rename {scripts/statvar => tools/statvar_importer}/property_value_cache.py (100%) rename {scripts/statvar => tools/statvar_importer}/property_value_cache_test.py (100%) rename {scripts/statvar => tools/statvar_importer}/property_value_mapper.py (100%) 
rename {scripts/statvar => tools/statvar_importer}/property_value_utils.py (100%) rename {scripts/statvar => tools/statvar_importer}/test_data/india_census_sample_output_stat_vars.mcf (100%) rename {scripts/statvar => tools/statvar_importer}/test_data/sample_filtered.mcf (100%) rename {scripts/statvar => tools/statvar_importer}/test_data/sample_output_stat_vars.mcf (100%) rename {scripts/statvar => tools/statvar_importer}/test_data/us_census_B01001_output_stat_vars.mcf (100%) diff --git a/scripts/statvar/__init__.py b/tools/statvar_importer/__init__.py similarity index 100% rename from scripts/statvar/__init__.py rename to tools/statvar_importer/__init__.py diff --git a/scripts/statvar/mcf_diff.py b/tools/statvar_importer/mcf_diff.py similarity index 100% rename from scripts/statvar/mcf_diff.py rename to tools/statvar_importer/mcf_diff.py diff --git a/scripts/statvar/mcf_diff_test.py b/tools/statvar_importer/mcf_diff_test.py similarity index 100% rename from scripts/statvar/mcf_diff_test.py rename to tools/statvar_importer/mcf_diff_test.py diff --git a/scripts/statvar/mcf_file_util.py b/tools/statvar_importer/mcf_file_util.py similarity index 100% rename from scripts/statvar/mcf_file_util.py rename to tools/statvar_importer/mcf_file_util.py diff --git a/scripts/statvar/mcf_file_util_test.py b/tools/statvar_importer/mcf_file_util_test.py similarity index 100% rename from scripts/statvar/mcf_file_util_test.py rename to tools/statvar_importer/mcf_file_util_test.py diff --git a/scripts/statvar/mcf_filter.py b/tools/statvar_importer/mcf_filter.py similarity index 100% rename from scripts/statvar/mcf_filter.py rename to tools/statvar_importer/mcf_filter.py diff --git a/scripts/statvar/mcf_filter_test.py b/tools/statvar_importer/mcf_filter_test.py similarity index 100% rename from scripts/statvar/mcf_filter_test.py rename to tools/statvar_importer/mcf_filter_test.py diff --git a/scripts/statvar/ngram_matcher.py b/tools/statvar_importer/ngram_matcher.py similarity index 100% rename from scripts/statvar/ngram_matcher.py rename to tools/statvar_importer/ngram_matcher.py diff --git a/scripts/statvar/ngram_matcher_test.py b/tools/statvar_importer/ngram_matcher_test.py similarity index 100% rename from scripts/statvar/ngram_matcher_test.py rename to tools/statvar_importer/ngram_matcher_test.py diff --git a/scripts/statvar/property_value_cache.py b/tools/statvar_importer/property_value_cache.py similarity index 100% rename from scripts/statvar/property_value_cache.py rename to tools/statvar_importer/property_value_cache.py diff --git a/scripts/statvar/property_value_cache_test.py b/tools/statvar_importer/property_value_cache_test.py similarity index 100% rename from scripts/statvar/property_value_cache_test.py rename to tools/statvar_importer/property_value_cache_test.py diff --git a/scripts/statvar/property_value_mapper.py b/tools/statvar_importer/property_value_mapper.py similarity index 100% rename from scripts/statvar/property_value_mapper.py rename to tools/statvar_importer/property_value_mapper.py diff --git a/scripts/statvar/property_value_utils.py b/tools/statvar_importer/property_value_utils.py similarity index 100% rename from scripts/statvar/property_value_utils.py rename to tools/statvar_importer/property_value_utils.py diff --git a/scripts/statvar/test_data/india_census_sample_output_stat_vars.mcf b/tools/statvar_importer/test_data/india_census_sample_output_stat_vars.mcf similarity index 100% rename from scripts/statvar/test_data/india_census_sample_output_stat_vars.mcf rename to 
tools/statvar_importer/test_data/india_census_sample_output_stat_vars.mcf
diff --git a/scripts/statvar/test_data/sample_filtered.mcf b/tools/statvar_importer/test_data/sample_filtered.mcf
similarity index 100%
rename from scripts/statvar/test_data/sample_filtered.mcf
rename to tools/statvar_importer/test_data/sample_filtered.mcf
diff --git a/scripts/statvar/test_data/sample_output_stat_vars.mcf b/tools/statvar_importer/test_data/sample_output_stat_vars.mcf
similarity index 100%
rename from scripts/statvar/test_data/sample_output_stat_vars.mcf
rename to tools/statvar_importer/test_data/sample_output_stat_vars.mcf
diff --git a/scripts/statvar/test_data/us_census_B01001_output_stat_vars.mcf b/tools/statvar_importer/test_data/us_census_B01001_output_stat_vars.mcf
similarity index 100%
rename from scripts/statvar/test_data/us_census_B01001_output_stat_vars.mcf
rename to tools/statvar_importer/test_data/us_census_B01001_output_stat_vars.mcf

From 59232d1f4d878a9e40ab434e9e045354d6f7400b Mon Sep 17 00:00:00 2001
From: Ajai Tirumali
Date: Tue, 17 Dec 2024 20:46:41 +0530
Subject: [PATCH 3/3] fix comments

---
 tools/statvar_importer/mcf_file_util.py | 9 +-
 .../statvar_importer/property_value_mapper.py | 627 ------------------
 .../statvar_importer/property_value_utils.py | 154 -----
 3 files changed, 4 insertions(+), 786 deletions(-)
 delete mode 100644 tools/statvar_importer/property_value_mapper.py
 delete mode 100644 tools/statvar_importer/property_value_utils.py

diff --git a/tools/statvar_importer/mcf_file_util.py b/tools/statvar_importer/mcf_file_util.py
index 95c83ce3a4..970e7c3496 100644
--- a/tools/statvar_importer/mcf_file_util.py
+++ b/tools/statvar_importer/mcf_file_util.py
@@ -145,10 +145,11 @@ def strip_namespace(value: str) -> str:
 def strip_value(value: str) -> str:
- """Returns the string value with spacesding/trailing space stripped.
+ """Returns the string value with leading/trailing space stripped
+ even if the value is enclosed in double quotes.
 Args:
- value: string to be cleaned.
+ value: string to be cleaned as text or within double quotes.
 Returns:
 string without extra leading and trailing spaces.
@@ -346,7 +347,7 @@ def update_mcf_nodes(
 append_values: bool = True,
 normalize: bool = True,
 ) -> dict:
- """Returns output_nodes with Property:values form nodes added.
+ """Returns output_nodes with Property:values from nodes added.
 Args:
 nodes: dictionary of MCF nodes in the form:
@@ -409,8 +410,6 @@ def load_mcf_nodes(
 ...
 }
 """
- if nodes is None:
- nodes = {}
 if not filenames:
 return nodes
 # Load files in order of input
diff --git a/tools/statvar_importer/property_value_mapper.py b/tools/statvar_importer/property_value_mapper.py
deleted file mode 100644
index b58356fcf5..0000000000
--- a/tools/statvar_importer/property_value_mapper.py
+++ /dev/null
@@ -1,627 +0,0 @@
-# Copyright 2024 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the 'License');
-# you may not use this file except in compliance with the License.
-"""Utility class to store property:value mappings for data strings.""" - -import csv -import os -import re -import sys - -from absl import app -from absl import flags -from absl import logging -from collections import OrderedDict - -_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) -sys.path.append(_SCRIPT_DIR) -sys.path.append(os.path.dirname(_SCRIPT_DIR)) -sys.path.append(os.path.dirname(os.path.dirname(_SCRIPT_DIR))) -sys.path.append( - os.path.join(os.path.dirname(os.path.dirname(_SCRIPT_DIR)), 'util')) - -import config_flags -import eval_functions -import file_util - -import property_value_utils as pv_utils - -from config_map import ConfigMap, read_py_dict_from_file -from counters import Counters, CounterOptions - - -class PropertyValueMapper: - """Class to map strings to set of property values. - - Supports multiple maps with a namespace or context string. Stores string to - property:value maps as a dictionary: _pv_map = { - - 'GLOBAL': { - '': { - '': '' - '': '' - ... - }, - ... - }, - '' : { - '': { - '': '' - ... - }, - ... - }, - } - - The first level keys in _pv_map are namespaces that are column-headers or - 'GLOBAL'. - When looking up PVs for an input string, such as a column header or a cell - value, - first the namespace column-header is tried. - If there are no values then other namespacs such as 'GLOBAL are tried. - - within the PV can have a reference to another property. - Such reference are replaced with that property's value after - all PVs for a data cell have been collected. - - The references are indicated with the syntax '{Variable}' or '@Variable'. - where 'Variable' is expected to be another property in the cell's PVs. - - Internal properties that require special processing begin with '#', such as: - '#Regex': refers to a regular expression with names match groups - to be applied on a cell value - '#Format': a format string to be processed with other parameters - '#Eval': a python statement to be evaluated. It could have some computations - of the form = where the '' is evaluated and - assigned to property or to 'Data'. - - The cell value is mapped to the following default properties: - 'Data': the string value in the cell - 'Number': the numeric value if the cell is a number. - """ - - def __init__( - self, - pv_map_files: list = [], - config_dict: dict = None, - counters_dict: dict = None, - ): - self._config = ConfigMap(config_dict=config_dict) - self._counters = Counters( - counters_dict=counters_dict, - options=CounterOptions(debug=self._config.get('debug', False)), - ) - # Map from a namespace to dictionary of string-> { p:v} - self._pv_map = OrderedDict({'GLOBAL': {}}) - self._num_pv_map_keys = 0 - self._max_words_in_keys = 0 - for filename in pv_map_files: - namespace = 'GLOBAL' - if not file_util.file_get_matching(filename): - if ':' in filename: - namespace, filename = filename.split(':', 1) - self.load_pvs_from_file(filename, namespace) - logging.level_debug() and logging.debug( - f'Loaded PV map {self._pv_map} with max words {self._max_words_in_keys}' - ) - - def load_pvs_from_file(self, filename: str, namespace: str = 'GLOBAL'): - """Loads a map of the form 'string -> { P: V }' from a file. - - File is a python dictionary or a JSON file with python equivalents such as - True(true), False(false), None(null). - - Args: - filename: file containing the dictionary of string to dictionary of PVs - namespace: the namespace key for the dictionary to be loaded against. the - namespace is the first level key in the _pv_map. 
- """ - # Append new PVs to existing map. - pv_map_input = {} - if file_util.file_is_csv(filename): - # Load rows into a dict of prop,value - # if the first col is a config key, next column is its value - logging.info( - f'Loading PV maps for {namespace} from csv file: {filename}') - with file_util.FileIO(filename) as csvfile: - csv_reader = csv.reader(csvfile, - skipinitialspace=True, - escapechar='\\') - for row in csv_reader: - # Drop trailing empty columns in the row - last_col = len(row) - 1 - while last_col >= 0 and row[last_col].strip() == '': - last_col -= 1 - row = row[:last_col + 1] - if not row: - continue - key = row[0].strip() - if key in self._config.get_configs(): - # Add value to the config with same type as original. - value = ','.join(row[1:]) - config_flags.set_config_value(key, value, self._config) - else: - # Row is a pv map - pvs_list = row[1:] - if len(pvs_list) == 1: - # PVs list has no property, just a value. - # Use the namespace as the property - pvs_list = [namespace] - pvs_list.append(row[1]) - if len(pvs_list) % 2 != 0: - raise RuntimeError( - f'Invalid list of property value: {row} in {filename}' - ) - # Get property,values from the columns - pvs = {} - for i in range(0, len(pvs_list), 2): - prop = pvs_list[i].strip() - if not prop: - continue - value = pvs_list[i + 1].strip() - if value == '""': - value = '' - # Remove extra quotes around schema values. - # if value and value[0] == '"' and value[-1] == '"': - # value = value[1:-1].strip() - if value and value[0] != '[' and prop[0] != '#': - # Add quotes around text strings - # with spaces without commas. - # if re.search('[^,] +', value): - # value = f'"{value}"' - if value[0] == "'" and value[-1] == "'": - # Replace single quote with double quotes - # To distinguish quote as delimiter vs value in CSVs - # single quote is used instead of double quote in CSV values. - value[0] = '"' - value[-1] = '"' - #pvs[prop] = value - normalize = True - if '#' in prop or '=' in value: - # Value is a formula. e value as a string. - normalize = False - pv_utils.add_key_value( - prop, - value, - pvs, - self._config.get('multi_value_properties', {}), - normalize=normalize - ) - pv_map_input[key] = pvs - else: - logging.info( - f'Loading PV maps for {namespace} from dictionary file: {filename}' - ) - pv_map_input = read_py_dict_from_file(filename) - self.load_pvs_dict(pv_map_input, namespace) - - def load_pvs_dict(self, pv_map_input: dict, namespace: str = 'GLOBAL'): - if namespace not in self._pv_map: - self._pv_map[namespace] = {} - pv_map = self._pv_map[namespace] - word_delimiter = self._config.get('word_delimiter', ' ') - num_keys_added = 0 - for key, pvs_input in pv_map_input.items(): - if key not in pv_map: - pv_map[key] = {} - pvs_dict = pv_map[key] - if isinstance(pvs_input, str): - pvs_input = {namespace: pvs_input} - for p, v in pvs_input.items(): - # A property has multiple values from different configs. - # Concatenate new value to existing one with '__' - #if v not in pvs_dict[p]: - # pvs_dict[p] = '__'.join(sorted([pvs_dict[p], v])) - # logging.info(f'Joining values for {key}[{p}] into {pvs_dict[p]}') - #else: - #pv_utils.add_key_value( - # p, - # v, - # pvs_dict, - # self._config.get('multi_value_properties', {}), - #) - num_keys_added += 1 - pv_utils.add_key_value( - p, - v, - pvs_dict, - self._config.get('multi_value_properties', {}), - ) - # Track the max number of words in any of the keys. - # This is used when splitting input-string for lookups. 
- num_words_key = len(pv_utils.get_words(key, word_delimiter)) - self._max_words_in_keys = max(self._max_words_in_keys, - num_words_key) - logging.level_debug() and logging.log( - 2, f'Setting PVMap[{key}] = {pvs_dict}') - - self._num_pv_map_keys += num_keys_added - logging.info( - f'Loaded {num_keys_added} property-value mappings for "{namespace}"' - ) - logging.level_debug() and logging.debug( - f'Loaded pv map {namespace}:{pv_map_input}') - - def get_pv_map(self) -> dict: - """Returns the dictionary mapping input-strings to property:values.""" - return self._pv_map - - def process_pvs_for_data(self, key: str, pvs: dict) -> bool: - """Returns true if property:values are processed successfully. - - Processes values for actionable props such as '#Regex', '#Eval', '#Format'. - Args: pvs (input/output) dictionary of property:values Properties such as - '#Regex', '#Eval', '#Format' are processed and resulting properties are - updated into pvs. - - Returns: - True if any property:values were processed and pvs dict was updated. - """ - logging.level_debug() and logging.log( - 2, f'Processing data PVs:{key}:{pvs}') - data_key = self._config.get('data_key', 'Data') - data = pvs.get(data_key, key) - is_modified = False - - # Process regular expression and add named group matches to the PV. - # Regex PV is of the form: '#Regex': '(?P[0-9]+) *- *(?P[0-9])' - # Parses 'Data': '10 - 20' to generate PVs: - # { 'Start': '10', 'End': '20' } - regex_key = self._config.get('regex_key', '#Regex') - if regex_key in pvs and data: - re_pattern = pvs[regex_key] - re_matches = re.finditer(re_pattern, data) - regex_pvs = {} - for match in re_matches: - regex_pvs.update(match.groupdict()) - logging.level_debug() and logging.log( - 2, - f'Processed regex: {re_pattern} on {key}:{data} to get {regex_pvs}' - ) - if regex_pvs: - self._counters.add_counter('processed-regex', 1, re_pattern) - pv_utils.pvs_update( - regex_pvs, pvs, - self._config.get('multi_value_properties', {})) - pvs.pop(regex_key) - is_modified = True - - # Format the data substituting properties with values. - format_key = self._config.get('format_key', '#Format') - if format_key in pvs: - format_str = pvs[format_key] - (format_prop, strf) = _get_variable_expr(format_str, data_key) - try: - format_data = strf.format(**pvs) - logging.level_debug() and logging.log( - 2, - f'Processed format {format_prop}={strf} on {key}:{data} to get' - f' {format_data}') - except (KeyError, ValueError) as e: - format_data = format_str - self._counters.add_counter('error-process-format', 1, - format_str) - logging.level_debug() and logging.log( - 2, - f'Failed to format {format_prop}={strf} on {key}:{data} with' - f' {pvs}, {e}') - if format_prop != data_key and format_data != format_str: - pvs[format_prop] = format_data - self._counters.add_counter('processed-format', 1, format_str) - pvs.pop(format_key) - is_modified = True - - # Evaluate the expression properties as local variables. 
- eval_key = self._config.get('eval_key', '#Eval') - if eval_key in pvs: - eval_str = pvs[eval_key] - eval_prop, eval_data = eval_functions.evaluate_statement( - eval_str, - pvs, - self._config.get('eval_globals', eval_functions.EVAL_GLOBALS), - ) - logging.level_debug() and logging.log( - 2, - f'Processed eval {eval_str} with {pvs} to get {eval_prop}:{eval_data}' - ) - if not eval_prop: - eval_prop = data_key - if eval_data and eval_data != eval_str: - pvs[eval_prop] = eval_data - self._counters.add_counter('processed-eval', 1, eval_str) - pvs.pop(eval_key) - is_modified = True - logging.level_debug() and logging.log( - 2, f'Processed data PVs:{is_modified}:{key}:{pvs}') - return is_modified - - def get_pvs_for_key(self, key: str, namespace: str = 'GLOBAL') -> dict: - """Return a dict of property-values that are mapped to the given key - - within the dictionary for the namespace. - Args: - key: input string to be looked up - namespace: the top level dictionary key to get the map within which - input-string is looked up. - - Returns: - dictionary of property:values for the input string. - """ - pvs = None - logging.level_debug() and logging.log( - 3, f'Search PVs for {namespace}:{key}') - if namespace in self._pv_map: - pvs = self._pv_map[namespace].get(key, None) - else: - # Check if key is unique and exists in any other map. - dicts_with_key = [] - pvs = {} - namespaces = self._config.get('default_pv_maps', ['GLOBAL']) - for namespace in namespaces: - logging.level_debug() and logging.log( - 3, f'Search PVs for {namespace}:{key}') - if namespace in self._pv_map.keys(): - pv_map = self._pv_map[namespace] - if key in pv_map: - dicts_with_key.append(namespace) - pv_utils.pvs_update( - pv_map[key], pvs, - self._config.get('multi_value_properties', {})) - if len(dicts_with_key) > 1: - logging.warning( - f'Duplicate key {key} in property maps: {dicts_with_key}') - self._counters.add_counter( - f'warning-multiple-property-key', - 1, - f'{key}:' + ','.join(dicts_with_key), - ) - if not pvs: - logging.level_debug() and logging.log( - 3, f'Missing key {key} in property maps') - self._counters.add_counter(f'warning-missing-property-key', 1, key) - return pvs - logging.level_debug() and logging.debug(f'Got PVs for {key}:{pvs}') - return pvs - - def get_pvs_for_key_variants(self, - key: str, - namespace: str = 'GLOBAL') -> list: - """Return a dict of property-values that are mapped to the given key - - or its variantes with case lower case. - Args: - key: input string to be looked up - namespace: the top level dictionary key to get the map within which - input-string is looked up. - - Returns: - a list of dictionary of property:values for the input string. - """ - if not key: - return None - pvs = self.get_pvs_for_key(key, namespace) - if not pvs: - # Check if GLOBAL map has key namespace:column-key - pvs = self.get_pvs_for_key(f'{namespace}:{key}') - if not pvs: - pvs = self.get_pvs_for_key(key.lower(), namespace) - if pvs: - pvs_list = [pvs] - pvs_list.append({self._config.get('pv_lookup_key', 'Key'): key}) - return pvs_list - # Check for keys with extra characters removed. - key_filtered = re.sub('[^A-Za-z0-9_%$-]+', ' ', key).strip() - if key_filtered != key: - return self.get_pvs_for_key_variants(key_filtered, namespace) - return None - - def _is_key_in_value(self, key: str, value: str) -> bool: - """Returns True if key is a substring of the value string. - - Only substrings separated by the word boundary are considered. 
- """ - if self._config.get('match_substring_word_boundary', True): - # Match substring around word boundaries. - while value: - pos = value.find(key) - if pos < 0: - return False - if (pos == 0 or not value[pos - 1].isalpha()) and ( - pos + len(key) <= len(value) or - not value[pos + len(key)].isalpha()): - return True - value = value[pos:] - return False - # key_pat = f'\\b{key}\\b' - # try: - # if re.search(key_pat, value, flags=re.IGNORECASE): - # return True - # else: - # return False - # except re.error as e: - # logging.error( - # f'Failed re.search({key_pat}, {value}) with exception: {e}' - # ) - # return False - - # Simple substring without word boundary checks. - if key.lower() in value.lower(): - return True - return False - - def get_pvs_for_key_substring(self, - value: str, - namespace: str = 'GLOBAL') -> dict: - """Return a dict of property-values for any key is a substring of value - - Args: - value: input string to be mapped to property:values - namespace: column header or context for the value string used as the key - for the first level dictionary in the pv_map. - - Returns: - List of dictionary of property:values that apply to the input string - after collecting all PVs for any key that is a substring of the value. - """ - # Get a list of namespaces to lookup. - # If none given, lookup in all namespaces. - namespaces = [] - if namespace and namespace in self._pv_map: - namespaces.append(namespace) - else: - namespaces = list(self._pv_map.keys()) - pvs_list = [] - keys_list = [] - for n in namespaces: - # Lookup keys from shortest to longest. - # Caller will merge PVs in the reverse order. - pv_map = self._pv_map[n] - sorted_keys = sorted(pv_map.keys(), key=len, reverse=True) - for key in sorted_keys: - if self._is_key_in_value(key, value): - pvs_list.append(pv_map[key]) - keys_list.append(key) - logging.level_debug() and logging.log( - 3, f'Got PVs for {key} in {value}: {pvs_list}') - value = value.replace(key, ' ') - logging.level_debug() and logging.log( - 2, - f'Returning pvs for substrings of {value} from {keys_list}:{pvs_list}' - ) - return pvs_list - - def get_all_pvs_for_value(self, - value: str, - namespace: str = 'GLOBAL', - max_fragment_size: int = None) -> list: - """Return a list of property:value dictionaries for an input string. - - Args: - value: input string to be mapped to property:values - namespace: context for the input string such as the column header. - max_fragment_size: the maximum number of words into which value can be - fragmented when looking for matching keys in the pv_map. - - Returns: - a list of dictionary of property:values. - """ - logging.level_debug() and logging.log( - 1, f'Looking up PVs for {namespace}:{value}') - pvs = self.get_pvs_for_key_variants(value, namespace) - if pvs: - return pvs - # Split the value into n-grams and lookup PVs for each fragment. - word_delimiter = self._config.get('word_delimiter', ' ') - if not word_delimiter: - # Splitting of words is disabled. Don't match substrings. - return None - word_joiner = pv_utils.get_delimiter_char(word_delimiter) - words = pv_utils.get_words(value, word_delimiter) - if len(words) <= 1: - return None - max_fragment_words = len(words) - 1 - if not max_fragment_size: - max_fragment_size = self._max_words_in_keys - max_fragment_words = min(max_fragment_words, max_fragment_size) - - num_grams = (len(words) - max_fragment_size)**2 - if self._num_pv_map_keys < num_grams: - # Fewer keys than n-grams in input. - # Get PVs for keys in pv_map that are a substring of the input value. 
- return self.get_pvs_for_key_substring(value, namespace) - # Fewer n-grams than number of keys in map. - # Check if any input n-gram matches a key. - logging.level_debug() and logging.log( - 3, f'Looking up PVs for {max_fragment_words} words in {words}') - for num_words in range(max_fragment_words, 0, -1): - for start_index in range(0, len(words) - num_words + 1): - sub_value = word_joiner.join(words[start_index:start_index + - num_words]) - sub_pvs = self.get_pvs_for_key_variants(sub_value, namespace) - if sub_pvs: - # Got PVs for a fragment. - # Also lookup remaining fragments before and after this. - pvs_list = [] - before_value = word_delimiter.join(words[0:start_index]) - after_value = word_delimiter.join(words[start_index + - num_words:]) - logging.level_debug() and logging.log( - 3, - f'Got PVs for {start_index}:{num_words} in' - f' {words}:{sub_value}:{sub_pvs}, lookup pvs for {before_value},' - f' {after_value}', - ) - before_pvs = self.get_all_pvs_for_value( - # before_value, namespace, max_fragment_size=None) - before_value, - namespace, - max_fragment_size=num_words, - ) - after_pvs = self.get_all_pvs_for_value( - # after_value, namespace, max_fragment_size=None) - after_value, - namespace, - max_fragment_size=num_words, - ) - if before_pvs: - pvs_list.extend(before_pvs) - pvs_list.extend(sub_pvs) - if after_pvs: - pvs_list.extend(after_pvs) - logging.level_debug() and logging.log( - 2, f'Got PVs for fragments {before_value}:{before_pvs},' - f' {sub_value}:{sub_pvs}, {after_value}:{after_pvs}') - return pvs_list - return None - - -# Local utility functions -def _get_variable_expr(stmt: str, default_var: str = 'Data') -> (str, str): - """Parses a statement of the form = and returns variable, expr.""" - if '=' in stmt: - (var, expr) = stmt.split('=', 1) - return (var.strip(), expr) - return (default_var, stmt) - - -# PVMap utility functions -def load_pv_map(file: str) -> dict: - """Returns a PV map loaded from a file.""" - pvmap = PropertyValueMapper() - for file in file_util.file_get_matching(file): - pvmap.load_pvs_from_file(file) - pvs = pvmap.get_pv_map() - # Return the pvmap for the first namespace - if pvs: - return pvs[list(pvs.keys())[0]] - return {} - - -def write_pv_map(pvmap: dict, file: str) -> str: - """Write the PV map into a file.""" - if file_util.file_is_csv(file): - # Write pvmap as csv file with rows as : key,prop1,value1,prop2,value2 - with file_util.FileIO(file, 'w') as csv_file: - csv_writer = csv.writer(csv_file) - # Set CSV header as 'key, prop, value' - csv_writer.writerow(['key', 'property', 'value']) - # Write each pvmap node as a row. - for key, pvs in pvmap.items(): - row = [key] - for prop, value in pvs.items(): - row.append(prop) - row.append(value) - csv_writer.writerow(row) - else: - file_util.file_write_py_dict(pvmap, file) - logging.info(f'Wrote {len(pvmap)} rows of PVs into {file}') diff --git a/tools/statvar_importer/property_value_utils.py b/tools/statvar_importer/property_value_utils.py deleted file mode 100644 index b3863b7fb2..0000000000 --- a/tools/statvar_importer/property_value_utils.py +++ /dev/null @@ -1,154 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the 'License'); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an 'AS IS' BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Utility functions for proerty:values.""" - -import os -import re -import sys - -from typing import Union - -_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) -sys.path.append(_SCRIPT_DIR) -sys.path.append(os.path.dirname(_SCRIPT_DIR)) -sys.path.append(os.path.dirname(os.path.dirname(_SCRIPT_DIR))) -sys.path.append( - os.path.join(os.path.dirname(os.path.dirname(_SCRIPT_DIR)), 'util')) - -from mcf_file_util import get_value_list, add_pv_to_node, strip_namespace - - -def is_valid_property(prop: str, schemaless: bool = False) -> bool: - """Returns True if the property begins with a letter, lowercase. - - If schemaless is true, property can begin with uppercase as well. - """ - if prop and isinstance(prop, str) and prop[0].isalpha(): - if schemaless or prop[0].islower(): - return True - return False - - -def is_valid_value(value: str) -> bool: - """Returns True if the value is valid without any references.""" - if value is None: - return False - if isinstance(value, str): - # Check there are no unresolved references. - if not value or value == '""': - return False - if '@' in value: - # Quoted strings can have @<2-letter-lang> suffix. - if not re.search('@[a-z]{2}"$', value): - return False - if '{' in value and '}' in value: - return False - return True - - -def is_schema_node(value: str) -> bool: - """Returns True if the value is a schema node reference.""" - if not value or not isinstance(value, str): - return False - if not value[0].isalpha() and value[0] != '[': - # Numbers or quoted strings are not schema nodes. - return False - # Check if string has any non alpha or non numeric codes - non_alnum_chars = [ - c for c in strip_namespace(value) - if not c.isalnum() and c not in ['_', '/', '[', ']', '.'] - ] - if non_alnum_chars: - return False - return True - - -def has_namespace(value: str) -> bool: - """Returns True if the value has a namespace of letters followed by ':'.""" - if not value or not isinstance(value, str): - return False - len_value = len(value) - pos = 0 - while pos < len_value: - if not value[pos].isalpha(): - break - pos += 1 - if pos < len_value and value[pos] == ':': - return True - return False - - -def add_key_value( - key: str, - value: str, - pvs: dict, - multi_value_keys: set = {}, - overwrite: bool = True, - normalize: bool = True, -) -> dict: - """Adds a key:value to the dict. - - If the key already exists, adds value to a list if key is a multi_value key, - else replaces the value if overwrite is True. - """ - append_value = False - if key in multi_value_keys: - append_value = True - if not append_value and not overwrite and key in pvs: - # Do not add value if one exists and overwrite and append is disabled. 
- return pvs - return add_pv_to_node(key, value, pvs, append_value=append_value, normalize=normalize) - - -def get_value_as_list(value: str) -> Union[str, list]: - """Returns the value as a list or string.""" - if isinstance(value, list): - return value - if isinstance(value, str) and value: - if "," in value: - # Get a list of unique values - values = set() - values.update(get_value_list(value)) - value_list = list(values) - if len(value_list) == 1: - return value_list[0] - return value_list - return value - - -def pvs_update(new_pvs: dict, pvs: dict, multi_value_keys: set = {}) -> dict: - """Add the key:value pairs from the new_pvs into the pvs dictionary.""" - for prop, value in new_pvs.items(): - add_key_value(prop, value, pvs, multi_value_keys) - return pvs - - -def get_words(value: str, word_delimiter: str) -> list: - """Returns the list of non-empty words separated by the delimiter.""" - return [w for w in re.split(word_delimiter, value) if w] - - -def get_delimiter_char(re_delimiter: str) -> str: - """Returns a single delimiter character that can be used to join words - - from the first character in the delimiter regex. - """ - if re_delimiter: - if '|' in re_delimiter: - return re_delimiter.split('|')[0] - if re_delimiter[0] == '[': - return re_delimiter[1] - return ' ' - -
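
A minimal usage sketch for the PropertyValueMapper added in this series
(illustrative only; the pv-map file name and its contents are assumptions,
not part of these patches):

# sketch.py: look up property:values for an input string.
from property_value_mapper import PropertyValueMapper

# 'sample_pv_map.py' is a hypothetical file mapping strings to PVs, e.g.
# { 'Total Population': {'populationType': 'Person'} }
mapper = PropertyValueMapper(pv_map_files=['sample_pv_map.py'])

# Tries the full string first, then word n-gram fragments, and returns a
# list of {property: value} dicts (or None if nothing matches).
pvs = mapper.get_all_pvs_for_value('Total Population', namespace='GLOBAL')
print(pvs)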