From e45f23993971badffc5272b71acf2f82772b6f2c Mon Sep 17 00:00:00 2001
From: rafasumi
Date: Sat, 30 Nov 2024 16:38:48 -0300
Subject: [PATCH 1/3] Add script to tune probabilities.

---
 scripts/tune_probabilities.py | 133 ++++++++++++++++++++++++++++++++++
 1 file changed, 133 insertions(+)
 create mode 100644 scripts/tune_probabilities.py

diff --git a/scripts/tune_probabilities.py b/scripts/tune_probabilities.py
new file mode 100644
index 0000000000..e3919a4d61
--- /dev/null
+++ b/scripts/tune_probabilities.py
@@ -0,0 +1,133 @@
+"""Update JSON with probabilities
+
+This script updates the JSON file with the probabilistic context-free grammar
+to avoid having production rules with very small probabilities. The counts are
+normalized through a sigmoid with temperature. For high temperatures, the
+distribution will be more spread out. However, the relative order between
+probabilities for a production rule will be respected during normalization.
+
+The counts for a production rule will only be normalized if the log of the
+maximum count divided by the minimum count for this rule is greater than a
+constant threshold, which can be set as an argument.
+
+Usage:
+    tune_probabilities.py [-h] [--output_file OUTPUT_FILE]
+                          [--threshold THRESHOLD] [--temp TEMP]
+                          probabilities_path
+
+Arguments:
+    probabilities_path
+        (str) Path to the JSON with probabilities.
+    --output_file OUTPUT_FILE
+        (str) Output path for the JSON with updated probabilities (default: ./updated_probabilities.json).
+    --threshold THRESHOLD, -c THRESHOLD
+        (int) Threshold that defines whether the counts for a rule will be normalized (default: 3).
+    --temp TEMP
+        (float) Temperature for the normalization function (default: 1).
+    -h, --help
+        show help message and exit
+"""
+
+import argparse
+import json
+import numpy as np
+import numpy.typing as npt
+import os
+import sys
+
+
+def convert_float_array_to_int(normalized_counts: npt.NDArray) -> npt.NDArray:
+    min_exponent = abs(np.floor(np.log10(normalized_counts)).min())
+    # The base exponent is used to guarantee that counts won't be too small
+    base_exponent = 3
+    scaling_factor = 10 ** (min_exponent + base_exponent)
+    return np.round(normalized_counts * scaling_factor).astype(int).tolist()
+
+
+def z_score_scaling(counts: npt.NDArray) -> npt.NDArray:
+    return (counts - np.mean(counts)) / np.std(counts)
+
+
+def sigmoid(counts: npt.NDArray, temperature: float) -> npt.NDArray:
+    counts = z_score_scaling(counts)
+    counts = counts / temperature
+    return 1 / (1 + np.exp(-counts))
+
+
+def update_counts(probabilities: dict, threshold: int, temperature: float) -> list:
+    updated_rules = []
+    for rule, productions in probabilities.items():
+        counts = np.array(list(productions.values()))
+        max_count = counts.max()
+        min_count = counts.min()
+
+        if np.log2(max_count / min_count) > threshold:
+            updated_rules.append(rule)
+            normalized_counts = convert_float_array_to_int(sigmoid(counts, temperature))
+            for key, count in zip(productions.keys(), normalized_counts):
+                productions[key] = count
+
+    return updated_rules
+
+
+def read_json(probabilities_path: str) -> dict:
+    with open(probabilities_path, "r", encoding="utf-8") as input_json:
+        json_content = input_json.read()
+
+    return json.loads(json_content if json_content != "" else "{}")
+
+
+def save_json(probabilities: dict, output_file: str) -> None:
+    with open(output_file, "w", encoding="utf-8") as file:
+        json.dump(probabilities, file)
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "probabilities_path",
+        type=str,
+        help="(%(type)s) Path to the JSON with probabilities.",
+    )
+    parser.add_argument(
+        "--output_file",
+        type=str,
+        help="(%(type)s) Output path for the JSON with updated probabilities (default: %(default)s).",
+        default="./updated_probabilities.json",
+    )
+    parser.add_argument(
+        "--threshold",
+        "-c",
+        type=int,
+        help="(%(type)s) Threshold that defines whether the counts for a rule will be normalized (default: %(default)s).",
+        default=3,
+    )
+    parser.add_argument(
+        "--temp",
+        type=float,
+        help="(%(type)s) Temperature for the normalization function (default: %(default)s).",
+        default=1,
+    )
+    args = parser.parse_args()
+
+    if not os.path.isfile(args.probabilities_path):
+        print(f"Invalid file '{args.probabilities_path}'.", file=sys.stderr)
+        sys.exit(1)
+
+    return args
+
+
+def main() -> None:
+    args = parse_args()
+
+    probabilities = read_json(args.probabilities_path)
+    updated_rules = update_counts(probabilities, args.threshold, args.temp)
+    save_json(probabilities, args.output_file)
+
+    print("Updated production rules:")
+    for rule in updated_rules:
+        print(rule)
+
+
+if __name__ == "__main__":
+    main()
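
For intuition, a minimal sketch of what this patch's normalization does to a skewed count distribution. The counts and the temperature are made up, and it assumes tune_probabilities.py is importable from the working directory:

    import numpy as np

    from tune_probabilities import convert_float_array_to_int, sigmoid

    # Hypothetical counts for one production rule; real values come from the JSON
    counts = np.array([2, 150, 98000])

    # z-score, then sigmoid: roughly [0.412, 0.413, 0.670] at temperature 2.0,
    # so the relative order survives but the spread collapses
    normalized = sigmoid(counts, temperature=2.0)

    # Scaled back to integers: roughly [4123, 4127, 6698]; the 49000x gap
    # between the smallest and largest count shrinks to about 1.6x
    print(convert_float_array_to_int(normalized))
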
"probabilities_path", + type=str, + help="(%(type)s) Path to the JSON with probabilities.", + ) + parser.add_argument( + "--output_file", + type=str, + help="(%(type)s) Output path for the JSON with updated probabilities (default: %(default)s).", + default="./updated_probabilities.json", + ) + parser.add_argument( + "--threshold", + "-c", + type=int, + help="(%(type)s) Threshold that defines if the counts for rule will be normalized (default: %(default)s).", + default=3, + ) + parser.add_argument( + "--temp", + type=float, + help="(%(type)s) Temperature for the normalization function (default: %(default)s).", + default=1, + ) + args = parser.parse_args() + + if not os.path.isfile(args.probabilities_path): + print(f"Invalid file '{args.probabilities_path}'.", file=sys.stderr) + exit(1) + + return args + + +def main() -> None: + args = parse_args() + + probabilities = read_json(args.probabilities_path) + updated_rules = update_counts(probabilities, args.threshold, args.temp) + save_json(probabilities, args.output_file) + + print("Updated production rules:") + for rule in updated_rules: + print(rule) + + +if __name__ == "__main__": + main() From 473fa43419eac1a52364ad3e434f7c92cf64e8e1 Mon Sep 17 00:00:00 2001 From: rafasumi Date: Sat, 30 Nov 2024 16:45:58 -0300 Subject: [PATCH 2/3] Fix typing in tune_probabilities.py --- scripts/tune_probabilities.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/tune_probabilities.py b/scripts/tune_probabilities.py index e3919a4d61..84353c68cc 100644 --- a/scripts/tune_probabilities.py +++ b/scripts/tune_probabilities.py @@ -36,7 +36,7 @@ import sys -def convert_float_array_to_int(normalized_counts: npt.NDArray) -> npt.NDArray: +def convert_float_array_to_int(normalized_counts: npt.NDArray) -> list: min_exponent = abs(np.floor(np.log10(normalized_counts)).min()) # The base exponent is used to guarantee that counts won't be too small base_exponent = 3 From 67c45b05c16b861f6f96fb47524c8d09a310e2f5 Mon Sep 17 00:00:00 2001 From: rafasumi Date: Wed, 4 Dec 2024 19:52:58 -0300 Subject: [PATCH 3/3] Adjustments to tuning script --- scripts/tune_probabilities.py | 38 ++++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/scripts/tune_probabilities.py b/scripts/tune_probabilities.py index 84353c68cc..513a4ac74e 100644 --- a/scripts/tune_probabilities.py +++ b/scripts/tune_probabilities.py @@ -28,6 +28,7 @@ show help message and exit """ +# 2572 import argparse import json import numpy as np @@ -54,18 +55,33 @@ def sigmoid(counts: npt.NDArray, temperature: float) -> npt.NDArray: return 1 / (1 + np.exp(-counts)) -def update_counts(probabilities: dict, threshold: int, temperature: float) -> list: +def update_counts( + probabilities: dict, threshold: int, temperature: float, large_diff: int +) -> list: updated_rules = [] for rule, productions in probabilities.items(): + if len(productions.values()) < 2: + continue + counts = np.array(list(productions.values())) + more_than_one_production = " " in list(productions.keys())[counts.argmin()] + + if more_than_one_production: + continue + print(list(productions.keys())[counts.argmin()]) + max_count = counts.max() min_count = counts.min() - if np.log2(max_count / min_count) > threshold: - updated_rules.append(rule) - normalized_counts = convert_float_array_to_int(sigmoid(counts, temperature)) - for key, count in zip(productions.keys(), normalized_counts): - productions[key] = count + diff = np.log10(max_count / min_count) + if diff < threshold: + continue + + 
From 67c45b05c16b861f6f96fb47524c8d09a310e2f5 Mon Sep 17 00:00:00 2001
From: rafasumi
Date: Wed, 4 Dec 2024 19:52:58 -0300
Subject: [PATCH 3/3] Adjustments to tuning script

---
 scripts/tune_probabilities.py | 37 +++++++++++++++++++++++++++-------
 1 file changed, 30 insertions(+), 7 deletions(-)

diff --git a/scripts/tune_probabilities.py b/scripts/tune_probabilities.py
index 84353c68cc..513a4ac74e 100644
--- a/scripts/tune_probabilities.py
+++ b/scripts/tune_probabilities.py
@@ -54,18 +54,33 @@ def sigmoid(counts: npt.NDArray, temperature: float) -> npt.NDArray:
     return 1 / (1 + np.exp(-counts))
 
 
-def update_counts(probabilities: dict, threshold: int, temperature: float) -> list:
+def update_counts(
+    probabilities: dict, threshold: int, temperature: float, large_diff: int
+) -> list:
     updated_rules = []
     for rule, productions in probabilities.items():
+        if len(productions.values()) < 2:
+            continue
+
         counts = np.array(list(productions.values()))
+        # Skip rules whose least frequent production has more than one symbol
+        more_than_one_production = " " in list(productions.keys())[counts.argmin()]
+
+        if more_than_one_production:
+            continue
+
         max_count = counts.max()
         min_count = counts.min()
 
-        if np.log2(max_count / min_count) > threshold:
-            updated_rules.append(rule)
-            normalized_counts = convert_float_array_to_int(sigmoid(counts, temperature))
-            for key, count in zip(productions.keys(), normalized_counts):
-                productions[key] = count
+        diff = np.log10(max_count / min_count)
+        if diff < threshold:
+            continue
+
+        updated_rules.append(rule)
+        normalized_counts = convert_float_array_to_int(sigmoid(counts, temperature))
+
+        for key, count in zip(productions.keys(), normalized_counts):
+            productions[key] = count
 
     return updated_rules
 
@@ -108,6 +123,12 @@ def parse_args() -> argparse.Namespace:
         help="(%(type)s) Temperature for the normalization function (default: %(default)s).",
         default=1,
     )
+    parser.add_argument(
+        "--large_diff",
+        type=int,
+        help="(%(type)s) Sets the large proportional difference, in log scale (default: %(default)s).",
+        default=8,
+    )
     args = parser.parse_args()
 
     if not os.path.isfile(args.probabilities_path):
@@ -121,7 +142,9 @@ def main() -> None:
     args = parse_args()
 
     probabilities = read_json(args.probabilities_path)
-    updated_rules = update_counts(probabilities, args.threshold, args.temp)
+    updated_rules = update_counts(
+        probabilities, args.threshold, args.temp, args.large_diff
+    )
     save_json(probabilities, args.output_file)
 
     print("Updated production rules:")
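
Taken together, a hypothetical end-to-end run of the tuned script; the grammar, counts, and file names below are invented for illustration:

    import json
    import subprocess

    # One count dwarfs the others: log10(51234 / 3) ~ 4.2 exceeds the default
    # threshold of 3, and the least frequent production "NUMBER" is a single
    # symbol, so the rule "expr" qualifies for tuning
    grammar = {"expr": {"expr + expr": 51234, "NUMBER": 3, "IDENTIFIER": 47}}
    with open("probabilities.json", "w", encoding="utf-8") as f:
        json.dump(grammar, f)

    subprocess.run(
        ["python", "scripts/tune_probabilities.py", "probabilities.json"],
        check=True,
    )

    # The script prints "Updated production rules:" followed by "expr" and
    # writes the rescaled counts to ./updated_probabilities.json
    with open("updated_probabilities.json", encoding="utf-8") as f:
        print(json.load(f))
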