From e45f23993971badffc5272b71acf2f82772b6f2c Mon Sep 17 00:00:00 2001
From: rafasumi
Date: Sat, 30 Nov 2024 16:38:48 -0300
Subject: [PATCH 1/3] Add script to tune probabilities.

---
 scripts/tune_probabilities.py | 133 ++++++++++++++++++++++++++++++++++
 1 file changed, 133 insertions(+)
 create mode 100644 scripts/tune_probabilities.py

diff --git a/scripts/tune_probabilities.py b/scripts/tune_probabilities.py
new file mode 100644
index 0000000000..e3919a4d61
--- /dev/null
+++ b/scripts/tune_probabilities.py
@@ -0,0 +1,133 @@
+"""Update JSON with probabilities
+
+This script updates the JSON file with the probabilistic context-free grammar
+to avoid having production rules with very small probabilities. The counts are
+normalized through a sigmoid with temperature. For high temperatures, the
+distribution will be more spread out. However, the relative order between
+probabilities for a production rule will be respected during normalization.
+
+The counts for a production rule will only be normalized if the log of the
+maximum count divided by the minimum count for this rule is greater than a
+constant threshold, which can be set as an argument.
+
+Usage:
+    tune_probabilities.py [-h] [--output_file OUTPUT_FILE]
+                          [--threshold THRESHOLD] [--temp TEMP]
+                          probabilities_path
+
+Arguments:
+    probabilities_path
+        (str) Path to the JSON with probabilities.
+    --output_file OUTPUT_FILE
+        (str) Output path for the JSON with updated probabilities (default: ./updated_probabilities.json).
+    --threshold THRESHOLD, -c THRESHOLD
+        (int) Threshold that defines whether the counts for a rule will be normalized (default: 3).
+    --temp TEMP
+        (float) Temperature for the normalization function (default: 1).
+    -h, --help
+        show help message and exit
+"""
+
+import argparse
+import json
+import numpy as np
+import numpy.typing as npt
+import os
+import sys
+
+
+def convert_float_array_to_int(normalized_counts: npt.NDArray) -> npt.NDArray:
+    min_exponent = abs(np.floor(np.log10(normalized_counts)).min())
+    # The base exponent is used to guarantee that counts won't be too small
+    base_exponent = 3
+    scaling_factor = 10 ** (min_exponent + base_exponent)
+    return np.round(normalized_counts * scaling_factor).astype(int).tolist()
+
+
+def z_score_scaling(counts: npt.NDArray) -> npt.NDArray:
+    return (counts - np.mean(counts)) / np.std(counts)
+
+
+def sigmoid(counts: npt.NDArray, temperature: float) -> npt.NDArray:
+    counts = z_score_scaling(counts)
+    counts = counts / temperature
+    return 1 / (1 + np.exp(-counts))
+
+
+def update_counts(probabilities: dict, threshold: int, temperature: float) -> list:
+    updated_rules = []
+    for rule, productions in probabilities.items():
+        counts = np.array(list(productions.values()))
+        max_count = counts.max()
+        min_count = counts.min()
+
+        if np.log2(max_count / min_count) > threshold:
+            updated_rules.append(rule)
+            normalized_counts = convert_float_array_to_int(sigmoid(counts, temperature))
+            for key, count in zip(productions.keys(), normalized_counts):
+                productions[key] = count
+
+    return updated_rules
+
+
+def read_json(probabilities_path: str) -> dict:
+    with open(probabilities_path, "r", encoding="utf-8") as input_json:
+        json_content = input_json.read()
+
+    return json.loads(json_content if json_content != "" else "{}")
+
+
+def save_json(probabilities: dict, output_file: str) -> None:
+    with open(output_file, "w", encoding="utf-8") as file:
+        json.dump(probabilities, file)
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "probabilities_path",
+        type=str,
+        help="(%(type)s) Path to the JSON with probabilities.",
+    )
+    parser.add_argument(
+        "--output_file",
+        type=str,
+        help="(%(type)s) Output path for the JSON with updated probabilities (default: %(default)s).",
+        default="./updated_probabilities.json",
+    )
+    parser.add_argument(
+        "--threshold",
+        "-c",
+        type=int,
+        help="(%(type)s) Threshold that defines whether the counts for a rule will be normalized (default: %(default)s).",
+        default=3,
+    )
+    parser.add_argument(
+        "--temp",
+        type=float,
+        help="(%(type)s) Temperature for the normalization function (default: %(default)s).",
+        default=1,
+    )
+    args = parser.parse_args()
+
+    if not os.path.isfile(args.probabilities_path):
+        print(f"Invalid file '{args.probabilities_path}'.", file=sys.stderr)
+        sys.exit(1)
+
+    return args
+
+
+def main() -> None:
+    args = parse_args()
+
+    probabilities = read_json(args.probabilities_path)
+    updated_rules = update_counts(probabilities, args.threshold, args.temp)
+    save_json(probabilities, args.output_file)
+
+    print("Updated production rules:")
+    for rule in updated_rules:
+        print(rule)
+
+
+if __name__ == "__main__":
+    main()
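
For intuition, a minimal sketch of what this patch's normalization does to a skewed count distribution. The counts and the temperature are made up, and it assumes tune_probabilities.py is importable from the working directory:

    import numpy as np

    from tune_probabilities import convert_float_array_to_int, sigmoid

    # Hypothetical counts for one production rule; real values come from the JSON
    counts = np.array([2, 150, 98000])

    # z-score, then sigmoid: roughly [0.412, 0.413, 0.670] at temperature 2.0,
    # so the relative order survives but the spread collapses
    normalized = sigmoid(counts, temperature=2.0)

    # Scaled back to integers: roughly [4123, 4127, 6698]; the 49000x gap
    # between the smallest and largest count shrinks to about 1.6x
    print(convert_float_array_to_int(normalized))
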
"probabilities_path", + type=str, + help="(%(type)s) Path to the JSON with probabilities.", + ) + parser.add_argument( + "--output_file", + type=str, + help="(%(type)s) Output path for the JSON with updated probabilities (default: %(default)s).", + default="./updated_probabilities.json", + ) + parser.add_argument( + "--threshold", + "-c", + type=int, + help="(%(type)s) Threshold that defines if the counts for rule will be normalized (default: %(default)s).", + default=3, + ) + parser.add_argument( + "--temp", + type=float, + help="(%(type)s) Temperature for the normalization function (default: %(default)s).", + default=1, + ) + args = parser.parse_args() + + if not os.path.isfile(args.probabilities_path): + print(f"Invalid file '{args.probabilities_path}'.", file=sys.stderr) + exit(1) + + return args + + +def main() -> None: + args = parse_args() + + probabilities = read_json(args.probabilities_path) + updated_rules = update_counts(probabilities, args.threshold, args.temp) + save_json(probabilities, args.output_file) + + print("Updated production rules:") + for rule in updated_rules: + print(rule) + + +if __name__ == "__main__": + main() From 473fa43419eac1a52364ad3e434f7c92cf64e8e1 Mon Sep 17 00:00:00 2001 From: rafasumi Date: Sat, 30 Nov 2024 16:45:58 -0300 Subject: [PATCH 2/3] Fix typing in tune_probabilities.py --- scripts/tune_probabilities.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/tune_probabilities.py b/scripts/tune_probabilities.py index e3919a4d61..84353c68cc 100644 --- a/scripts/tune_probabilities.py +++ b/scripts/tune_probabilities.py @@ -36,7 +36,7 @@ import sys -def convert_float_array_to_int(normalized_counts: npt.NDArray) -> npt.NDArray: +def convert_float_array_to_int(normalized_counts: npt.NDArray) -> list: min_exponent = abs(np.floor(np.log10(normalized_counts)).min()) # The base exponent is used to guarantee that counts won't be too small base_exponent = 3 From 67c45b05c16b861f6f96fb47524c8d09a310e2f5 Mon Sep 17 00:00:00 2001 From: rafasumi Date: Wed, 4 Dec 2024 19:52:58 -0300 Subject: [PATCH 3/3] Adjustments to tuning script --- scripts/tune_probabilities.py | 38 ++++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/scripts/tune_probabilities.py b/scripts/tune_probabilities.py index 84353c68cc..513a4ac74e 100644 --- a/scripts/tune_probabilities.py +++ b/scripts/tune_probabilities.py @@ -28,6 +28,7 @@ show help message and exit """ +# 2572 import argparse import json import numpy as np @@ -54,18 +55,33 @@ def sigmoid(counts: npt.NDArray, temperature: float) -> npt.NDArray: return 1 / (1 + np.exp(-counts)) -def update_counts(probabilities: dict, threshold: int, temperature: float) -> list: +def update_counts( + probabilities: dict, threshold: int, temperature: float, large_diff: int +) -> list: updated_rules = [] for rule, productions in probabilities.items(): + if len(productions.values()) < 2: + continue + counts = np.array(list(productions.values())) + more_than_one_production = " " in list(productions.keys())[counts.argmin()] + + if more_than_one_production: + continue + print(list(productions.keys())[counts.argmin()]) + max_count = counts.max() min_count = counts.min() - if np.log2(max_count / min_count) > threshold: - updated_rules.append(rule) - normalized_counts = convert_float_array_to_int(sigmoid(counts, temperature)) - for key, count in zip(productions.keys(), normalized_counts): - productions[key] = count + diff = np.log10(max_count / min_count) + if diff < threshold: + continue + + 
From 67c45b05c16b861f6f96fb47524c8d09a310e2f5 Mon Sep 17 00:00:00 2001
From: rafasumi
Date: Wed, 4 Dec 2024 19:52:58 -0300
Subject: [PATCH 3/3] Adjustments to tuning script

---
 scripts/tune_probabilities.py | 37 +++++++++++++++++++++++++++-------
 1 file changed, 30 insertions(+), 7 deletions(-)

diff --git a/scripts/tune_probabilities.py b/scripts/tune_probabilities.py
index 84353c68cc..513a4ac74e 100644
--- a/scripts/tune_probabilities.py
+++ b/scripts/tune_probabilities.py
@@ -54,18 +54,33 @@ def sigmoid(counts: npt.NDArray, temperature: float) -> npt.NDArray:
     return 1 / (1 + np.exp(-counts))
 
 
-def update_counts(probabilities: dict, threshold: int, temperature: float) -> list:
+def update_counts(
+    probabilities: dict, threshold: int, temperature: float, large_diff: int
+) -> list:
     updated_rules = []
     for rule, productions in probabilities.items():
+        if len(productions.values()) < 2:
+            continue
+
         counts = np.array(list(productions.values()))
+        # Skip rules whose least frequent production has more than one symbol
+        more_than_one_production = " " in list(productions.keys())[counts.argmin()]
+
+        if more_than_one_production:
+            continue
+
         max_count = counts.max()
         min_count = counts.min()
 
-        if np.log2(max_count / min_count) > threshold:
-            updated_rules.append(rule)
-            normalized_counts = convert_float_array_to_int(sigmoid(counts, temperature))
-            for key, count in zip(productions.keys(), normalized_counts):
-                productions[key] = count
+        diff = np.log10(max_count / min_count)
+        if diff < threshold:
+            continue
+
+        updated_rules.append(rule)
+        normalized_counts = convert_float_array_to_int(sigmoid(counts, temperature))
+
+        for key, count in zip(productions.keys(), normalized_counts):
+            productions[key] = count
 
     return updated_rules
 
@@ -108,6 +123,12 @@ def parse_args() -> argparse.Namespace:
         help="(%(type)s) Temperature for the normalization function (default: %(default)s).",
         default=1,
     )
+    parser.add_argument(
+        "--large_diff",
+        type=int,
+        help="(%(type)s) Sets the large proportional difference, in log scale (default: %(default)s).",
+        default=8,
+    )
     args = parser.parse_args()
 
     if not os.path.isfile(args.probabilities_path):
@@ -121,7 +142,9 @@ def main() -> None:
     args = parse_args()
 
     probabilities = read_json(args.probabilities_path)
-    updated_rules = update_counts(probabilities, args.threshold, args.temp)
+    updated_rules = update_counts(
+        probabilities, args.threshold, args.temp, args.large_diff
+    )
     save_json(probabilities, args.output_file)
 
     print("Updated production rules:")
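
Taken together, a hypothetical end-to-end run of the tuned script; the grammar, counts, and file names below are invented for illustration:

    import json
    import subprocess

    # One count dwarfs the others: log10(51234 / 3) ~ 4.2 exceeds the default
    # threshold of 3, and the least frequent production "NUMBER" is a single
    # symbol, so the rule "expr" qualifies for tuning
    grammar = {"expr": {"expr + expr": 51234, "NUMBER": 3, "IDENTIFIER": 47}}
    with open("probabilities.json", "w", encoding="utf-8") as f:
        json.dump(grammar, f)

    subprocess.run(
        ["python", "scripts/tune_probabilities.py", "probabilities.json"],
        check=True,
    )

    # The script prints "Updated production rules:" followed by "expr" and
    # writes the rescaled counts to ./updated_probabilities.json
    with open("updated_probabilities.json", encoding="utf-8") as f:
        print(json.load(f))
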