From 0b3e5fb585060c67b9a155fe5c8036d0f13ea5d7 Mon Sep 17 00:00:00 2001
From: Kevin Ramirez <kvnzavalza@hotmail.com>
Date: Tue, 13 Feb 2024 11:14:34 -0600
Subject: [PATCH 01/11] fix(columbia_merger): exclude combined opinions from
 cluster

---
 .../management/commands/columbia_merge.py     | 64 +++++++++----------
 1 file changed, 32 insertions(+), 32 deletions(-)

diff --git a/cl/corpus_importer/management/commands/columbia_merge.py b/cl/corpus_importer/management/commands/columbia_merge.py
index ee280e12f6..fcf44cedb4 100644
--- a/cl/corpus_importer/management/commands/columbia_merge.py
+++ b/cl/corpus_importer/management/commands/columbia_merge.py
@@ -79,7 +79,6 @@
     Docket.COLUMBIA_AND_RECAP_AND_SCRAPER_AND_IDB_AND_HARVARD,
 ]
 
-
 VALID_MERGED_SOURCES = [
     key
     for key in dict(SOURCES.NAMES).keys()
@@ -120,7 +119,8 @@ def get_cl_opinion_content(cluster_id: int) -> list[dict[Any, Any]]:
     :return: list with opinion content from cl
     """
     cl_cleaned_opinions = []
-    opinions_from_cluster = Opinion.objects.filter(cluster_id=cluster_id)
+    opinions_from_cluster = Opinion.objects.filter(cluster_id=cluster_id).exclude(
+        type="010combined")
     is_harvard = False
 
     for i, op in enumerate(opinions_from_cluster):
@@ -153,7 +153,7 @@ def get_cl_opinion_content(cluster_id: int) -> list[dict[Any, Any]]:
 
 
 def update_matching_opinions(
-    matches: dict, cl_cleaned_opinions: list, columbia_opinions: list
+        matches: dict, cl_cleaned_opinions: list, columbia_opinions: list
 ) -> None:
     """Store matching opinion content in html_columbia field from Opinion object
 
@@ -183,8 +183,8 @@ def update_matching_opinions(
         else:
             if author_str:
                 if (
-                    find_just_name(op.author_str).lower()
-                    != find_just_name(author_str).lower()
+                        find_just_name(op.author_str).lower()
+                        != find_just_name(author_str).lower()
                 ):
                     # last resort, use distance between words to solve typos
                     s = SequenceMatcher(
@@ -209,8 +209,8 @@ def update_matching_opinions(
 
 
 def map_and_merge_opinions(
-    cluster_id: int,
-    columbia_opinions: list[dict],
+        cluster_id: int,
+        columbia_opinions: list[dict],
 ) -> None:
     """Map and merge opinion data
 
@@ -275,7 +275,7 @@ def map_and_merge_opinions(
 
 
 def merge_date_filed(
-    cluster: OpinionCluster, columbia_data: dict
+        cluster: OpinionCluster, columbia_data: dict
 ) -> dict[str, Any]:
     """Merge date filed
 
@@ -318,10 +318,10 @@ def update_cluster_source(cluster: OpinionCluster) -> None:
 
 
 def merge_field(
-    cluster: OpinionCluster,
-    file_value: Optional[str],
-    field_name: str,
-    skip_judge_merger: bool = False,
+        cluster: OpinionCluster,
+        file_value: Optional[str],
+        field_name: str,
+        skip_judge_merger: bool = False,
 ) -> dict:
     """Try to merge the cluster data and file field data
 
@@ -373,8 +373,8 @@ def merge_docket_data(docket_data: dict, cluster: OpinionCluster) -> None:
         merge_docket_numbers(cluster, docket_data["docket_number"])
         cluster.docket.refresh_from_db()
     if (
-        docket_data["date_cert_granted"]
-        and not cluster.docket.date_cert_granted
+            docket_data["date_cert_granted"]
+            and not cluster.docket.date_cert_granted
     ):
         data_to_update["date_cert_granted"] = docket_data["date_cert_granted"]
 
@@ -388,8 +388,8 @@ def merge_docket_data(docket_data: dict, cluster: OpinionCluster) -> None:
         data_to_update["date_reargued"] = docket_data["date_reargued"]
 
     if (
-        docket_data["date_reargument_denied"]
-        and not cluster.docket.date_reargument_denied
+            docket_data["date_reargument_denied"]
+            and not cluster.docket.date_reargument_denied
     ):
         data_to_update["date_reargument_denied"] = docket_data[
             "date_reargument_denied"
@@ -400,9 +400,9 @@ def merge_docket_data(docket_data: dict, cluster: OpinionCluster) -> None:
 
 
 def process_cluster(
-    cluster_id: int,
-    filepath: str,
-    skip_judge_merger: bool = False,
+        cluster_id: int,
+        filepath: str,
+        skip_judge_merger: bool = False,
 ) -> None:
     """Merge specified cluster id
 
@@ -473,14 +473,14 @@ def process_cluster(
         k: v
         for k, v in columbia_data.items()
         if k
-        in [
-            "docket_number",
-            "date_cert_granted",
-            "date_cert_denied",
-            "date_argued",
-            "date_reargued",
-            "date_reargument_denied",
-        ]
+           in [
+               "docket_number",
+               "date_cert_granted",
+               "date_cert_denied",
+               "date_argued",
+               "date_reargued",
+               "date_reargument_denied",
+           ]
     }
 
     try:
@@ -491,10 +491,10 @@ def process_cluster(
             for field in ["syllabus", "attorneys", "posture", "judges"]:
                 columbia_value = columbia_data.get(field)
                 if data := merge_field(
-                    cluster,
-                    columbia_value,
-                    field,
-                    skip_judge_merger=skip_judge_merger,
+                        cluster,
+                        columbia_value,
+                        field,
+                        skip_judge_merger=skip_judge_merger,
                 ):
                     merged_data.update(data)
 
@@ -515,7 +515,7 @@ def process_cluster(
 
             # Merge results into a single dict
             data_to_update = (
-                merged_data | case_names_to_update | date_filed_to_update
+                    merged_data | case_names_to_update | date_filed_to_update
             )
 
             if data_to_update:

From efc3931cc84a3e4e980270bc8d5bdd984b3fc967 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 16 Feb 2024 19:47:09 +0000
Subject: [PATCH 02/11] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 .../management/commands/columbia_merge.py     | 65 ++++++++++---------
 1 file changed, 33 insertions(+), 32 deletions(-)

diff --git a/cl/corpus_importer/management/commands/columbia_merge.py b/cl/corpus_importer/management/commands/columbia_merge.py
index fcf44cedb4..f7e827b01e 100644
--- a/cl/corpus_importer/management/commands/columbia_merge.py
+++ b/cl/corpus_importer/management/commands/columbia_merge.py
@@ -119,8 +119,9 @@ def get_cl_opinion_content(cluster_id: int) -> list[dict[Any, Any]]:
     :return: list with opinion content from cl
     """
     cl_cleaned_opinions = []
-    opinions_from_cluster = Opinion.objects.filter(cluster_id=cluster_id).exclude(
-        type="010combined")
+    opinions_from_cluster = Opinion.objects.filter(
+        cluster_id=cluster_id
+    ).exclude(type="010combined")
     is_harvard = False
 
     for i, op in enumerate(opinions_from_cluster):
@@ -153,7 +154,7 @@ def get_cl_opinion_content(cluster_id: int) -> list[dict[Any, Any]]:
 
 
 def update_matching_opinions(
-        matches: dict, cl_cleaned_opinions: list, columbia_opinions: list
+    matches: dict, cl_cleaned_opinions: list, columbia_opinions: list
 ) -> None:
     """Store matching opinion content in html_columbia field from Opinion object
 
@@ -183,8 +184,8 @@ def update_matching_opinions(
         else:
             if author_str:
                 if (
-                        find_just_name(op.author_str).lower()
-                        != find_just_name(author_str).lower()
+                    find_just_name(op.author_str).lower()
+                    != find_just_name(author_str).lower()
                 ):
                     # last resort, use distance between words to solve typos
                     s = SequenceMatcher(
@@ -209,8 +210,8 @@ def update_matching_opinions(
 
 
 def map_and_merge_opinions(
-        cluster_id: int,
-        columbia_opinions: list[dict],
+    cluster_id: int,
+    columbia_opinions: list[dict],
 ) -> None:
     """Map and merge opinion data
 
@@ -275,7 +276,7 @@ def map_and_merge_opinions(
 
 
 def merge_date_filed(
-        cluster: OpinionCluster, columbia_data: dict
+    cluster: OpinionCluster, columbia_data: dict
 ) -> dict[str, Any]:
     """Merge date filed
 
@@ -318,10 +319,10 @@ def update_cluster_source(cluster: OpinionCluster) -> None:
 
 
 def merge_field(
-        cluster: OpinionCluster,
-        file_value: Optional[str],
-        field_name: str,
-        skip_judge_merger: bool = False,
+    cluster: OpinionCluster,
+    file_value: Optional[str],
+    field_name: str,
+    skip_judge_merger: bool = False,
 ) -> dict:
     """Try to merge the cluster data and file field data
 
@@ -373,8 +374,8 @@ def merge_docket_data(docket_data: dict, cluster: OpinionCluster) -> None:
         merge_docket_numbers(cluster, docket_data["docket_number"])
         cluster.docket.refresh_from_db()
     if (
-            docket_data["date_cert_granted"]
-            and not cluster.docket.date_cert_granted
+        docket_data["date_cert_granted"]
+        and not cluster.docket.date_cert_granted
     ):
         data_to_update["date_cert_granted"] = docket_data["date_cert_granted"]
 
@@ -388,8 +389,8 @@ def merge_docket_data(docket_data: dict, cluster: OpinionCluster) -> None:
         data_to_update["date_reargued"] = docket_data["date_reargued"]
 
     if (
-            docket_data["date_reargument_denied"]
-            and not cluster.docket.date_reargument_denied
+        docket_data["date_reargument_denied"]
+        and not cluster.docket.date_reargument_denied
     ):
         data_to_update["date_reargument_denied"] = docket_data[
             "date_reargument_denied"
@@ -400,9 +401,9 @@ def merge_docket_data(docket_data: dict, cluster: OpinionCluster) -> None:
 
 
 def process_cluster(
-        cluster_id: int,
-        filepath: str,
-        skip_judge_merger: bool = False,
+    cluster_id: int,
+    filepath: str,
+    skip_judge_merger: bool = False,
 ) -> None:
     """Merge specified cluster id
 
@@ -473,14 +474,14 @@ def process_cluster(
         k: v
         for k, v in columbia_data.items()
         if k
-           in [
-               "docket_number",
-               "date_cert_granted",
-               "date_cert_denied",
-               "date_argued",
-               "date_reargued",
-               "date_reargument_denied",
-           ]
+        in [
+            "docket_number",
+            "date_cert_granted",
+            "date_cert_denied",
+            "date_argued",
+            "date_reargued",
+            "date_reargument_denied",
+        ]
     }
 
     try:
@@ -491,10 +492,10 @@ def process_cluster(
             for field in ["syllabus", "attorneys", "posture", "judges"]:
                 columbia_value = columbia_data.get(field)
                 if data := merge_field(
-                        cluster,
-                        columbia_value,
-                        field,
-                        skip_judge_merger=skip_judge_merger,
+                    cluster,
+                    columbia_value,
+                    field,
+                    skip_judge_merger=skip_judge_merger,
                 ):
                     merged_data.update(data)
 
@@ -515,7 +516,7 @@ def process_cluster(
 
             # Merge results into a single dict
             data_to_update = (
-                    merged_data | case_names_to_update | date_filed_to_update
+                merged_data | case_names_to_update | date_filed_to_update
             )
 
             if data_to_update:

From f9d4b87736a6e97ac2b5356b627af1a95c65a11b Mon Sep 17 00:00:00 2001
From: Kevin Ramirez <kvnzavalza@hotmail.com>
Date: Fri, 16 Feb 2024 14:03:39 -0600
Subject: [PATCH 03/11] tests(columbia_merger): update tests

---
 cl/corpus_importer/tests.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cl/corpus_importer/tests.py b/cl/corpus_importer/tests.py
index e5726e4a13..c2e0491b04 100644
--- a/cl/corpus_importer/tests.py
+++ b/cl/corpus_importer/tests.py
@@ -3235,7 +3235,7 @@ def test_merger(self):
             docket=DocketFactory(source=Docket.HARVARD),
             sub_opinions__data=[
                 {
-                    "type": "010combined",
+                    "type": "020lead",
                     "xml_harvard": "<p>Lorem ipsum dolor sit amet, consectetur "
                     "adipiscing elit. Nullam quis elit sed dui "
                     "interdum feugiat.</p>",

From 6be40ee8ddf0ef56d527628b36faff424b789226 Mon Sep 17 00:00:00 2001
From: Kevin Ramirez <kvnzavalza@hotmail.com>
Date: Tue, 20 Feb 2024 14:40:33 -0600
Subject: [PATCH 04/11] fix(columbia_merger): improve date extraction

---
 .../import_columbia/columbia_utils.py            | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/cl/corpus_importer/import_columbia/columbia_utils.py b/cl/corpus_importer/import_columbia/columbia_utils.py
index a862788c36..5d61c99b0e 100644
--- a/cl/corpus_importer/import_columbia/columbia_utils.py
+++ b/cl/corpus_importer/import_columbia/columbia_utils.py
@@ -29,6 +29,22 @@
     "affirmed and opinion filed",
     "dismissed and opinion filed",
     "decided and entered",
+    "memorandum opinion filed",
+    "memorandum opinion delivered and filed",
+    "granted",
+    "affirmed",
+    "submitted and decided",
+    "affirmed and memorandum opinion filed",
+    "memorandum filed",
+    "modified opinion filed",
+    "opinion modified and refiled",
+    "opinion filed on",
+    "opinion on merits filed",
+    "opinion delivered and filed on",
+    "order delivered and filed",
+    "date filed",
+    "opinion filed in",
+    "affirmed opinion filed",
 ]
 DECIDED_TAGS = ["decided", "date decided", "decided on", "decided date"]
 ARGUED_TAGS = [

From 65d89f46f879b34199fd66eb1e2637e6cd16daea Mon Sep 17 00:00:00 2001
From: Kevin Ramirez <kvnzavalza@hotmail.com>
Date: Mon, 26 Feb 2024 12:15:51 -0600
Subject: [PATCH 05/11] fix(columbia_merger): handle invalid citation volume

---
 .../management/commands/columbia_merge.py     |   6 +-
 cl/corpus_importer/utils.py                   | 136 ++++++++++--------
 2 files changed, 79 insertions(+), 63 deletions(-)

diff --git a/cl/corpus_importer/management/commands/columbia_merge.py b/cl/corpus_importer/management/commands/columbia_merge.py
index f7e827b01e..f8e0988360 100644
--- a/cl/corpus_importer/management/commands/columbia_merge.py
+++ b/cl/corpus_importer/management/commands/columbia_merge.py
@@ -54,7 +54,7 @@
     merge_judges,
     merge_long_fields,
     merge_strings,
-    update_cluster_panel,
+    update_cluster_panel, CitationException,
 )
 from cl.lib.command_utils import VerboseCommand, logger
 from cl.people_db.lookup_utils import extract_judge_last_name, find_just_name
@@ -215,6 +215,8 @@ def map_and_merge_opinions(
 ) -> None:
     """Map and merge opinion data
 
+    # TODO handle combined opinions
+
     :param cluster_id: Cluster id
     :param columbia_opinions: list of columbia opinions from file
     :return: None
@@ -545,6 +547,8 @@ def process_cluster(
         )
     except JudgeException:
         logger.warning(msg=f"Judge exception for cluster id: {cluster_id}")
+    except CitationException:
+        logger.warning(msg=f"Invalid citation found in {filepath } while merging cluster id: {cluster_id}")
 
 
 def merge_columbia_into_cl(options) -> None:
diff --git a/cl/corpus_importer/utils.py b/cl/corpus_importer/utils.py
index 98f73dfe84..d1a13c8c51 100644
--- a/cl/corpus_importer/utils.py
+++ b/cl/corpus_importer/utils.py
@@ -82,6 +82,13 @@ def __init__(self, message: str) -> None:
         self.message = message
 
 
+class CitationException(Exception):
+    """Error found in cite."""
+
+    def __init__(self, message: str) -> None:
+        self.message = message
+
+
 async def mark_ia_upload_needed(d: Docket, save_docket: bool) -> None:
     """Mark the docket as needing upload if it's not already marked.
 
@@ -143,9 +150,9 @@ def filter_subsets(lists: list[list[int]]) -> Iterator[list[int]]:
 
     for match in lists:
         if not any(
-            is_subset(match, other_matches)
-            for other_matches in lists
-            if match is not other_matches
+                is_subset(match, other_matches)
+                for other_matches in lists
+                if match is not other_matches
         ):
             yield match
 
@@ -214,7 +221,7 @@ def compare_documents(file_characters: str, cl_characters: str) -> int:
 
 
 def similarity_scores(
-    texts_to_compare_1: list[str], texts_to_compare_2: list[str]
+        texts_to_compare_1: list[str], texts_to_compare_2: list[str]
 ) -> list[list[float]]:
     """Get similarity scores between two sets of lists
 
@@ -239,13 +246,13 @@ def similarity_scores(
 
     # Calculate cosine similarity between weight of words for each text in list
     scores = cosine_similarity(
-        X[: len(texts_to_compare_1)], X[len(texts_to_compare_1) :]
+        X[: len(texts_to_compare_1)], X[len(texts_to_compare_1):]
     )
     return scores
 
 
 def match_opinion_lists(
-    file_opinions_list: list[Any], cl_opinions_list: list[Any]
+        file_opinions_list: list[Any], cl_opinions_list: list[Any]
 ) -> dict[int, int]:
     """Match opinions on two lists from different sources
 
@@ -322,7 +329,7 @@ def clean_docket_number(docket_number: str) -> str:
 
 
 def merge_docket_numbers(
-    cluster: OpinionCluster, docket_number: str
+        cluster: OpinionCluster, docket_number: str
 ) -> Optional[str]:
     """Merge docket number
 
@@ -338,8 +345,8 @@ def merge_docket_numbers(
         # e.g. CL docket id #3952066 doesn't have
         cl_clean_docket = clean_docket_number(cl_docket.docket_number)
         if (
-            cl_clean_docket in file_cleaned_docket
-            and cl_docket.docket_number != file_cleaned_docket
+                cl_clean_docket in file_cleaned_docket
+                and cl_docket.docket_number != file_cleaned_docket
         ):
             return file_cleaned_docket
         else:
@@ -359,10 +366,10 @@ def merge_docket_numbers(
 
 
 def merge_case_names(
-    cluster: OpinionCluster,
-    file_data: dict[str, Any],
-    case_name_key: str,
-    case_name_full_key: str,
+        cluster: OpinionCluster,
+        file_data: dict[str, Any],
+        case_name_key: str,
+        case_name_full_key: str,
 ) -> dict[str, Any]:
     """Merge case names
 
@@ -418,7 +425,7 @@ def merge_case_names(
 
 
 def merge_strings(
-    field_name: str, overlapping_data: tuple[str, str]
+        field_name: str, overlapping_data: tuple[str, str]
 ) -> dict[str, Any]:
     """Compare two strings and choose the largest
 
@@ -437,9 +444,9 @@ def merge_strings(
 
 
 def merge_long_fields(
-    field_name: str,
-    overlapping_data: Optional[tuple[str, str]],
-    cluster_id: int,
+        field_name: str,
+        overlapping_data: Optional[tuple[str, str]],
+        cluster_id: int,
 ) -> dict[str, Any]:
     """Merge two long text fields
 
@@ -468,10 +475,10 @@ def merge_long_fields(
 
 
 def merge_judges(
-    overlapping_data: Optional[tuple[str, str]],
-    cluster_id: int,
-    is_columbia: bool = False,
-    skip_judge_merger: bool = False,
+        overlapping_data: Optional[tuple[str, str]],
+        cluster_id: int,
+        is_columbia: bool = False,
+        skip_judge_merger: bool = False,
 ) -> dict[str, Any]:
     """Merge overlapping judge values
 
@@ -501,7 +508,7 @@ def merge_judges(
     # Prepare judges string
     judges = titlecase(", ".join(find_all_judges(file_data)))
     if (
-        temp_file_data_clean.issuperset(temp_cl_clean) or cl_data_upper
+            temp_file_data_clean.issuperset(temp_cl_clean) or cl_data_upper
     ) and file_data_cleaned != cl_clean:
         return {"judges": judges}
     elif not temp_file_data_clean.intersection(temp_cl_clean):
@@ -554,11 +561,11 @@ def merge_judges(
 
 
 def merge_overlapping_data(
-    cluster: OpinionCluster,
-    long_fields,
-    changed_values_dictionary: dict,
-    skip_judge_merger: bool = False,
-    is_columbia: bool = False,
+        cluster: OpinionCluster,
+        long_fields,
+        changed_values_dictionary: dict,
+        skip_judge_merger: bool = False,
+        is_columbia: bool = False,
 ) -> dict[str, Any]:
     """Merge overlapping data
 
@@ -618,9 +625,9 @@ def add_citations_to_cluster(cites: list[str], cluster_id: int) -> None:
         clean_cite = re.sub(r"\s+", " ", cite)
         citation = get_citations(clean_cite)
         if (
-            not citation
-            or not isinstance(citation[0], FullCaseCitation)
-            or not citation[0].groups.get("volume", False)
+                not citation
+                or not isinstance(citation[0], FullCaseCitation)
+                or not citation[0].groups.get("volume", False)
         ):
             logger.warning(f"Citation parsing failed for {clean_cite}")
             continue
@@ -632,7 +639,7 @@ def add_citations_to_cluster(cites: list[str], cluster_id: int) -> None:
             reporter_type = map_reporter_db_cite_type(cite_type_str)
 
         if Citation.objects.filter(
-            cluster_id=cluster_id, reporter=citation[0].corrected_reporter()
+                cluster_id=cluster_id, reporter=citation[0].corrected_reporter()
         ).exists():
             # Avoid adding a citation if we already have a citation from the
             # citation's reporter
@@ -654,12 +661,15 @@ def add_citations_to_cluster(cites: list[str], cluster_id: int) -> None:
             logger.warning(
                 f"Reporter mismatch for cluster: {cluster_id} on cite: {cite}"
             )
+        except ValueError:
+            # Handle: ValueError: Field ‘volume’ expected a number but got ‘1986-2’
+            raise CitationException(f"Invalid citation found: {cite}")
 
 
 def update_cluster_panel(
-    cluster: OpinionCluster,
-    panel_list: list[str],
-    panel_date: Optional[date] = None,
+        cluster: OpinionCluster,
+        panel_list: list[str],
+        panel_date: Optional[date] = None,
 ) -> None:
     """Update cluster's panel
 
@@ -786,21 +796,23 @@ def clean_body_content(case_body: str, harvard_file: bool = False) -> str:
     else:
         opinions = []
         for op in soup.find_all(
-            lambda tag: (
-                tag.name == "opinion" and tag.get("data-type") is None
-            )
-            or tag.get("data-type") == "opinion"
+                lambda tag: (
+                                    tag.name == "opinion" and tag.get(
+                                "data-type") is None
+                            )
+                            or tag.get("data-type") == "opinion"
         ):
             opinions.append(op.text)
         opinion_text = "".join(
             [
                 op.text
                 for op in soup.find_all(
-                    lambda tag: (
-                        tag.name == "opinion" and tag.get("data-type") is None
-                    )
-                    or tag.get("data-type") == "opinion"
-                )
+                lambda tag: (
+                                    tag.name == "opinion" and tag.get(
+                                "data-type") is None
+                            )
+                            or tag.get("data-type") == "opinion"
+            )
             ]
         )
 
@@ -808,7 +820,7 @@ def clean_body_content(case_body: str, harvard_file: bool = False) -> str:
 
 
 def length_too_different(
-    case: OpinionCluster, file_characters: str, cl_characters: str
+        case: OpinionCluster, file_characters: str, cl_characters: str
 ) -> bool:
     """Check if length is too different between texts
 
@@ -829,10 +841,10 @@ def length_too_different(
 
 
 def content_too_different(
-    case: OpinionCluster,
-    file_characters: str,
-    cl_characters: str,
-    docket: str,
+        case: OpinionCluster,
+        file_characters: str,
+        cl_characters: str,
+        docket: str,
 ) -> bool:
     """Is the content too different
 
@@ -881,7 +893,7 @@ def content_too_different(
 
 
 def case_names_dont_overlap(
-    case: OpinionCluster, case_name_full: str, case_name_abbreviation: str
+        case: OpinionCluster, case_name_full: str, case_name_abbreviation: str
 ) -> bool:
     """Case names not overlap
 
@@ -904,7 +916,7 @@ def case_names_dont_overlap(
 
 
 def cosine_similarity_too_different(
-    case: OpinionCluster, case_name_full: str, case_name_abbreviation: str
+        case: OpinionCluster, case_name_full: str, case_name_abbreviation: str
 ) -> bool:
     """Cosine similarity comparison between case names
 
@@ -929,7 +941,7 @@ def cosine_similarity_too_different(
 
 
 def has_too_similar_citation(
-    case: OpinionCluster, citation: FullCaseCitation
+        case: OpinionCluster, citation: FullCaseCitation
 ) -> bool:
     """Has a citation associated with cluster in same volume
 
@@ -956,12 +968,12 @@ def has_too_similar_citation(
 
 
 def match_based_text(
-    file_characters: str,
-    docket_number: str,
-    case_name_full: str,
-    possible_cases: QuerySet,
-    case_name_abbreviation: str,
-    citation: FullCaseCitation,
+        file_characters: str,
+        docket_number: str,
+        case_name_full: str,
+        possible_cases: QuerySet,
+        case_name_abbreviation: str,
+        citation: FullCaseCitation,
 ) -> Optional[OpinionCluster]:
     """Compare CL text to file content to establish duplicates
 
@@ -985,11 +997,11 @@ def match_based_text(
         case_and_texts_and_docket = case_and_texts + [docket_number]
         case_and_titles = [case, case_name_full, case_name_abbreviation]
         if (
-            length_too_different(*case_and_texts)
-            or has_too_similar_citation(case, citation)
-            or case_names_dont_overlap(*case_and_titles)
-            or cosine_similarity_too_different(*case_and_titles)
-            or content_too_different(*case_and_texts_and_docket)
+                length_too_different(*case_and_texts)
+                or has_too_similar_citation(case, citation)
+                or case_names_dont_overlap(*case_and_titles)
+                or cosine_similarity_too_different(*case_and_titles)
+                or content_too_different(*case_and_texts_and_docket)
         ):
             continue
         return case

From 4a9f4ecd1e82ba6adc4c04bf9703d5fc80a85a11 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 26 Feb 2024 18:17:02 +0000
Subject: [PATCH 06/11] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 .../management/commands/columbia_merge.py     |   7 +-
 cl/corpus_importer/utils.py                   | 126 +++++++++---------
 2 files changed, 67 insertions(+), 66 deletions(-)

diff --git a/cl/corpus_importer/management/commands/columbia_merge.py b/cl/corpus_importer/management/commands/columbia_merge.py
index f8e0988360..b5ab5e6e46 100644
--- a/cl/corpus_importer/management/commands/columbia_merge.py
+++ b/cl/corpus_importer/management/commands/columbia_merge.py
@@ -43,6 +43,7 @@
 )
 from cl.corpus_importer.utils import (
     AuthorException,
+    CitationException,
     JudgeException,
     OpinionMatchingException,
     OpinionTypeException,
@@ -54,7 +55,7 @@
     merge_judges,
     merge_long_fields,
     merge_strings,
-    update_cluster_panel, CitationException,
+    update_cluster_panel,
 )
 from cl.lib.command_utils import VerboseCommand, logger
 from cl.people_db.lookup_utils import extract_judge_last_name, find_just_name
@@ -548,7 +549,9 @@ def process_cluster(
     except JudgeException:
         logger.warning(msg=f"Judge exception for cluster id: {cluster_id}")
     except CitationException:
-        logger.warning(msg=f"Invalid citation found in {filepath } while merging cluster id: {cluster_id}")
+        logger.warning(
+            msg=f"Invalid citation found in {filepath } while merging cluster id: {cluster_id}"
+        )
 
 
 def merge_columbia_into_cl(options) -> None:
diff --git a/cl/corpus_importer/utils.py b/cl/corpus_importer/utils.py
index d1a13c8c51..d82c376b54 100644
--- a/cl/corpus_importer/utils.py
+++ b/cl/corpus_importer/utils.py
@@ -150,9 +150,9 @@ def filter_subsets(lists: list[list[int]]) -> Iterator[list[int]]:
 
     for match in lists:
         if not any(
-                is_subset(match, other_matches)
-                for other_matches in lists
-                if match is not other_matches
+            is_subset(match, other_matches)
+            for other_matches in lists
+            if match is not other_matches
         ):
             yield match
 
@@ -221,7 +221,7 @@ def compare_documents(file_characters: str, cl_characters: str) -> int:
 
 
 def similarity_scores(
-        texts_to_compare_1: list[str], texts_to_compare_2: list[str]
+    texts_to_compare_1: list[str], texts_to_compare_2: list[str]
 ) -> list[list[float]]:
     """Get similarity scores between two sets of lists
 
@@ -246,13 +246,13 @@ def similarity_scores(
 
     # Calculate cosine similarity between weight of words for each text in list
     scores = cosine_similarity(
-        X[: len(texts_to_compare_1)], X[len(texts_to_compare_1):]
+        X[: len(texts_to_compare_1)], X[len(texts_to_compare_1) :]
     )
     return scores
 
 
 def match_opinion_lists(
-        file_opinions_list: list[Any], cl_opinions_list: list[Any]
+    file_opinions_list: list[Any], cl_opinions_list: list[Any]
 ) -> dict[int, int]:
     """Match opinions on two lists from different sources
 
@@ -329,7 +329,7 @@ def clean_docket_number(docket_number: str) -> str:
 
 
 def merge_docket_numbers(
-        cluster: OpinionCluster, docket_number: str
+    cluster: OpinionCluster, docket_number: str
 ) -> Optional[str]:
     """Merge docket number
 
@@ -345,8 +345,8 @@ def merge_docket_numbers(
         # e.g. CL docket id #3952066 doesn't have
         cl_clean_docket = clean_docket_number(cl_docket.docket_number)
         if (
-                cl_clean_docket in file_cleaned_docket
-                and cl_docket.docket_number != file_cleaned_docket
+            cl_clean_docket in file_cleaned_docket
+            and cl_docket.docket_number != file_cleaned_docket
         ):
             return file_cleaned_docket
         else:
@@ -366,10 +366,10 @@ def merge_docket_numbers(
 
 
 def merge_case_names(
-        cluster: OpinionCluster,
-        file_data: dict[str, Any],
-        case_name_key: str,
-        case_name_full_key: str,
+    cluster: OpinionCluster,
+    file_data: dict[str, Any],
+    case_name_key: str,
+    case_name_full_key: str,
 ) -> dict[str, Any]:
     """Merge case names
 
@@ -425,7 +425,7 @@ def merge_case_names(
 
 
 def merge_strings(
-        field_name: str, overlapping_data: tuple[str, str]
+    field_name: str, overlapping_data: tuple[str, str]
 ) -> dict[str, Any]:
     """Compare two strings and choose the largest
 
@@ -444,9 +444,9 @@ def merge_strings(
 
 
 def merge_long_fields(
-        field_name: str,
-        overlapping_data: Optional[tuple[str, str]],
-        cluster_id: int,
+    field_name: str,
+    overlapping_data: Optional[tuple[str, str]],
+    cluster_id: int,
 ) -> dict[str, Any]:
     """Merge two long text fields
 
@@ -475,10 +475,10 @@ def merge_long_fields(
 
 
 def merge_judges(
-        overlapping_data: Optional[tuple[str, str]],
-        cluster_id: int,
-        is_columbia: bool = False,
-        skip_judge_merger: bool = False,
+    overlapping_data: Optional[tuple[str, str]],
+    cluster_id: int,
+    is_columbia: bool = False,
+    skip_judge_merger: bool = False,
 ) -> dict[str, Any]:
     """Merge overlapping judge values
 
@@ -508,7 +508,7 @@ def merge_judges(
     # Prepare judges string
     judges = titlecase(", ".join(find_all_judges(file_data)))
     if (
-            temp_file_data_clean.issuperset(temp_cl_clean) or cl_data_upper
+        temp_file_data_clean.issuperset(temp_cl_clean) or cl_data_upper
     ) and file_data_cleaned != cl_clean:
         return {"judges": judges}
     elif not temp_file_data_clean.intersection(temp_cl_clean):
@@ -561,11 +561,11 @@ def merge_judges(
 
 
 def merge_overlapping_data(
-        cluster: OpinionCluster,
-        long_fields,
-        changed_values_dictionary: dict,
-        skip_judge_merger: bool = False,
-        is_columbia: bool = False,
+    cluster: OpinionCluster,
+    long_fields,
+    changed_values_dictionary: dict,
+    skip_judge_merger: bool = False,
+    is_columbia: bool = False,
 ) -> dict[str, Any]:
     """Merge overlapping data
 
@@ -625,9 +625,9 @@ def add_citations_to_cluster(cites: list[str], cluster_id: int) -> None:
         clean_cite = re.sub(r"\s+", " ", cite)
         citation = get_citations(clean_cite)
         if (
-                not citation
-                or not isinstance(citation[0], FullCaseCitation)
-                or not citation[0].groups.get("volume", False)
+            not citation
+            or not isinstance(citation[0], FullCaseCitation)
+            or not citation[0].groups.get("volume", False)
         ):
             logger.warning(f"Citation parsing failed for {clean_cite}")
             continue
@@ -639,7 +639,7 @@ def add_citations_to_cluster(cites: list[str], cluster_id: int) -> None:
             reporter_type = map_reporter_db_cite_type(cite_type_str)
 
         if Citation.objects.filter(
-                cluster_id=cluster_id, reporter=citation[0].corrected_reporter()
+            cluster_id=cluster_id, reporter=citation[0].corrected_reporter()
         ).exists():
             # Avoid adding a citation if we already have a citation from the
             # citation's reporter
@@ -667,9 +667,9 @@ def add_citations_to_cluster(cites: list[str], cluster_id: int) -> None:
 
 
 def update_cluster_panel(
-        cluster: OpinionCluster,
-        panel_list: list[str],
-        panel_date: Optional[date] = None,
+    cluster: OpinionCluster,
+    panel_list: list[str],
+    panel_date: Optional[date] = None,
 ) -> None:
     """Update cluster's panel
 
@@ -796,23 +796,21 @@ def clean_body_content(case_body: str, harvard_file: bool = False) -> str:
     else:
         opinions = []
         for op in soup.find_all(
-                lambda tag: (
-                                    tag.name == "opinion" and tag.get(
-                                "data-type") is None
-                            )
-                            or tag.get("data-type") == "opinion"
+            lambda tag: (
+                tag.name == "opinion" and tag.get("data-type") is None
+            )
+            or tag.get("data-type") == "opinion"
         ):
             opinions.append(op.text)
         opinion_text = "".join(
             [
                 op.text
                 for op in soup.find_all(
-                lambda tag: (
-                                    tag.name == "opinion" and tag.get(
-                                "data-type") is None
-                            )
-                            or tag.get("data-type") == "opinion"
-            )
+                    lambda tag: (
+                        tag.name == "opinion" and tag.get("data-type") is None
+                    )
+                    or tag.get("data-type") == "opinion"
+                )
             ]
         )
 
@@ -820,7 +818,7 @@ def clean_body_content(case_body: str, harvard_file: bool = False) -> str:
 
 
 def length_too_different(
-        case: OpinionCluster, file_characters: str, cl_characters: str
+    case: OpinionCluster, file_characters: str, cl_characters: str
 ) -> bool:
     """Check if length is too different between texts
 
@@ -841,10 +839,10 @@ def length_too_different(
 
 
 def content_too_different(
-        case: OpinionCluster,
-        file_characters: str,
-        cl_characters: str,
-        docket: str,
+    case: OpinionCluster,
+    file_characters: str,
+    cl_characters: str,
+    docket: str,
 ) -> bool:
     """Is the content too different
 
@@ -893,7 +891,7 @@ def content_too_different(
 
 
 def case_names_dont_overlap(
-        case: OpinionCluster, case_name_full: str, case_name_abbreviation: str
+    case: OpinionCluster, case_name_full: str, case_name_abbreviation: str
 ) -> bool:
     """Case names not overlap
 
@@ -916,7 +914,7 @@ def case_names_dont_overlap(
 
 
 def cosine_similarity_too_different(
-        case: OpinionCluster, case_name_full: str, case_name_abbreviation: str
+    case: OpinionCluster, case_name_full: str, case_name_abbreviation: str
 ) -> bool:
     """Cosine similarity comparison between case names
 
@@ -941,7 +939,7 @@ def cosine_similarity_too_different(
 
 
 def has_too_similar_citation(
-        case: OpinionCluster, citation: FullCaseCitation
+    case: OpinionCluster, citation: FullCaseCitation
 ) -> bool:
     """Has a citation associated with cluster in same volume
 
@@ -968,12 +966,12 @@ def has_too_similar_citation(
 
 
 def match_based_text(
-        file_characters: str,
-        docket_number: str,
-        case_name_full: str,
-        possible_cases: QuerySet,
-        case_name_abbreviation: str,
-        citation: FullCaseCitation,
+    file_characters: str,
+    docket_number: str,
+    case_name_full: str,
+    possible_cases: QuerySet,
+    case_name_abbreviation: str,
+    citation: FullCaseCitation,
 ) -> Optional[OpinionCluster]:
     """Compare CL text to file content to establish duplicates
 
@@ -997,11 +995,11 @@ def match_based_text(
         case_and_texts_and_docket = case_and_texts + [docket_number]
         case_and_titles = [case, case_name_full, case_name_abbreviation]
         if (
-                length_too_different(*case_and_texts)
-                or has_too_similar_citation(case, citation)
-                or case_names_dont_overlap(*case_and_titles)
-                or cosine_similarity_too_different(*case_and_titles)
-                or content_too_different(*case_and_texts_and_docket)
+            length_too_different(*case_and_texts)
+            or has_too_similar_citation(case, citation)
+            or case_names_dont_overlap(*case_and_titles)
+            or cosine_similarity_too_different(*case_and_titles)
+            or content_too_different(*case_and_texts_and_docket)
         ):
             continue
         return case

From b2432e5eabfc3425ca59e9a501fd46f9b410e9d1 Mon Sep 17 00:00:00 2001
From: Kevin Ramirez <kvnzavalza@hotmail.com>
Date: Mon, 26 Feb 2024 14:39:06 -0600
Subject: [PATCH 07/11] fix(columbia_merger): handle invalid citation volume

---
 .../management/commands/columbia_merge.py     | 90 +++++++++++--------
 1 file changed, 52 insertions(+), 38 deletions(-)

diff --git a/cl/corpus_importer/management/commands/columbia_merge.py b/cl/corpus_importer/management/commands/columbia_merge.py
index f8e0988360..c61c918eaf 100644
--- a/cl/corpus_importer/management/commands/columbia_merge.py
+++ b/cl/corpus_importer/management/commands/columbia_merge.py
@@ -112,16 +112,24 @@ def clean_opinion_content(content: str, is_harvard: bool) -> str:
     return prep_text
 
 
-def get_cl_opinion_content(cluster_id: int) -> list[dict[Any, Any]]:
+def get_cl_opinion_content(cluster_id: int, columbia_single_opinion: bool = False) -> list[dict[Any, Any]]:
     """Get the opinions content for a cluster object
 
     :param cluster_id: Cluster ID for a set of opinions
+    :param columbia_single_opinion: True if xml file only has one opinion else False
     :return: list with opinion content from cl
     """
     cl_cleaned_opinions = []
+
+    # Get all opinions from cluster
     opinions_from_cluster = Opinion.objects.filter(
         cluster_id=cluster_id
-    ).exclude(type="010combined")
+    )
+
+    if not columbia_single_opinion:
+        # File has multiple opinions, then we can exclude combined opinions
+        opinions_from_cluster = opinions_from_cluster.exclude(type="010combined")
+
     is_harvard = False
 
     for i, op in enumerate(opinions_from_cluster):
@@ -154,7 +162,7 @@ def get_cl_opinion_content(cluster_id: int) -> list[dict[Any, Any]]:
 
 
 def update_matching_opinions(
-    matches: dict, cl_cleaned_opinions: list, columbia_opinions: list
+        matches: dict, cl_cleaned_opinions: list, columbia_opinions: list
 ) -> None:
     """Store matching opinion content in html_columbia field from Opinion object
 
@@ -184,8 +192,8 @@ def update_matching_opinions(
         else:
             if author_str:
                 if (
-                    find_just_name(op.author_str).lower()
-                    != find_just_name(author_str).lower()
+                        find_just_name(op.author_str).lower()
+                        != find_just_name(author_str).lower()
                 ):
                     # last resort, use distance between words to solve typos
                     s = SequenceMatcher(
@@ -210,19 +218,21 @@ def update_matching_opinions(
 
 
 def map_and_merge_opinions(
-    cluster_id: int,
-    columbia_opinions: list[dict],
+        cluster_id: int,
+        columbia_opinions: list[dict],
 ) -> None:
     """Map and merge opinion data
 
-    # TODO handle combined opinions
-
     :param cluster_id: Cluster id
     :param columbia_opinions: list of columbia opinions from file
     :return: None
     """
 
-    cl_cleaned_opinions = get_cl_opinion_content(cluster_id)
+    # Check if columbia source only has one opinion
+    columbia_single_opinion = True if len(columbia_opinions) == 1 else False
+
+    # We exclude combined opinions only if we have more than one opinion in the xml
+    cl_cleaned_opinions = get_cl_opinion_content(cluster_id, columbia_single_opinion)
 
     if len(columbia_opinions) == len(cl_cleaned_opinions):
         # We need that both list to be cleaned, so we can have a more
@@ -235,13 +245,17 @@ def map_and_merge_opinions(
             [op.get("opinion") for op in cl_cleaned_opinions],
         )
         if len(matches) == len(columbia_opinions):
+            # We were able to match opinions, add opinions to html_columbia field
             update_matching_opinions(
                 matches, cl_cleaned_opinions, columbia_opinions
             )
         else:
             raise OpinionMatchingException("Failed to match opinions")
 
-    elif len(columbia_opinions) > len(cl_cleaned_opinions) == 1:
+    elif (len(columbia_opinions) > len(cl_cleaned_opinions)) and len(cl_cleaned_opinions) == 0:
+        # We have more opinions in file than in CL and if cl_cleaned_opinions == 0 it
+        # means that we probably excluded the combined opinion, we create each
+        # opinion from file
         for op in columbia_opinions:
             opinion_type = op.get("type")
             file = op.get("file")
@@ -278,7 +292,7 @@ def map_and_merge_opinions(
 
 
 def merge_date_filed(
-    cluster: OpinionCluster, columbia_data: dict
+        cluster: OpinionCluster, columbia_data: dict
 ) -> dict[str, Any]:
     """Merge date filed
 
@@ -321,10 +335,10 @@ def update_cluster_source(cluster: OpinionCluster) -> None:
 
 
 def merge_field(
-    cluster: OpinionCluster,
-    file_value: Optional[str],
-    field_name: str,
-    skip_judge_merger: bool = False,
+        cluster: OpinionCluster,
+        file_value: Optional[str],
+        field_name: str,
+        skip_judge_merger: bool = False,
 ) -> dict:
     """Try to merge the cluster data and file field data
 
@@ -376,8 +390,8 @@ def merge_docket_data(docket_data: dict, cluster: OpinionCluster) -> None:
         merge_docket_numbers(cluster, docket_data["docket_number"])
         cluster.docket.refresh_from_db()
     if (
-        docket_data["date_cert_granted"]
-        and not cluster.docket.date_cert_granted
+            docket_data["date_cert_granted"]
+            and not cluster.docket.date_cert_granted
     ):
         data_to_update["date_cert_granted"] = docket_data["date_cert_granted"]
 
@@ -391,8 +405,8 @@ def merge_docket_data(docket_data: dict, cluster: OpinionCluster) -> None:
         data_to_update["date_reargued"] = docket_data["date_reargued"]
 
     if (
-        docket_data["date_reargument_denied"]
-        and not cluster.docket.date_reargument_denied
+            docket_data["date_reargument_denied"]
+            and not cluster.docket.date_reargument_denied
     ):
         data_to_update["date_reargument_denied"] = docket_data[
             "date_reargument_denied"
@@ -403,9 +417,9 @@ def merge_docket_data(docket_data: dict, cluster: OpinionCluster) -> None:
 
 
 def process_cluster(
-    cluster_id: int,
-    filepath: str,
-    skip_judge_merger: bool = False,
+        cluster_id: int,
+        filepath: str,
+        skip_judge_merger: bool = False,
 ) -> None:
     """Merge specified cluster id
 
@@ -475,15 +489,14 @@ def process_cluster(
     docket_data = {
         k: v
         for k, v in columbia_data.items()
-        if k
-        in [
-            "docket_number",
-            "date_cert_granted",
-            "date_cert_denied",
-            "date_argued",
-            "date_reargued",
-            "date_reargument_denied",
-        ]
+        if k in [
+               "docket_number",
+               "date_cert_granted",
+               "date_cert_denied",
+               "date_argued",
+               "date_reargued",
+               "date_reargument_denied",
+           ]
     }
 
     try:
@@ -494,10 +507,10 @@ def process_cluster(
             for field in ["syllabus", "attorneys", "posture", "judges"]:
                 columbia_value = columbia_data.get(field)
                 if data := merge_field(
-                    cluster,
-                    columbia_value,
-                    field,
-                    skip_judge_merger=skip_judge_merger,
+                        cluster,
+                        columbia_value,
+                        field,
+                        skip_judge_merger=skip_judge_merger,
                 ):
                     merged_data.update(data)
 
@@ -518,7 +531,7 @@ def process_cluster(
 
             # Merge results into a single dict
             data_to_update = (
-                merged_data | case_names_to_update | date_filed_to_update
+                    merged_data | case_names_to_update | date_filed_to_update
             )
 
             if data_to_update:
@@ -548,7 +561,8 @@ def process_cluster(
     except JudgeException:
         logger.warning(msg=f"Judge exception for cluster id: {cluster_id}")
     except CitationException:
-        logger.warning(msg=f"Invalid citation found in {filepath } while merging cluster id: {cluster_id}")
+        logger.warning(
+            msg=f"Invalid citation found in {filepath} while merging cluster id: {cluster_id}")
 
 
 def merge_columbia_into_cl(options) -> None:

From 78adef707b8a8f3d950eecd45f59bf610ae8f834 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 26 Feb 2024 20:41:17 +0000
Subject: [PATCH 08/11] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 .../management/commands/columbia_merge.py     | 87 ++++++++++---------
 1 file changed, 48 insertions(+), 39 deletions(-)

diff --git a/cl/corpus_importer/management/commands/columbia_merge.py b/cl/corpus_importer/management/commands/columbia_merge.py
index c61c918eaf..9e2cd6b205 100644
--- a/cl/corpus_importer/management/commands/columbia_merge.py
+++ b/cl/corpus_importer/management/commands/columbia_merge.py
@@ -43,6 +43,7 @@
 )
 from cl.corpus_importer.utils import (
     AuthorException,
+    CitationException,
     JudgeException,
     OpinionMatchingException,
     OpinionTypeException,
@@ -54,7 +55,7 @@
     merge_judges,
     merge_long_fields,
     merge_strings,
-    update_cluster_panel, CitationException,
+    update_cluster_panel,
 )
 from cl.lib.command_utils import VerboseCommand, logger
 from cl.people_db.lookup_utils import extract_judge_last_name, find_just_name
@@ -112,7 +113,9 @@ def clean_opinion_content(content: str, is_harvard: bool) -> str:
     return prep_text
 
 
-def get_cl_opinion_content(cluster_id: int, columbia_single_opinion: bool = False) -> list[dict[Any, Any]]:
+def get_cl_opinion_content(
+    cluster_id: int, columbia_single_opinion: bool = False
+) -> list[dict[Any, Any]]:
     """Get the opinions content for a cluster object
 
     :param cluster_id: Cluster ID for a set of opinions
@@ -122,13 +125,13 @@ def get_cl_opinion_content(cluster_id: int, columbia_single_opinion: bool = Fals
     cl_cleaned_opinions = []
 
     # Get all opinions from cluster
-    opinions_from_cluster = Opinion.objects.filter(
-        cluster_id=cluster_id
-    )
+    opinions_from_cluster = Opinion.objects.filter(cluster_id=cluster_id)
 
     if not columbia_single_opinion:
         # File has multiple opinions, then we can exclude combined opinions
-        opinions_from_cluster = opinions_from_cluster.exclude(type="010combined")
+        opinions_from_cluster = opinions_from_cluster.exclude(
+            type="010combined"
+        )
 
     is_harvard = False
 
@@ -162,7 +165,7 @@ def get_cl_opinion_content(cluster_id: int, columbia_single_opinion: bool = Fals
 
 
 def update_matching_opinions(
-        matches: dict, cl_cleaned_opinions: list, columbia_opinions: list
+    matches: dict, cl_cleaned_opinions: list, columbia_opinions: list
 ) -> None:
     """Store matching opinion content in html_columbia field from Opinion object
 
@@ -192,8 +195,8 @@ def update_matching_opinions(
         else:
             if author_str:
                 if (
-                        find_just_name(op.author_str).lower()
-                        != find_just_name(author_str).lower()
+                    find_just_name(op.author_str).lower()
+                    != find_just_name(author_str).lower()
                 ):
                     # last resort, use distance between words to solve typos
                     s = SequenceMatcher(
@@ -218,8 +221,8 @@ def update_matching_opinions(
 
 
 def map_and_merge_opinions(
-        cluster_id: int,
-        columbia_opinions: list[dict],
+    cluster_id: int,
+    columbia_opinions: list[dict],
 ) -> None:
     """Map and merge opinion data
 
@@ -232,7 +235,9 @@ def map_and_merge_opinions(
     columbia_single_opinion = True if len(columbia_opinions) == 1 else False
 
     # We exclude combined opinions only if we have more than one opinion in the xml
-    cl_cleaned_opinions = get_cl_opinion_content(cluster_id, columbia_single_opinion)
+    cl_cleaned_opinions = get_cl_opinion_content(
+        cluster_id, columbia_single_opinion
+    )
 
     if len(columbia_opinions) == len(cl_cleaned_opinions):
         # We need that both list to be cleaned, so we can have a more
@@ -252,7 +257,9 @@ def map_and_merge_opinions(
         else:
             raise OpinionMatchingException("Failed to match opinions")
 
-    elif (len(columbia_opinions) > len(cl_cleaned_opinions)) and len(cl_cleaned_opinions) == 0:
+    elif (len(columbia_opinions) > len(cl_cleaned_opinions)) and len(
+        cl_cleaned_opinions
+    ) == 0:
         # We have more opinions in file than in CL and if cl_cleaned_opinions == 0 it
         # means that we probably excluded the combined opinion, we create each
         # opinion from file
@@ -292,7 +299,7 @@ def map_and_merge_opinions(
 
 
 def merge_date_filed(
-        cluster: OpinionCluster, columbia_data: dict
+    cluster: OpinionCluster, columbia_data: dict
 ) -> dict[str, Any]:
     """Merge date filed
 
@@ -335,10 +342,10 @@ def update_cluster_source(cluster: OpinionCluster) -> None:
 
 
 def merge_field(
-        cluster: OpinionCluster,
-        file_value: Optional[str],
-        field_name: str,
-        skip_judge_merger: bool = False,
+    cluster: OpinionCluster,
+    file_value: Optional[str],
+    field_name: str,
+    skip_judge_merger: bool = False,
 ) -> dict:
     """Try to merge the cluster data and file field data
 
@@ -390,8 +397,8 @@ def merge_docket_data(docket_data: dict, cluster: OpinionCluster) -> None:
         merge_docket_numbers(cluster, docket_data["docket_number"])
         cluster.docket.refresh_from_db()
     if (
-            docket_data["date_cert_granted"]
-            and not cluster.docket.date_cert_granted
+        docket_data["date_cert_granted"]
+        and not cluster.docket.date_cert_granted
     ):
         data_to_update["date_cert_granted"] = docket_data["date_cert_granted"]
 
@@ -405,8 +412,8 @@ def merge_docket_data(docket_data: dict, cluster: OpinionCluster) -> None:
         data_to_update["date_reargued"] = docket_data["date_reargued"]
 
     if (
-            docket_data["date_reargument_denied"]
-            and not cluster.docket.date_reargument_denied
+        docket_data["date_reargument_denied"]
+        and not cluster.docket.date_reargument_denied
     ):
         data_to_update["date_reargument_denied"] = docket_data[
             "date_reargument_denied"
@@ -417,9 +424,9 @@ def merge_docket_data(docket_data: dict, cluster: OpinionCluster) -> None:
 
 
 def process_cluster(
-        cluster_id: int,
-        filepath: str,
-        skip_judge_merger: bool = False,
+    cluster_id: int,
+    filepath: str,
+    skip_judge_merger: bool = False,
 ) -> None:
     """Merge specified cluster id
 
@@ -489,14 +496,15 @@ def process_cluster(
     docket_data = {
         k: v
         for k, v in columbia_data.items()
-        if k in [
-               "docket_number",
-               "date_cert_granted",
-               "date_cert_denied",
-               "date_argued",
-               "date_reargued",
-               "date_reargument_denied",
-           ]
+        if k
+        in [
+            "docket_number",
+            "date_cert_granted",
+            "date_cert_denied",
+            "date_argued",
+            "date_reargued",
+            "date_reargument_denied",
+        ]
     }
 
     try:
@@ -507,10 +515,10 @@ def process_cluster(
             for field in ["syllabus", "attorneys", "posture", "judges"]:
                 columbia_value = columbia_data.get(field)
                 if data := merge_field(
-                        cluster,
-                        columbia_value,
-                        field,
-                        skip_judge_merger=skip_judge_merger,
+                    cluster,
+                    columbia_value,
+                    field,
+                    skip_judge_merger=skip_judge_merger,
                 ):
                     merged_data.update(data)
 
@@ -531,7 +539,7 @@ def process_cluster(
 
             # Merge results into a single dict
             data_to_update = (
-                    merged_data | case_names_to_update | date_filed_to_update
+                merged_data | case_names_to_update | date_filed_to_update
             )
 
             if data_to_update:
@@ -562,7 +570,8 @@ def process_cluster(
         logger.warning(msg=f"Judge exception for cluster id: {cluster_id}")
     except CitationException:
         logger.warning(
-            msg=f"Invalid citation found in {filepath} while merging cluster id: {cluster_id}")
+            msg=f"Invalid citation found in {filepath} while merging cluster id: {cluster_id}"
+        )
 
 
 def merge_columbia_into_cl(options) -> None:

From 0e699c9fcec9f7250184b64a260cc1dd6dd203a3 Mon Sep 17 00:00:00 2001
From: Kevin Ramirez <kvnzavalza@hotmail.com>
Date: Fri, 1 Mar 2024 10:23:03 -0600
Subject: [PATCH 09/11] fix(columbia_merger): save local_path when creating a
 new opinion

---
 .../management/commands/columbia_merge.py          | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/cl/corpus_importer/management/commands/columbia_merge.py b/cl/corpus_importer/management/commands/columbia_merge.py
index 9e2cd6b205..c9f88b72d9 100644
--- a/cl/corpus_importer/management/commands/columbia_merge.py
+++ b/cl/corpus_importer/management/commands/columbia_merge.py
@@ -165,7 +165,10 @@ def get_cl_opinion_content(
 
 
 def update_matching_opinions(
-    matches: dict, cl_cleaned_opinions: list, columbia_opinions: list
+    matches: dict,
+    cl_cleaned_opinions: list,
+    columbia_opinions: list,
+    filepath: str,
 ) -> None:
     """Store matching opinion content in html_columbia field from Opinion object
 
@@ -223,11 +226,13 @@ def update_matching_opinions(
 def map_and_merge_opinions(
     cluster_id: int,
     columbia_opinions: list[dict],
+    filepath: str,
 ) -> None:
     """Map and merge opinion data
 
     :param cluster_id: Cluster id
     :param columbia_opinions: list of columbia opinions from file
+    :param filepath: xml file from which the opinion was extracted
     :return: None
     """
 
@@ -252,7 +257,7 @@ def map_and_merge_opinions(
         if len(matches) == len(columbia_opinions):
             # We were able to match opinions, add opinions to html_columbia field
             update_matching_opinions(
-                matches, cl_cleaned_opinions, columbia_opinions
+                matches, cl_cleaned_opinions, columbia_opinions, filepath
             )
         else:
             raise OpinionMatchingException("Failed to match opinions")
@@ -279,6 +284,7 @@ def map_and_merge_opinions(
                 per_curiam=op["per_curiam"],
                 cluster_id=cluster_id,
                 type=opinion_type,
+                local_path=filepath,
                 author_str=(
                     titlecase(find_just_name(author.strip(":")))
                     if author
@@ -509,7 +515,9 @@ def process_cluster(
 
     try:
         with transaction.atomic():
-            map_and_merge_opinions(cluster_id, columbia_data["opinions"])
+            map_and_merge_opinions(
+                cluster_id, columbia_data["opinions"], filepath
+            )
 
             merged_data = {}
             for field in ["syllabus", "attorneys", "posture", "judges"]:

From 30bbab25715381885134369b5430db3cf0ebbcc8 Mon Sep 17 00:00:00 2001
From: Kevin Ramirez <kvnzavalza@hotmail.com>
Date: Mon, 6 May 2024 17:27:29 -0600
Subject: [PATCH 10/11] feat(columbia_merge): save file path for opinion when
 possible

---
 cl/corpus_importer/management/commands/columbia_merge.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/cl/corpus_importer/management/commands/columbia_merge.py b/cl/corpus_importer/management/commands/columbia_merge.py
index e5564d84e3..7ec3c68fca 100644
--- a/cl/corpus_importer/management/commands/columbia_merge.py
+++ b/cl/corpus_importer/management/commands/columbia_merge.py
@@ -156,6 +156,7 @@ def update_matching_opinions(
     :param matches: dict with matching position from cl and columbia opinions
     :param cl_cleaned_opinions: list of cl opinions
     :param columbia_opinions: list of columbia opinions
+    :param filepath: xml file from which the opinion was extracted
     :return: None
     """
     for columbia_pos, cl_pos in matches.items():
@@ -201,6 +202,9 @@ def update_matching_opinions(
             file_opinion["opinion"], columbia_pos
         )
         op.html_columbia = str(converted_text)
+        if not op.local_path:
+            # Store file path only if it is empty in the Opinion object
+            op.local_path = filepath
         op.save()
 
 

From 156a2eccd3aaeaaf8db3761f07bcfeff9d6f91ae Mon Sep 17 00:00:00 2001
From: Kevin Ramirez <kvnzavalza@hotmail.com>
Date: Thu, 23 May 2024 14:04:06 -0600
Subject: [PATCH 11/11] feat(columbia_merger): remove unused import

update comments
---
 cl/corpus_importer/import_columbia/columbia_utils.py  |  2 +-
 .../management/commands/columbia_merge.py             | 11 ++++++-----
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/cl/corpus_importer/import_columbia/columbia_utils.py b/cl/corpus_importer/import_columbia/columbia_utils.py
index 5cc00dd2d0..256f7d302f 100644
--- a/cl/corpus_importer/import_columbia/columbia_utils.py
+++ b/cl/corpus_importer/import_columbia/columbia_utils.py
@@ -554,7 +554,7 @@ def convert_columbia_html(text: str, opinion_index: int) -> str:
         )
 
         # We use opinion index to ensure that all footnotes are linked to the
-        # corresponding opinion
+        # corresponding opinion (when a case has multiple opinions)
         for ref in foot_references:
             if (match := re.search(r"[*\d]+", ref)) is not None:
                 f_num = match.group()
diff --git a/cl/corpus_importer/management/commands/columbia_merge.py b/cl/corpus_importer/management/commands/columbia_merge.py
index 7ec3c68fca..8c93be484f 100644
--- a/cl/corpus_importer/management/commands/columbia_merge.py
+++ b/cl/corpus_importer/management/commands/columbia_merge.py
@@ -26,7 +26,6 @@
 import pandas as pd
 from bs4 import BeautifulSoup
 from django.db import transaction
-from django.db.models import Q
 from juriscraper.lib.string_utils import titlecase
 
 from cl.corpus_importer.import_columbia.columbia_utils import (
@@ -154,7 +153,7 @@ def update_matching_opinions(
     """Store matching opinion content in html_columbia field from Opinion object
 
     :param matches: dict with matching position from cl and columbia opinions
-    :param cl_cleaned_opinions: list of cl opinions
+    :param cl_cleaned_opinions: list of cl opinions from a single cluster
     :param columbia_opinions: list of columbia opinions
     :param filepath: xml file from which the opinion was extracted
     :return: None
@@ -175,9 +174,10 @@ def update_matching_opinions(
         if op.author_str == "":
             # We have an empty author name
             if author_str:
-                # Store the name extracted from the author tag
+                # Store the name extracted from the author tag of the xml file
                 op.author_str = author_str
         else:
+            # The opinion already has an author in CL
             if author_str:
                 if (
                     find_just_name(op.author_str).lower()
@@ -215,7 +215,7 @@ def map_and_merge_opinions(
 ) -> None:
     """Map and merge opinion data
 
-    :param cluster_id: Cluster id
+    :param cluster_id: Cluster id to merge with
     :param columbia_opinions: list of columbia opinions from file
     :param filepath: xml file from which the opinion was extracted
     :return: None
@@ -240,7 +240,8 @@ def map_and_merge_opinions(
             [op.get("opinion") for op in cl_cleaned_opinions],
         )
         if len(matches) == len(columbia_opinions):
-            # We were able to match opinions, add opinions to html_columbia field
+            # We were able to match all opinions, so add the opinion content
+            # to the html_columbia field
             update_matching_opinions(
                 matches, cl_cleaned_opinions, columbia_opinions, filepath
             )