From 0b3e5fb585060c67b9a155fe5c8036d0f13ea5d7 Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Tue, 13 Feb 2024 11:14:34 -0600 Subject: [PATCH 01/11] fix(columbia_merger): exclude combined opinions from cluster --- .../management/commands/columbia_merge.py | 64 +++++++++---------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/cl/corpus_importer/management/commands/columbia_merge.py b/cl/corpus_importer/management/commands/columbia_merge.py index ee280e12f6..fcf44cedb4 100644 --- a/cl/corpus_importer/management/commands/columbia_merge.py +++ b/cl/corpus_importer/management/commands/columbia_merge.py @@ -79,7 +79,6 @@ Docket.COLUMBIA_AND_RECAP_AND_SCRAPER_AND_IDB_AND_HARVARD, ] - VALID_MERGED_SOURCES = [ key for key in dict(SOURCES.NAMES).keys() @@ -120,7 +119,8 @@ def get_cl_opinion_content(cluster_id: int) -> list[dict[Any, Any]]: :return: list with opinion content from cl """ cl_cleaned_opinions = [] - opinions_from_cluster = Opinion.objects.filter(cluster_id=cluster_id) + opinions_from_cluster = Opinion.objects.filter(cluster_id=cluster_id).exclude( + type="010combined") is_harvard = False for i, op in enumerate(opinions_from_cluster): @@ -153,7 +153,7 @@ def get_cl_opinion_content(cluster_id: int) -> list[dict[Any, Any]]: def update_matching_opinions( - matches: dict, cl_cleaned_opinions: list, columbia_opinions: list + matches: dict, cl_cleaned_opinions: list, columbia_opinions: list ) -> None: """Store matching opinion content in html_columbia field from Opinion object @@ -183,8 +183,8 @@ def update_matching_opinions( else: if author_str: if ( - find_just_name(op.author_str).lower() - != find_just_name(author_str).lower() + find_just_name(op.author_str).lower() + != find_just_name(author_str).lower() ): # last resort, use distance between words to solve typos s = SequenceMatcher( @@ -209,8 +209,8 @@ def update_matching_opinions( def map_and_merge_opinions( - cluster_id: int, - columbia_opinions: list[dict], + cluster_id: int, + columbia_opinions: list[dict], ) -> None: """Map and merge opinion data @@ -275,7 +275,7 @@ def map_and_merge_opinions( def merge_date_filed( - cluster: OpinionCluster, columbia_data: dict + cluster: OpinionCluster, columbia_data: dict ) -> dict[str, Any]: """Merge date filed @@ -318,10 +318,10 @@ def update_cluster_source(cluster: OpinionCluster) -> None: def merge_field( - cluster: OpinionCluster, - file_value: Optional[str], - field_name: str, - skip_judge_merger: bool = False, + cluster: OpinionCluster, + file_value: Optional[str], + field_name: str, + skip_judge_merger: bool = False, ) -> dict: """Try to merge the cluster data and file field data @@ -373,8 +373,8 @@ def merge_docket_data(docket_data: dict, cluster: OpinionCluster) -> None: merge_docket_numbers(cluster, docket_data["docket_number"]) cluster.docket.refresh_from_db() if ( - docket_data["date_cert_granted"] - and not cluster.docket.date_cert_granted + docket_data["date_cert_granted"] + and not cluster.docket.date_cert_granted ): data_to_update["date_cert_granted"] = docket_data["date_cert_granted"] @@ -388,8 +388,8 @@ def merge_docket_data(docket_data: dict, cluster: OpinionCluster) -> None: data_to_update["date_reargued"] = docket_data["date_reargued"] if ( - docket_data["date_reargument_denied"] - and not cluster.docket.date_reargument_denied + docket_data["date_reargument_denied"] + and not cluster.docket.date_reargument_denied ): data_to_update["date_reargument_denied"] = docket_data[ "date_reargument_denied" @@ -400,9 +400,9 @@ def merge_docket_data(docket_data: dict, cluster: OpinionCluster) -> None: def process_cluster( - cluster_id: int, - filepath: str, - skip_judge_merger: bool = False, + cluster_id: int, + filepath: str, + skip_judge_merger: bool = False, ) -> None: """Merge specified cluster id @@ -473,14 +473,14 @@ def process_cluster( k: v for k, v in columbia_data.items() if k - in [ - "docket_number", - "date_cert_granted", - "date_cert_denied", - "date_argued", - "date_reargued", - "date_reargument_denied", - ] + in [ + "docket_number", + "date_cert_granted", + "date_cert_denied", + "date_argued", + "date_reargued", + "date_reargument_denied", + ] } try: @@ -491,10 +491,10 @@ def process_cluster( for field in ["syllabus", "attorneys", "posture", "judges"]: columbia_value = columbia_data.get(field) if data := merge_field( - cluster, - columbia_value, - field, - skip_judge_merger=skip_judge_merger, + cluster, + columbia_value, + field, + skip_judge_merger=skip_judge_merger, ): merged_data.update(data) @@ -515,7 +515,7 @@ def process_cluster( # Merge results into a single dict data_to_update = ( - merged_data | case_names_to_update | date_filed_to_update + merged_data | case_names_to_update | date_filed_to_update ) if data_to_update: From efc3931cc84a3e4e980270bc8d5bdd984b3fc967 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 16 Feb 2024 19:47:09 +0000 Subject: [PATCH 02/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../management/commands/columbia_merge.py | 65 ++++++++++--------- 1 file changed, 33 insertions(+), 32 deletions(-) diff --git a/cl/corpus_importer/management/commands/columbia_merge.py b/cl/corpus_importer/management/commands/columbia_merge.py index fcf44cedb4..f7e827b01e 100644 --- a/cl/corpus_importer/management/commands/columbia_merge.py +++ b/cl/corpus_importer/management/commands/columbia_merge.py @@ -119,8 +119,9 @@ def get_cl_opinion_content(cluster_id: int) -> list[dict[Any, Any]]: :return: list with opinion content from cl """ cl_cleaned_opinions = [] - opinions_from_cluster = Opinion.objects.filter(cluster_id=cluster_id).exclude( - type="010combined") + opinions_from_cluster = Opinion.objects.filter( + cluster_id=cluster_id + ).exclude(type="010combined") is_harvard = False for i, op in enumerate(opinions_from_cluster): @@ -153,7 +154,7 @@ def get_cl_opinion_content(cluster_id: int) -> list[dict[Any, Any]]: def update_matching_opinions( - matches: dict, cl_cleaned_opinions: list, columbia_opinions: list + matches: dict, cl_cleaned_opinions: list, columbia_opinions: list ) -> None: """Store matching opinion content in html_columbia field from Opinion object @@ -183,8 +184,8 @@ def update_matching_opinions( else: if author_str: if ( - find_just_name(op.author_str).lower() - != find_just_name(author_str).lower() + find_just_name(op.author_str).lower() + != find_just_name(author_str).lower() ): # last resort, use distance between words to solve typos s = SequenceMatcher( @@ -209,8 +210,8 @@ def update_matching_opinions( def map_and_merge_opinions( - cluster_id: int, - columbia_opinions: list[dict], + cluster_id: int, + columbia_opinions: list[dict], ) -> None: """Map and merge opinion data @@ -275,7 +276,7 @@ def map_and_merge_opinions( def merge_date_filed( - cluster: OpinionCluster, columbia_data: dict + cluster: OpinionCluster, columbia_data: dict ) -> dict[str, Any]: """Merge date filed @@ -318,10 +319,10 @@ def update_cluster_source(cluster: OpinionCluster) -> None: def merge_field( - cluster: OpinionCluster, - file_value: Optional[str], - field_name: str, - skip_judge_merger: bool = False, + cluster: OpinionCluster, + file_value: Optional[str], + field_name: str, + skip_judge_merger: bool = False, ) -> dict: """Try to merge the cluster data and file field data @@ -373,8 +374,8 @@ def merge_docket_data(docket_data: dict, cluster: OpinionCluster) -> None: merge_docket_numbers(cluster, docket_data["docket_number"]) cluster.docket.refresh_from_db() if ( - docket_data["date_cert_granted"] - and not cluster.docket.date_cert_granted + docket_data["date_cert_granted"] + and not cluster.docket.date_cert_granted ): data_to_update["date_cert_granted"] = docket_data["date_cert_granted"] @@ -388,8 +389,8 @@ def merge_docket_data(docket_data: dict, cluster: OpinionCluster) -> None: data_to_update["date_reargued"] = docket_data["date_reargued"] if ( - docket_data["date_reargument_denied"] - and not cluster.docket.date_reargument_denied + docket_data["date_reargument_denied"] + and not cluster.docket.date_reargument_denied ): data_to_update["date_reargument_denied"] = docket_data[ "date_reargument_denied" @@ -400,9 +401,9 @@ def merge_docket_data(docket_data: dict, cluster: OpinionCluster) -> None: def process_cluster( - cluster_id: int, - filepath: str, - skip_judge_merger: bool = False, + cluster_id: int, + filepath: str, + skip_judge_merger: bool = False, ) -> None: """Merge specified cluster id @@ -473,14 +474,14 @@ def process_cluster( k: v for k, v in columbia_data.items() if k - in [ - "docket_number", - "date_cert_granted", - "date_cert_denied", - "date_argued", - "date_reargued", - "date_reargument_denied", - ] + in [ + "docket_number", + "date_cert_granted", + "date_cert_denied", + "date_argued", + "date_reargued", + "date_reargument_denied", + ] } try: @@ -491,10 +492,10 @@ def process_cluster( for field in ["syllabus", "attorneys", "posture", "judges"]: columbia_value = columbia_data.get(field) if data := merge_field( - cluster, - columbia_value, - field, - skip_judge_merger=skip_judge_merger, + cluster, + columbia_value, + field, + skip_judge_merger=skip_judge_merger, ): merged_data.update(data) @@ -515,7 +516,7 @@ def process_cluster( # Merge results into a single dict data_to_update = ( - merged_data | case_names_to_update | date_filed_to_update + merged_data | case_names_to_update | date_filed_to_update ) if data_to_update: From f9d4b87736a6e97ac2b5356b627af1a95c65a11b Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Fri, 16 Feb 2024 14:03:39 -0600 Subject: [PATCH 03/11] tests(columbia_merger): update tests --- cl/corpus_importer/tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cl/corpus_importer/tests.py b/cl/corpus_importer/tests.py index e5726e4a13..c2e0491b04 100644 --- a/cl/corpus_importer/tests.py +++ b/cl/corpus_importer/tests.py @@ -3235,7 +3235,7 @@ def test_merger(self): docket=DocketFactory(source=Docket.HARVARD), sub_opinions__data=[ { - "type": "010combined", + "type": "020lead", "xml_harvard": "

Lorem ipsum dolor sit amet, consectetur " "adipiscing elit. Nullam quis elit sed dui " "interdum feugiat.

", From 6be40ee8ddf0ef56d527628b36faff424b789226 Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Tue, 20 Feb 2024 14:40:33 -0600 Subject: [PATCH 04/11] fix(columbia_merger): improve date extraction --- .../import_columbia/columbia_utils.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/cl/corpus_importer/import_columbia/columbia_utils.py b/cl/corpus_importer/import_columbia/columbia_utils.py index a862788c36..5d61c99b0e 100644 --- a/cl/corpus_importer/import_columbia/columbia_utils.py +++ b/cl/corpus_importer/import_columbia/columbia_utils.py @@ -29,6 +29,22 @@ "affirmed and opinion filed", "dismissed and opinion filed", "decided and entered", + "memorandum opinion filed", + "memorandum opinion delivered and filed", + "granted", + "affirmed", + "submitted and decided", + "affirmed and memorandum opinion filed", + "memorandum filed", + "modified opinion filed", + "opinion modified and refiled", + "opinion filed on", + "opinion on merits filed", + "opinion delivered and filed on", + "order delivered and filed", + "date filed", + "opinion filed in", + "affirmed opinion filed", ] DECIDED_TAGS = ["decided", "date decided", "decided on", "decided date"] ARGUED_TAGS = [ From 65d89f46f879b34199fd66eb1e2637e6cd16daea Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Mon, 26 Feb 2024 12:15:51 -0600 Subject: [PATCH 05/11] fix(columbia_merger): handle invalid citation volume --- .../management/commands/columbia_merge.py | 6 +- cl/corpus_importer/utils.py | 136 ++++++++++-------- 2 files changed, 79 insertions(+), 63 deletions(-) diff --git a/cl/corpus_importer/management/commands/columbia_merge.py b/cl/corpus_importer/management/commands/columbia_merge.py index f7e827b01e..f8e0988360 100644 --- a/cl/corpus_importer/management/commands/columbia_merge.py +++ b/cl/corpus_importer/management/commands/columbia_merge.py @@ -54,7 +54,7 @@ merge_judges, merge_long_fields, merge_strings, - update_cluster_panel, + update_cluster_panel, CitationException, ) from cl.lib.command_utils import VerboseCommand, logger from cl.people_db.lookup_utils import extract_judge_last_name, find_just_name @@ -215,6 +215,8 @@ def map_and_merge_opinions( ) -> None: """Map and merge opinion data + # TODO handle combined opinions + :param cluster_id: Cluster id :param columbia_opinions: list of columbia opinions from file :return: None @@ -545,6 +547,8 @@ def process_cluster( ) except JudgeException: logger.warning(msg=f"Judge exception for cluster id: {cluster_id}") + except CitationException: + logger.warning(msg=f"Invalid citation found in {filepath } while merging cluster id: {cluster_id}") def merge_columbia_into_cl(options) -> None: diff --git a/cl/corpus_importer/utils.py b/cl/corpus_importer/utils.py index 98f73dfe84..d1a13c8c51 100644 --- a/cl/corpus_importer/utils.py +++ b/cl/corpus_importer/utils.py @@ -82,6 +82,13 @@ def __init__(self, message: str) -> None: self.message = message +class CitationException(Exception): + """Error found in cite.""" + + def __init__(self, message: str) -> None: + self.message = message + + async def mark_ia_upload_needed(d: Docket, save_docket: bool) -> None: """Mark the docket as needing upload if it's not already marked. @@ -143,9 +150,9 @@ def filter_subsets(lists: list[list[int]]) -> Iterator[list[int]]: for match in lists: if not any( - is_subset(match, other_matches) - for other_matches in lists - if match is not other_matches + is_subset(match, other_matches) + for other_matches in lists + if match is not other_matches ): yield match @@ -214,7 +221,7 @@ def compare_documents(file_characters: str, cl_characters: str) -> int: def similarity_scores( - texts_to_compare_1: list[str], texts_to_compare_2: list[str] + texts_to_compare_1: list[str], texts_to_compare_2: list[str] ) -> list[list[float]]: """Get similarity scores between two sets of lists @@ -239,13 +246,13 @@ def similarity_scores( # Calculate cosine similarity between weight of words for each text in list scores = cosine_similarity( - X[: len(texts_to_compare_1)], X[len(texts_to_compare_1) :] + X[: len(texts_to_compare_1)], X[len(texts_to_compare_1):] ) return scores def match_opinion_lists( - file_opinions_list: list[Any], cl_opinions_list: list[Any] + file_opinions_list: list[Any], cl_opinions_list: list[Any] ) -> dict[int, int]: """Match opinions on two lists from different sources @@ -322,7 +329,7 @@ def clean_docket_number(docket_number: str) -> str: def merge_docket_numbers( - cluster: OpinionCluster, docket_number: str + cluster: OpinionCluster, docket_number: str ) -> Optional[str]: """Merge docket number @@ -338,8 +345,8 @@ def merge_docket_numbers( # e.g. CL docket id #3952066 doesn't have cl_clean_docket = clean_docket_number(cl_docket.docket_number) if ( - cl_clean_docket in file_cleaned_docket - and cl_docket.docket_number != file_cleaned_docket + cl_clean_docket in file_cleaned_docket + and cl_docket.docket_number != file_cleaned_docket ): return file_cleaned_docket else: @@ -359,10 +366,10 @@ def merge_docket_numbers( def merge_case_names( - cluster: OpinionCluster, - file_data: dict[str, Any], - case_name_key: str, - case_name_full_key: str, + cluster: OpinionCluster, + file_data: dict[str, Any], + case_name_key: str, + case_name_full_key: str, ) -> dict[str, Any]: """Merge case names @@ -418,7 +425,7 @@ def merge_case_names( def merge_strings( - field_name: str, overlapping_data: tuple[str, str] + field_name: str, overlapping_data: tuple[str, str] ) -> dict[str, Any]: """Compare two strings and choose the largest @@ -437,9 +444,9 @@ def merge_strings( def merge_long_fields( - field_name: str, - overlapping_data: Optional[tuple[str, str]], - cluster_id: int, + field_name: str, + overlapping_data: Optional[tuple[str, str]], + cluster_id: int, ) -> dict[str, Any]: """Merge two long text fields @@ -468,10 +475,10 @@ def merge_long_fields( def merge_judges( - overlapping_data: Optional[tuple[str, str]], - cluster_id: int, - is_columbia: bool = False, - skip_judge_merger: bool = False, + overlapping_data: Optional[tuple[str, str]], + cluster_id: int, + is_columbia: bool = False, + skip_judge_merger: bool = False, ) -> dict[str, Any]: """Merge overlapping judge values @@ -501,7 +508,7 @@ def merge_judges( # Prepare judges string judges = titlecase(", ".join(find_all_judges(file_data))) if ( - temp_file_data_clean.issuperset(temp_cl_clean) or cl_data_upper + temp_file_data_clean.issuperset(temp_cl_clean) or cl_data_upper ) and file_data_cleaned != cl_clean: return {"judges": judges} elif not temp_file_data_clean.intersection(temp_cl_clean): @@ -554,11 +561,11 @@ def merge_judges( def merge_overlapping_data( - cluster: OpinionCluster, - long_fields, - changed_values_dictionary: dict, - skip_judge_merger: bool = False, - is_columbia: bool = False, + cluster: OpinionCluster, + long_fields, + changed_values_dictionary: dict, + skip_judge_merger: bool = False, + is_columbia: bool = False, ) -> dict[str, Any]: """Merge overlapping data @@ -618,9 +625,9 @@ def add_citations_to_cluster(cites: list[str], cluster_id: int) -> None: clean_cite = re.sub(r"\s+", " ", cite) citation = get_citations(clean_cite) if ( - not citation - or not isinstance(citation[0], FullCaseCitation) - or not citation[0].groups.get("volume", False) + not citation + or not isinstance(citation[0], FullCaseCitation) + or not citation[0].groups.get("volume", False) ): logger.warning(f"Citation parsing failed for {clean_cite}") continue @@ -632,7 +639,7 @@ def add_citations_to_cluster(cites: list[str], cluster_id: int) -> None: reporter_type = map_reporter_db_cite_type(cite_type_str) if Citation.objects.filter( - cluster_id=cluster_id, reporter=citation[0].corrected_reporter() + cluster_id=cluster_id, reporter=citation[0].corrected_reporter() ).exists(): # Avoid adding a citation if we already have a citation from the # citation's reporter @@ -654,12 +661,15 @@ def add_citations_to_cluster(cites: list[str], cluster_id: int) -> None: logger.warning( f"Reporter mismatch for cluster: {cluster_id} on cite: {cite}" ) + except ValueError: + # Handle: ValueError: Field ‘volume’ expected a number but got ‘1986-2’ + raise CitationException(f"Invalid citation found: {cite}") def update_cluster_panel( - cluster: OpinionCluster, - panel_list: list[str], - panel_date: Optional[date] = None, + cluster: OpinionCluster, + panel_list: list[str], + panel_date: Optional[date] = None, ) -> None: """Update cluster's panel @@ -786,21 +796,23 @@ def clean_body_content(case_body: str, harvard_file: bool = False) -> str: else: opinions = [] for op in soup.find_all( - lambda tag: ( - tag.name == "opinion" and tag.get("data-type") is None - ) - or tag.get("data-type") == "opinion" + lambda tag: ( + tag.name == "opinion" and tag.get( + "data-type") is None + ) + or tag.get("data-type") == "opinion" ): opinions.append(op.text) opinion_text = "".join( [ op.text for op in soup.find_all( - lambda tag: ( - tag.name == "opinion" and tag.get("data-type") is None - ) - or tag.get("data-type") == "opinion" - ) + lambda tag: ( + tag.name == "opinion" and tag.get( + "data-type") is None + ) + or tag.get("data-type") == "opinion" + ) ] ) @@ -808,7 +820,7 @@ def clean_body_content(case_body: str, harvard_file: bool = False) -> str: def length_too_different( - case: OpinionCluster, file_characters: str, cl_characters: str + case: OpinionCluster, file_characters: str, cl_characters: str ) -> bool: """Check if length is too different between texts @@ -829,10 +841,10 @@ def length_too_different( def content_too_different( - case: OpinionCluster, - file_characters: str, - cl_characters: str, - docket: str, + case: OpinionCluster, + file_characters: str, + cl_characters: str, + docket: str, ) -> bool: """Is the content too different @@ -881,7 +893,7 @@ def content_too_different( def case_names_dont_overlap( - case: OpinionCluster, case_name_full: str, case_name_abbreviation: str + case: OpinionCluster, case_name_full: str, case_name_abbreviation: str ) -> bool: """Case names not overlap @@ -904,7 +916,7 @@ def case_names_dont_overlap( def cosine_similarity_too_different( - case: OpinionCluster, case_name_full: str, case_name_abbreviation: str + case: OpinionCluster, case_name_full: str, case_name_abbreviation: str ) -> bool: """Cosine similarity comparison between case names @@ -929,7 +941,7 @@ def cosine_similarity_too_different( def has_too_similar_citation( - case: OpinionCluster, citation: FullCaseCitation + case: OpinionCluster, citation: FullCaseCitation ) -> bool: """Has a citation associated with cluster in same volume @@ -956,12 +968,12 @@ def has_too_similar_citation( def match_based_text( - file_characters: str, - docket_number: str, - case_name_full: str, - possible_cases: QuerySet, - case_name_abbreviation: str, - citation: FullCaseCitation, + file_characters: str, + docket_number: str, + case_name_full: str, + possible_cases: QuerySet, + case_name_abbreviation: str, + citation: FullCaseCitation, ) -> Optional[OpinionCluster]: """Compare CL text to file content to establish duplicates @@ -985,11 +997,11 @@ def match_based_text( case_and_texts_and_docket = case_and_texts + [docket_number] case_and_titles = [case, case_name_full, case_name_abbreviation] if ( - length_too_different(*case_and_texts) - or has_too_similar_citation(case, citation) - or case_names_dont_overlap(*case_and_titles) - or cosine_similarity_too_different(*case_and_titles) - or content_too_different(*case_and_texts_and_docket) + length_too_different(*case_and_texts) + or has_too_similar_citation(case, citation) + or case_names_dont_overlap(*case_and_titles) + or cosine_similarity_too_different(*case_and_titles) + or content_too_different(*case_and_texts_and_docket) ): continue return case From 4a9f4ecd1e82ba6adc4c04bf9703d5fc80a85a11 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 26 Feb 2024 18:17:02 +0000 Subject: [PATCH 06/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../management/commands/columbia_merge.py | 7 +- cl/corpus_importer/utils.py | 126 +++++++++--------- 2 files changed, 67 insertions(+), 66 deletions(-) diff --git a/cl/corpus_importer/management/commands/columbia_merge.py b/cl/corpus_importer/management/commands/columbia_merge.py index f8e0988360..b5ab5e6e46 100644 --- a/cl/corpus_importer/management/commands/columbia_merge.py +++ b/cl/corpus_importer/management/commands/columbia_merge.py @@ -43,6 +43,7 @@ ) from cl.corpus_importer.utils import ( AuthorException, + CitationException, JudgeException, OpinionMatchingException, OpinionTypeException, @@ -54,7 +55,7 @@ merge_judges, merge_long_fields, merge_strings, - update_cluster_panel, CitationException, + update_cluster_panel, ) from cl.lib.command_utils import VerboseCommand, logger from cl.people_db.lookup_utils import extract_judge_last_name, find_just_name @@ -548,7 +549,9 @@ def process_cluster( except JudgeException: logger.warning(msg=f"Judge exception for cluster id: {cluster_id}") except CitationException: - logger.warning(msg=f"Invalid citation found in {filepath } while merging cluster id: {cluster_id}") + logger.warning( + msg=f"Invalid citation found in {filepath } while merging cluster id: {cluster_id}" + ) def merge_columbia_into_cl(options) -> None: diff --git a/cl/corpus_importer/utils.py b/cl/corpus_importer/utils.py index d1a13c8c51..d82c376b54 100644 --- a/cl/corpus_importer/utils.py +++ b/cl/corpus_importer/utils.py @@ -150,9 +150,9 @@ def filter_subsets(lists: list[list[int]]) -> Iterator[list[int]]: for match in lists: if not any( - is_subset(match, other_matches) - for other_matches in lists - if match is not other_matches + is_subset(match, other_matches) + for other_matches in lists + if match is not other_matches ): yield match @@ -221,7 +221,7 @@ def compare_documents(file_characters: str, cl_characters: str) -> int: def similarity_scores( - texts_to_compare_1: list[str], texts_to_compare_2: list[str] + texts_to_compare_1: list[str], texts_to_compare_2: list[str] ) -> list[list[float]]: """Get similarity scores between two sets of lists @@ -246,13 +246,13 @@ def similarity_scores( # Calculate cosine similarity between weight of words for each text in list scores = cosine_similarity( - X[: len(texts_to_compare_1)], X[len(texts_to_compare_1):] + X[: len(texts_to_compare_1)], X[len(texts_to_compare_1) :] ) return scores def match_opinion_lists( - file_opinions_list: list[Any], cl_opinions_list: list[Any] + file_opinions_list: list[Any], cl_opinions_list: list[Any] ) -> dict[int, int]: """Match opinions on two lists from different sources @@ -329,7 +329,7 @@ def clean_docket_number(docket_number: str) -> str: def merge_docket_numbers( - cluster: OpinionCluster, docket_number: str + cluster: OpinionCluster, docket_number: str ) -> Optional[str]: """Merge docket number @@ -345,8 +345,8 @@ def merge_docket_numbers( # e.g. CL docket id #3952066 doesn't have cl_clean_docket = clean_docket_number(cl_docket.docket_number) if ( - cl_clean_docket in file_cleaned_docket - and cl_docket.docket_number != file_cleaned_docket + cl_clean_docket in file_cleaned_docket + and cl_docket.docket_number != file_cleaned_docket ): return file_cleaned_docket else: @@ -366,10 +366,10 @@ def merge_docket_numbers( def merge_case_names( - cluster: OpinionCluster, - file_data: dict[str, Any], - case_name_key: str, - case_name_full_key: str, + cluster: OpinionCluster, + file_data: dict[str, Any], + case_name_key: str, + case_name_full_key: str, ) -> dict[str, Any]: """Merge case names @@ -425,7 +425,7 @@ def merge_case_names( def merge_strings( - field_name: str, overlapping_data: tuple[str, str] + field_name: str, overlapping_data: tuple[str, str] ) -> dict[str, Any]: """Compare two strings and choose the largest @@ -444,9 +444,9 @@ def merge_strings( def merge_long_fields( - field_name: str, - overlapping_data: Optional[tuple[str, str]], - cluster_id: int, + field_name: str, + overlapping_data: Optional[tuple[str, str]], + cluster_id: int, ) -> dict[str, Any]: """Merge two long text fields @@ -475,10 +475,10 @@ def merge_long_fields( def merge_judges( - overlapping_data: Optional[tuple[str, str]], - cluster_id: int, - is_columbia: bool = False, - skip_judge_merger: bool = False, + overlapping_data: Optional[tuple[str, str]], + cluster_id: int, + is_columbia: bool = False, + skip_judge_merger: bool = False, ) -> dict[str, Any]: """Merge overlapping judge values @@ -508,7 +508,7 @@ def merge_judges( # Prepare judges string judges = titlecase(", ".join(find_all_judges(file_data))) if ( - temp_file_data_clean.issuperset(temp_cl_clean) or cl_data_upper + temp_file_data_clean.issuperset(temp_cl_clean) or cl_data_upper ) and file_data_cleaned != cl_clean: return {"judges": judges} elif not temp_file_data_clean.intersection(temp_cl_clean): @@ -561,11 +561,11 @@ def merge_judges( def merge_overlapping_data( - cluster: OpinionCluster, - long_fields, - changed_values_dictionary: dict, - skip_judge_merger: bool = False, - is_columbia: bool = False, + cluster: OpinionCluster, + long_fields, + changed_values_dictionary: dict, + skip_judge_merger: bool = False, + is_columbia: bool = False, ) -> dict[str, Any]: """Merge overlapping data @@ -625,9 +625,9 @@ def add_citations_to_cluster(cites: list[str], cluster_id: int) -> None: clean_cite = re.sub(r"\s+", " ", cite) citation = get_citations(clean_cite) if ( - not citation - or not isinstance(citation[0], FullCaseCitation) - or not citation[0].groups.get("volume", False) + not citation + or not isinstance(citation[0], FullCaseCitation) + or not citation[0].groups.get("volume", False) ): logger.warning(f"Citation parsing failed for {clean_cite}") continue @@ -639,7 +639,7 @@ def add_citations_to_cluster(cites: list[str], cluster_id: int) -> None: reporter_type = map_reporter_db_cite_type(cite_type_str) if Citation.objects.filter( - cluster_id=cluster_id, reporter=citation[0].corrected_reporter() + cluster_id=cluster_id, reporter=citation[0].corrected_reporter() ).exists(): # Avoid adding a citation if we already have a citation from the # citation's reporter @@ -667,9 +667,9 @@ def add_citations_to_cluster(cites: list[str], cluster_id: int) -> None: def update_cluster_panel( - cluster: OpinionCluster, - panel_list: list[str], - panel_date: Optional[date] = None, + cluster: OpinionCluster, + panel_list: list[str], + panel_date: Optional[date] = None, ) -> None: """Update cluster's panel @@ -796,23 +796,21 @@ def clean_body_content(case_body: str, harvard_file: bool = False) -> str: else: opinions = [] for op in soup.find_all( - lambda tag: ( - tag.name == "opinion" and tag.get( - "data-type") is None - ) - or tag.get("data-type") == "opinion" + lambda tag: ( + tag.name == "opinion" and tag.get("data-type") is None + ) + or tag.get("data-type") == "opinion" ): opinions.append(op.text) opinion_text = "".join( [ op.text for op in soup.find_all( - lambda tag: ( - tag.name == "opinion" and tag.get( - "data-type") is None - ) - or tag.get("data-type") == "opinion" - ) + lambda tag: ( + tag.name == "opinion" and tag.get("data-type") is None + ) + or tag.get("data-type") == "opinion" + ) ] ) @@ -820,7 +818,7 @@ def clean_body_content(case_body: str, harvard_file: bool = False) -> str: def length_too_different( - case: OpinionCluster, file_characters: str, cl_characters: str + case: OpinionCluster, file_characters: str, cl_characters: str ) -> bool: """Check if length is too different between texts @@ -841,10 +839,10 @@ def length_too_different( def content_too_different( - case: OpinionCluster, - file_characters: str, - cl_characters: str, - docket: str, + case: OpinionCluster, + file_characters: str, + cl_characters: str, + docket: str, ) -> bool: """Is the content too different @@ -893,7 +891,7 @@ def content_too_different( def case_names_dont_overlap( - case: OpinionCluster, case_name_full: str, case_name_abbreviation: str + case: OpinionCluster, case_name_full: str, case_name_abbreviation: str ) -> bool: """Case names not overlap @@ -916,7 +914,7 @@ def case_names_dont_overlap( def cosine_similarity_too_different( - case: OpinionCluster, case_name_full: str, case_name_abbreviation: str + case: OpinionCluster, case_name_full: str, case_name_abbreviation: str ) -> bool: """Cosine similarity comparison between case names @@ -941,7 +939,7 @@ def cosine_similarity_too_different( def has_too_similar_citation( - case: OpinionCluster, citation: FullCaseCitation + case: OpinionCluster, citation: FullCaseCitation ) -> bool: """Has a citation associated with cluster in same volume @@ -968,12 +966,12 @@ def has_too_similar_citation( def match_based_text( - file_characters: str, - docket_number: str, - case_name_full: str, - possible_cases: QuerySet, - case_name_abbreviation: str, - citation: FullCaseCitation, + file_characters: str, + docket_number: str, + case_name_full: str, + possible_cases: QuerySet, + case_name_abbreviation: str, + citation: FullCaseCitation, ) -> Optional[OpinionCluster]: """Compare CL text to file content to establish duplicates @@ -997,11 +995,11 @@ def match_based_text( case_and_texts_and_docket = case_and_texts + [docket_number] case_and_titles = [case, case_name_full, case_name_abbreviation] if ( - length_too_different(*case_and_texts) - or has_too_similar_citation(case, citation) - or case_names_dont_overlap(*case_and_titles) - or cosine_similarity_too_different(*case_and_titles) - or content_too_different(*case_and_texts_and_docket) + length_too_different(*case_and_texts) + or has_too_similar_citation(case, citation) + or case_names_dont_overlap(*case_and_titles) + or cosine_similarity_too_different(*case_and_titles) + or content_too_different(*case_and_texts_and_docket) ): continue return case From b2432e5eabfc3425ca59e9a501fd46f9b410e9d1 Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Mon, 26 Feb 2024 14:39:06 -0600 Subject: [PATCH 07/11] fix(columbia_merger): handle invalid citation volume --- .../management/commands/columbia_merge.py | 90 +++++++++++-------- 1 file changed, 52 insertions(+), 38 deletions(-) diff --git a/cl/corpus_importer/management/commands/columbia_merge.py b/cl/corpus_importer/management/commands/columbia_merge.py index f8e0988360..c61c918eaf 100644 --- a/cl/corpus_importer/management/commands/columbia_merge.py +++ b/cl/corpus_importer/management/commands/columbia_merge.py @@ -112,16 +112,24 @@ def clean_opinion_content(content: str, is_harvard: bool) -> str: return prep_text -def get_cl_opinion_content(cluster_id: int) -> list[dict[Any, Any]]: +def get_cl_opinion_content(cluster_id: int, columbia_single_opinion: bool = False) -> list[dict[Any, Any]]: """Get the opinions content for a cluster object :param cluster_id: Cluster ID for a set of opinions + :param columbia_single_opinion: True if xml file only has one opinion else False :return: list with opinion content from cl """ cl_cleaned_opinions = [] + + # Get all opinions from cluster opinions_from_cluster = Opinion.objects.filter( cluster_id=cluster_id - ).exclude(type="010combined") + ) + + if not columbia_single_opinion: + # File has multiple opinions, then we can exclude combined opinions + opinions_from_cluster = opinions_from_cluster.exclude(type="010combined") + is_harvard = False for i, op in enumerate(opinions_from_cluster): @@ -154,7 +162,7 @@ def get_cl_opinion_content(cluster_id: int) -> list[dict[Any, Any]]: def update_matching_opinions( - matches: dict, cl_cleaned_opinions: list, columbia_opinions: list + matches: dict, cl_cleaned_opinions: list, columbia_opinions: list ) -> None: """Store matching opinion content in html_columbia field from Opinion object @@ -184,8 +192,8 @@ def update_matching_opinions( else: if author_str: if ( - find_just_name(op.author_str).lower() - != find_just_name(author_str).lower() + find_just_name(op.author_str).lower() + != find_just_name(author_str).lower() ): # last resort, use distance between words to solve typos s = SequenceMatcher( @@ -210,19 +218,21 @@ def update_matching_opinions( def map_and_merge_opinions( - cluster_id: int, - columbia_opinions: list[dict], + cluster_id: int, + columbia_opinions: list[dict], ) -> None: """Map and merge opinion data - # TODO handle combined opinions - :param cluster_id: Cluster id :param columbia_opinions: list of columbia opinions from file :return: None """ - cl_cleaned_opinions = get_cl_opinion_content(cluster_id) + # Check if columbia source only has one opinion + columbia_single_opinion = True if len(columbia_opinions) == 1 else False + + # We exclude combined opinions only if we have more than one opinion in the xml + cl_cleaned_opinions = get_cl_opinion_content(cluster_id, columbia_single_opinion) if len(columbia_opinions) == len(cl_cleaned_opinions): # We need that both list to be cleaned, so we can have a more @@ -235,13 +245,17 @@ def map_and_merge_opinions( [op.get("opinion") for op in cl_cleaned_opinions], ) if len(matches) == len(columbia_opinions): + # We were able to match opinions, add opinions to html_columbia field update_matching_opinions( matches, cl_cleaned_opinions, columbia_opinions ) else: raise OpinionMatchingException("Failed to match opinions") - elif len(columbia_opinions) > len(cl_cleaned_opinions) == 1: + elif (len(columbia_opinions) > len(cl_cleaned_opinions)) and len(cl_cleaned_opinions) == 0: + # We have more opinions in file than in CL and if cl_cleaned_opinions == 0 it + # means that we probably excluded the combined opinion, we create each + # opinion from file for op in columbia_opinions: opinion_type = op.get("type") file = op.get("file") @@ -278,7 +292,7 @@ def map_and_merge_opinions( def merge_date_filed( - cluster: OpinionCluster, columbia_data: dict + cluster: OpinionCluster, columbia_data: dict ) -> dict[str, Any]: """Merge date filed @@ -321,10 +335,10 @@ def update_cluster_source(cluster: OpinionCluster) -> None: def merge_field( - cluster: OpinionCluster, - file_value: Optional[str], - field_name: str, - skip_judge_merger: bool = False, + cluster: OpinionCluster, + file_value: Optional[str], + field_name: str, + skip_judge_merger: bool = False, ) -> dict: """Try to merge the cluster data and file field data @@ -376,8 +390,8 @@ def merge_docket_data(docket_data: dict, cluster: OpinionCluster) -> None: merge_docket_numbers(cluster, docket_data["docket_number"]) cluster.docket.refresh_from_db() if ( - docket_data["date_cert_granted"] - and not cluster.docket.date_cert_granted + docket_data["date_cert_granted"] + and not cluster.docket.date_cert_granted ): data_to_update["date_cert_granted"] = docket_data["date_cert_granted"] @@ -391,8 +405,8 @@ def merge_docket_data(docket_data: dict, cluster: OpinionCluster) -> None: data_to_update["date_reargued"] = docket_data["date_reargued"] if ( - docket_data["date_reargument_denied"] - and not cluster.docket.date_reargument_denied + docket_data["date_reargument_denied"] + and not cluster.docket.date_reargument_denied ): data_to_update["date_reargument_denied"] = docket_data[ "date_reargument_denied" @@ -403,9 +417,9 @@ def merge_docket_data(docket_data: dict, cluster: OpinionCluster) -> None: def process_cluster( - cluster_id: int, - filepath: str, - skip_judge_merger: bool = False, + cluster_id: int, + filepath: str, + skip_judge_merger: bool = False, ) -> None: """Merge specified cluster id @@ -475,15 +489,14 @@ def process_cluster( docket_data = { k: v for k, v in columbia_data.items() - if k - in [ - "docket_number", - "date_cert_granted", - "date_cert_denied", - "date_argued", - "date_reargued", - "date_reargument_denied", - ] + if k in [ + "docket_number", + "date_cert_granted", + "date_cert_denied", + "date_argued", + "date_reargued", + "date_reargument_denied", + ] } try: @@ -494,10 +507,10 @@ def process_cluster( for field in ["syllabus", "attorneys", "posture", "judges"]: columbia_value = columbia_data.get(field) if data := merge_field( - cluster, - columbia_value, - field, - skip_judge_merger=skip_judge_merger, + cluster, + columbia_value, + field, + skip_judge_merger=skip_judge_merger, ): merged_data.update(data) @@ -518,7 +531,7 @@ def process_cluster( # Merge results into a single dict data_to_update = ( - merged_data | case_names_to_update | date_filed_to_update + merged_data | case_names_to_update | date_filed_to_update ) if data_to_update: @@ -548,7 +561,8 @@ def process_cluster( except JudgeException: logger.warning(msg=f"Judge exception for cluster id: {cluster_id}") except CitationException: - logger.warning(msg=f"Invalid citation found in {filepath } while merging cluster id: {cluster_id}") + logger.warning( + msg=f"Invalid citation found in {filepath} while merging cluster id: {cluster_id}") def merge_columbia_into_cl(options) -> None: From 78adef707b8a8f3d950eecd45f59bf610ae8f834 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 26 Feb 2024 20:41:17 +0000 Subject: [PATCH 08/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../management/commands/columbia_merge.py | 87 ++++++++++--------- 1 file changed, 48 insertions(+), 39 deletions(-) diff --git a/cl/corpus_importer/management/commands/columbia_merge.py b/cl/corpus_importer/management/commands/columbia_merge.py index c61c918eaf..9e2cd6b205 100644 --- a/cl/corpus_importer/management/commands/columbia_merge.py +++ b/cl/corpus_importer/management/commands/columbia_merge.py @@ -43,6 +43,7 @@ ) from cl.corpus_importer.utils import ( AuthorException, + CitationException, JudgeException, OpinionMatchingException, OpinionTypeException, @@ -54,7 +55,7 @@ merge_judges, merge_long_fields, merge_strings, - update_cluster_panel, CitationException, + update_cluster_panel, ) from cl.lib.command_utils import VerboseCommand, logger from cl.people_db.lookup_utils import extract_judge_last_name, find_just_name @@ -112,7 +113,9 @@ def clean_opinion_content(content: str, is_harvard: bool) -> str: return prep_text -def get_cl_opinion_content(cluster_id: int, columbia_single_opinion: bool = False) -> list[dict[Any, Any]]: +def get_cl_opinion_content( + cluster_id: int, columbia_single_opinion: bool = False +) -> list[dict[Any, Any]]: """Get the opinions content for a cluster object :param cluster_id: Cluster ID for a set of opinions @@ -122,13 +125,13 @@ def get_cl_opinion_content(cluster_id: int, columbia_single_opinion: bool = Fals cl_cleaned_opinions = [] # Get all opinions from cluster - opinions_from_cluster = Opinion.objects.filter( - cluster_id=cluster_id - ) + opinions_from_cluster = Opinion.objects.filter(cluster_id=cluster_id) if not columbia_single_opinion: # File has multiple opinions, then we can exclude combined opinions - opinions_from_cluster = opinions_from_cluster.exclude(type="010combined") + opinions_from_cluster = opinions_from_cluster.exclude( + type="010combined" + ) is_harvard = False @@ -162,7 +165,7 @@ def get_cl_opinion_content(cluster_id: int, columbia_single_opinion: bool = Fals def update_matching_opinions( - matches: dict, cl_cleaned_opinions: list, columbia_opinions: list + matches: dict, cl_cleaned_opinions: list, columbia_opinions: list ) -> None: """Store matching opinion content in html_columbia field from Opinion object @@ -192,8 +195,8 @@ def update_matching_opinions( else: if author_str: if ( - find_just_name(op.author_str).lower() - != find_just_name(author_str).lower() + find_just_name(op.author_str).lower() + != find_just_name(author_str).lower() ): # last resort, use distance between words to solve typos s = SequenceMatcher( @@ -218,8 +221,8 @@ def update_matching_opinions( def map_and_merge_opinions( - cluster_id: int, - columbia_opinions: list[dict], + cluster_id: int, + columbia_opinions: list[dict], ) -> None: """Map and merge opinion data @@ -232,7 +235,9 @@ def map_and_merge_opinions( columbia_single_opinion = True if len(columbia_opinions) == 1 else False # We exclude combined opinions only if we have more than one opinion in the xml - cl_cleaned_opinions = get_cl_opinion_content(cluster_id, columbia_single_opinion) + cl_cleaned_opinions = get_cl_opinion_content( + cluster_id, columbia_single_opinion + ) if len(columbia_opinions) == len(cl_cleaned_opinions): # We need that both list to be cleaned, so we can have a more @@ -252,7 +257,9 @@ def map_and_merge_opinions( else: raise OpinionMatchingException("Failed to match opinions") - elif (len(columbia_opinions) > len(cl_cleaned_opinions)) and len(cl_cleaned_opinions) == 0: + elif (len(columbia_opinions) > len(cl_cleaned_opinions)) and len( + cl_cleaned_opinions + ) == 0: # We have more opinions in file than in CL and if cl_cleaned_opinions == 0 it # means that we probably excluded the combined opinion, we create each # opinion from file @@ -292,7 +299,7 @@ def map_and_merge_opinions( def merge_date_filed( - cluster: OpinionCluster, columbia_data: dict + cluster: OpinionCluster, columbia_data: dict ) -> dict[str, Any]: """Merge date filed @@ -335,10 +342,10 @@ def update_cluster_source(cluster: OpinionCluster) -> None: def merge_field( - cluster: OpinionCluster, - file_value: Optional[str], - field_name: str, - skip_judge_merger: bool = False, + cluster: OpinionCluster, + file_value: Optional[str], + field_name: str, + skip_judge_merger: bool = False, ) -> dict: """Try to merge the cluster data and file field data @@ -390,8 +397,8 @@ def merge_docket_data(docket_data: dict, cluster: OpinionCluster) -> None: merge_docket_numbers(cluster, docket_data["docket_number"]) cluster.docket.refresh_from_db() if ( - docket_data["date_cert_granted"] - and not cluster.docket.date_cert_granted + docket_data["date_cert_granted"] + and not cluster.docket.date_cert_granted ): data_to_update["date_cert_granted"] = docket_data["date_cert_granted"] @@ -405,8 +412,8 @@ def merge_docket_data(docket_data: dict, cluster: OpinionCluster) -> None: data_to_update["date_reargued"] = docket_data["date_reargued"] if ( - docket_data["date_reargument_denied"] - and not cluster.docket.date_reargument_denied + docket_data["date_reargument_denied"] + and not cluster.docket.date_reargument_denied ): data_to_update["date_reargument_denied"] = docket_data[ "date_reargument_denied" @@ -417,9 +424,9 @@ def merge_docket_data(docket_data: dict, cluster: OpinionCluster) -> None: def process_cluster( - cluster_id: int, - filepath: str, - skip_judge_merger: bool = False, + cluster_id: int, + filepath: str, + skip_judge_merger: bool = False, ) -> None: """Merge specified cluster id @@ -489,14 +496,15 @@ def process_cluster( docket_data = { k: v for k, v in columbia_data.items() - if k in [ - "docket_number", - "date_cert_granted", - "date_cert_denied", - "date_argued", - "date_reargued", - "date_reargument_denied", - ] + if k + in [ + "docket_number", + "date_cert_granted", + "date_cert_denied", + "date_argued", + "date_reargued", + "date_reargument_denied", + ] } try: @@ -507,10 +515,10 @@ def process_cluster( for field in ["syllabus", "attorneys", "posture", "judges"]: columbia_value = columbia_data.get(field) if data := merge_field( - cluster, - columbia_value, - field, - skip_judge_merger=skip_judge_merger, + cluster, + columbia_value, + field, + skip_judge_merger=skip_judge_merger, ): merged_data.update(data) @@ -531,7 +539,7 @@ def process_cluster( # Merge results into a single dict data_to_update = ( - merged_data | case_names_to_update | date_filed_to_update + merged_data | case_names_to_update | date_filed_to_update ) if data_to_update: @@ -562,7 +570,8 @@ def process_cluster( logger.warning(msg=f"Judge exception for cluster id: {cluster_id}") except CitationException: logger.warning( - msg=f"Invalid citation found in {filepath} while merging cluster id: {cluster_id}") + msg=f"Invalid citation found in {filepath} while merging cluster id: {cluster_id}" + ) def merge_columbia_into_cl(options) -> None: From 0e699c9fcec9f7250184b64a260cc1dd6dd203a3 Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Fri, 1 Mar 2024 10:23:03 -0600 Subject: [PATCH 09/11] fix(columbia_merger): save local_path when creating a new opinion --- .../management/commands/columbia_merge.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/cl/corpus_importer/management/commands/columbia_merge.py b/cl/corpus_importer/management/commands/columbia_merge.py index 9e2cd6b205..c9f88b72d9 100644 --- a/cl/corpus_importer/management/commands/columbia_merge.py +++ b/cl/corpus_importer/management/commands/columbia_merge.py @@ -165,7 +165,10 @@ def get_cl_opinion_content( def update_matching_opinions( - matches: dict, cl_cleaned_opinions: list, columbia_opinions: list + matches: dict, + cl_cleaned_opinions: list, + columbia_opinions: list, + filepath: str, ) -> None: """Store matching opinion content in html_columbia field from Opinion object @@ -223,11 +226,13 @@ def update_matching_opinions( def map_and_merge_opinions( cluster_id: int, columbia_opinions: list[dict], + filepath: str, ) -> None: """Map and merge opinion data :param cluster_id: Cluster id :param columbia_opinions: list of columbia opinions from file + :param filepath: xml file from which the opinion was extracted :return: None """ @@ -252,7 +257,7 @@ def map_and_merge_opinions( if len(matches) == len(columbia_opinions): # We were able to match opinions, add opinions to html_columbia field update_matching_opinions( - matches, cl_cleaned_opinions, columbia_opinions + matches, cl_cleaned_opinions, columbia_opinions, filepath ) else: raise OpinionMatchingException("Failed to match opinions") @@ -279,6 +284,7 @@ def map_and_merge_opinions( per_curiam=op["per_curiam"], cluster_id=cluster_id, type=opinion_type, + local_path=filepath, author_str=( titlecase(find_just_name(author.strip(":"))) if author @@ -509,7 +515,9 @@ def process_cluster( try: with transaction.atomic(): - map_and_merge_opinions(cluster_id, columbia_data["opinions"]) + map_and_merge_opinions( + cluster_id, columbia_data["opinions"], filepath + ) merged_data = {} for field in ["syllabus", "attorneys", "posture", "judges"]: From 30bbab25715381885134369b5430db3cf0ebbcc8 Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Mon, 6 May 2024 17:27:29 -0600 Subject: [PATCH 10/11] feat(columbia_merge): save file path for opinion when possible --- cl/corpus_importer/management/commands/columbia_merge.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cl/corpus_importer/management/commands/columbia_merge.py b/cl/corpus_importer/management/commands/columbia_merge.py index e5564d84e3..7ec3c68fca 100644 --- a/cl/corpus_importer/management/commands/columbia_merge.py +++ b/cl/corpus_importer/management/commands/columbia_merge.py @@ -156,6 +156,7 @@ def update_matching_opinions( :param matches: dict with matching position from cl and columbia opinions :param cl_cleaned_opinions: list of cl opinions :param columbia_opinions: list of columbia opinions + :param filepath: xml file from which the opinion was extracted :return: None """ for columbia_pos, cl_pos in matches.items(): @@ -201,6 +202,9 @@ def update_matching_opinions( file_opinion["opinion"], columbia_pos ) op.html_columbia = str(converted_text) + if not op.local_path: + # Store file path only if it is empty in the Opinion object + op.local_path = filepath op.save() From 156a2eccd3aaeaaf8db3761f07bcfeff9d6f91ae Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Thu, 23 May 2024 14:04:06 -0600 Subject: [PATCH 11/11] feat(columbia_merger): remove unused import update comments --- cl/corpus_importer/import_columbia/columbia_utils.py | 2 +- .../management/commands/columbia_merge.py | 11 ++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/cl/corpus_importer/import_columbia/columbia_utils.py b/cl/corpus_importer/import_columbia/columbia_utils.py index 5cc00dd2d0..256f7d302f 100644 --- a/cl/corpus_importer/import_columbia/columbia_utils.py +++ b/cl/corpus_importer/import_columbia/columbia_utils.py @@ -554,7 +554,7 @@ def convert_columbia_html(text: str, opinion_index: int) -> str: ) # We use opinion index to ensure that all footnotes are linked to the - # corresponding opinion + # corresponding opinion (when a case has multiple opinions) for ref in foot_references: if (match := re.search(r"[*\d]+", ref)) is not None: f_num = match.group() diff --git a/cl/corpus_importer/management/commands/columbia_merge.py b/cl/corpus_importer/management/commands/columbia_merge.py index 7ec3c68fca..8c93be484f 100644 --- a/cl/corpus_importer/management/commands/columbia_merge.py +++ b/cl/corpus_importer/management/commands/columbia_merge.py @@ -26,7 +26,6 @@ import pandas as pd from bs4 import BeautifulSoup from django.db import transaction -from django.db.models import Q from juriscraper.lib.string_utils import titlecase from cl.corpus_importer.import_columbia.columbia_utils import ( @@ -154,7 +153,7 @@ def update_matching_opinions( """Store matching opinion content in html_columbia field from Opinion object :param matches: dict with matching position from cl and columbia opinions - :param cl_cleaned_opinions: list of cl opinions + :param cl_cleaned_opinions: list of cl opinions from a single cluster :param columbia_opinions: list of columbia opinions :param filepath: xml file from which the opinion was extracted :return: None @@ -175,9 +174,10 @@ def update_matching_opinions( if op.author_str == "": # We have an empty author name if author_str: - # Store the name extracted from the author tag + # Store the name extracted from the author tag of the xml file op.author_str = author_str else: + # opinion already has an author in cl if author_str: if ( find_just_name(op.author_str).lower() @@ -215,7 +215,7 @@ def map_and_merge_opinions( ) -> None: """Map and merge opinion data - :param cluster_id: Cluster id + :param cluster_id: Cluster id to merge with :param columbia_opinions: list of columbia opinions from file :param filepath: xml file from which the opinion was extracted :return: None @@ -240,7 +240,8 @@ def map_and_merge_opinions( [op.get("opinion") for op in cl_cleaned_opinions], ) if len(matches) == len(columbia_opinions): - # We were able to match opinions, add opinions to html_columbia field + # We were able to match all opinions, add opinion content to + # html_columbia field update_matching_opinions( matches, cl_cleaned_opinions, columbia_opinions, filepath )