diff --git a/cl/corpus_importer/import_columbia/columbia_utils.py b/cl/corpus_importer/import_columbia/columbia_utils.py index 8434568c7e..60945f92bd 100644 --- a/cl/corpus_importer/import_columbia/columbia_utils.py +++ b/cl/corpus_importer/import_columbia/columbia_utils.py @@ -29,6 +29,22 @@ "affirmed and opinion filed", "dismissed and opinion filed", "decided and entered", + "memorandum opinion filed", + "memorandum opinion delivered and filed", + "granted", + "affirmed", + "submitted and decided", + "affirmed and memorandum opinion filed", + "memorandum filed", + "modified opinion filed", + "opinion modified and refiled", + "opinion filed on", + "opinion on merits filed", + "opinion delivered and filed on", + "order delivered and filed", + "date filed", + "opinion filed in", + "affirmed opinion filed", ] DECIDED_TAGS = ["decided", "date decided", "decided on", "decided date"] ARGUED_TAGS = [ @@ -538,7 +554,7 @@ def convert_columbia_html(text: str, opinion_index: int) -> str: ) # We use opinion index to ensure that all footnotes are linked to the - # corresponding opinion + # corresponding opinion (when a case has multiple opinions) for ref in foot_references: if (match := re.search(r"[*\d]+", ref)) is not None: f_num = match.group() diff --git a/cl/corpus_importer/management/commands/columbia_merge.py b/cl/corpus_importer/management/commands/columbia_merge.py index a9f18fcce7..8c93be484f 100644 --- a/cl/corpus_importer/management/commands/columbia_merge.py +++ b/cl/corpus_importer/management/commands/columbia_merge.py @@ -26,7 +26,6 @@ import pandas as pd from bs4 import BeautifulSoup from django.db import transaction -from django.db.models import Q from juriscraper.lib.string_utils import titlecase from cl.corpus_importer.import_columbia.columbia_utils import ( @@ -43,6 +42,7 @@ ) from cl.corpus_importer.utils import ( AuthorException, + CitationException, JudgeException, OpinionMatchingException, OpinionTypeException, @@ -93,14 +93,26 @@ def clean_opinion_content(content: str, is_harvard: bool) -> str: return prep_text -def get_cl_opinion_content(cluster_id: int) -> list[dict[Any, Any]]: +def get_cl_opinion_content( + cluster_id: int, columbia_single_opinion: bool = False +) -> list[dict[Any, Any]]: """Get the opinions content for a cluster object :param cluster_id: Cluster ID for a set of opinions + :param columbia_single_opinion: True if xml file only has one opinion else False :return: list with opinion content from cl """ cl_cleaned_opinions = [] + + # Get all opinions from cluster opinions_from_cluster = Opinion.objects.filter(cluster_id=cluster_id) + + if not columbia_single_opinion: + # File has multiple opinions, then we can exclude combined opinions + opinions_from_cluster = opinions_from_cluster.exclude( + type="010combined" + ) + is_harvard = False for i, op in enumerate(opinions_from_cluster): @@ -133,13 +145,17 @@ def get_cl_opinion_content(cluster_id: int) -> list[dict[Any, Any]]: def update_matching_opinions( - matches: dict, cl_cleaned_opinions: list, columbia_opinions: list + matches: dict, + cl_cleaned_opinions: list, + columbia_opinions: list, + filepath: str, ) -> None: """Store matching opinion content in html_columbia field from Opinion object :param matches: dict with matching position from cl and columbia opinions - :param cl_cleaned_opinions: list of cl opinions + :param cl_cleaned_opinions: list of cl opinions from a single cluster :param columbia_opinions: list of columbia opinions + :param filepath: xml file from which the opinion was extracted :return: None """ for columbia_pos, cl_pos in matches.items(): @@ -158,9 +174,10 @@ def update_matching_opinions( if op.author_str == "": # We have an empty author name if author_str: - # Store the name extracted from the author tag + # Store the name extracted from the author tag of the xml file op.author_str = author_str else: + # opinion already has an author in cl if author_str: if ( find_just_name(op.author_str).lower() @@ -185,21 +202,32 @@ def update_matching_opinions( file_opinion["opinion"], columbia_pos ) op.html_columbia = str(converted_text) + if not op.local_path: + # Store file path only if it is empty in the Opinion object + op.local_path = filepath op.save() def map_and_merge_opinions( cluster_id: int, columbia_opinions: list[dict], + filepath: str, ) -> None: """Map and merge opinion data - :param cluster_id: Cluster id + :param cluster_id: Cluster id to merge with :param columbia_opinions: list of columbia opinions from file + :param filepath: xml file from which the opinion was extracted :return: None """ - cl_cleaned_opinions = get_cl_opinion_content(cluster_id) + # Check if columbia source only has one opinion + columbia_single_opinion = True if len(columbia_opinions) == 1 else False + + # We exclude combined opinions only if we have more than one opinion in the xml + cl_cleaned_opinions = get_cl_opinion_content( + cluster_id, columbia_single_opinion + ) if len(columbia_opinions) == len(cl_cleaned_opinions): # We need that both list to be cleaned, so we can have a more @@ -212,13 +240,20 @@ def map_and_merge_opinions( [op.get("opinion") for op in cl_cleaned_opinions], ) if len(matches) == len(columbia_opinions): + # We were able to match all opinions, add opinion content to + # html_columbia field update_matching_opinions( - matches, cl_cleaned_opinions, columbia_opinions + matches, cl_cleaned_opinions, columbia_opinions, filepath ) else: raise OpinionMatchingException("Failed to match opinions") - elif len(columbia_opinions) > len(cl_cleaned_opinions) == 1: + elif (len(columbia_opinions) > len(cl_cleaned_opinions)) and len( + cl_cleaned_opinions + ) == 0: + # We have more opinions in file than in CL and if cl_cleaned_opinions == 0 it + # means that we probably excluded the combined opinion, we create each + # opinion from file for op in columbia_opinions: opinion_type = op.get("type") file = op.get("file") @@ -235,6 +270,7 @@ def map_and_merge_opinions( per_curiam=op["per_curiam"], cluster_id=cluster_id, type=opinion_type, + local_path=filepath, author_str=( titlecase(find_just_name(author.strip(":"))) if author @@ -464,7 +500,9 @@ def process_cluster( try: with transaction.atomic(): - map_and_merge_opinions(cluster_id, columbia_data["opinions"]) + map_and_merge_opinions( + cluster_id, columbia_data["opinions"], filepath + ) merged_data = {} for field in ["syllabus", "attorneys", "posture", "judges"]: @@ -523,6 +561,10 @@ def process_cluster( ) except JudgeException: logger.warning(msg=f"Judge exception for cluster id: {cluster_id}") + except CitationException: + logger.warning( + msg=f"Invalid citation found in {filepath} while merging cluster id: {cluster_id}" + ) def merge_columbia_into_cl(options) -> None: diff --git a/cl/corpus_importer/tests.py b/cl/corpus_importer/tests.py index 5b3d858897..4ce7beb6ef 100644 --- a/cl/corpus_importer/tests.py +++ b/cl/corpus_importer/tests.py @@ -3289,7 +3289,7 @@ def test_merger(self): docket=DocketFactory(source=Docket.HARVARD), sub_opinions__data=[ { - "type": "010combined", + "type": "020lead", "xml_harvard": "
Lorem ipsum dolor sit amet, consectetur " "adipiscing elit. Nullam quis elit sed dui " "interdum feugiat.
", diff --git a/cl/corpus_importer/utils.py b/cl/corpus_importer/utils.py index efcab347a5..41f4df82d6 100644 --- a/cl/corpus_importer/utils.py +++ b/cl/corpus_importer/utils.py @@ -88,6 +88,13 @@ def __init__(self, message: str) -> None: self.message = message +class CitationException(Exception): + """Error found in cite.""" + + def __init__(self, message: str) -> None: + self.message = message + + async def mark_ia_upload_needed(d: Docket, save_docket: bool) -> None: """Mark the docket as needing upload if it's not already marked.