Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle combined opinions in columbia merger #3799

Open
wants to merge 42 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
0b3e5fb
fix(columbia_merger): exclude combined opinions from cluster
quevon24 Feb 13, 2024
7e5f98f
Merge branch 'main' into fix-columbia-merger
quevon24 Feb 16, 2024
efc3931
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 16, 2024
f9d4b87
tests(columbia_merger): update tests
quevon24 Feb 16, 2024
f30372c
Merge branch 'main' into fix-columbia-merger
quevon24 Feb 17, 2024
99942ea
Merge branch 'main' into fix-columbia-merger
quevon24 Feb 20, 2024
6be40ee
fix(columbia_merger): improve date extraction
quevon24 Feb 20, 2024
dde60d3
Merge branch 'main' into fix-columbia-merger
quevon24 Feb 21, 2024
c83dda6
Merge branch 'main' into fix-columbia-merger
quevon24 Feb 26, 2024
65d89f4
fix(columbia_merger): handle invalid citation volume
quevon24 Feb 26, 2024
f3f7604
Merge remote-tracking branch 'origin/fix-columbia-merger' into fix-co…
quevon24 Feb 26, 2024
4a9f4ec
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 26, 2024
b2432e5
fix(columbia_merger): handle invalid citation volume
quevon24 Feb 26, 2024
1fd2433
Merge remote-tracking branch 'origin/fix-columbia-merger' into fix-co…
quevon24 Feb 26, 2024
4971e00
Merge branch 'main' into fix-columbia-merger
quevon24 Feb 26, 2024
78adef7
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 26, 2024
2bfd205
Merge branch 'main' into fix-columbia-merger
quevon24 Feb 27, 2024
e0ef67f
Merge branch 'main' into fix-columbia-merger
quevon24 Feb 29, 2024
272716e
Merge branch 'main' into fix-columbia-merger
quevon24 Mar 1, 2024
0e699c9
fix(columbia_merger): save local_path when creating a new opinion
quevon24 Mar 1, 2024
ebd9eec
Merge branch 'main' into fix-columbia-merger
quevon24 Mar 26, 2024
3b8257d
Merge remote-tracking branch 'origin/main' into fix-columbia-merger
quevon24 May 4, 2024
5d7aff9
Merge branch 'main' into fix-columbia-merger
quevon24 May 6, 2024
30bbab2
feat(columbia_merge): save file path for opinion when possible
quevon24 May 6, 2024
2fc0aa9
Merge branch 'main' into fix-columbia-merger
quevon24 May 8, 2024
a7667b2
Merge branch 'main' into fix-columbia-merger
quevon24 May 8, 2024
c27dbe5
Merge branch 'main' into fix-columbia-merger
quevon24 May 9, 2024
bd822fd
Merge branch 'main' into fix-columbia-merger
quevon24 May 20, 2024
4fed27c
Merge branch 'main' into fix-columbia-merger
quevon24 May 22, 2024
156a2ec
feat(columbia_merger): remove unused import
quevon24 May 23, 2024
354ae5b
Merge branch 'main' into fix-columbia-merger
quevon24 Jun 5, 2024
e767eac
Merge branch 'main' into fix-columbia-merger
flooie Jul 10, 2024
eb68a53
Merge branch 'main' into fix-columbia-merger
flooie Jul 12, 2024
b2ee005
Merge branch 'main' into fix-columbia-merger
quevon24 Jul 15, 2024
177d721
Merge branch 'main' into fix-columbia-merger
quevon24 Jul 30, 2024
0d2b27f
Merge branch 'main' into fix-columbia-merger
flooie Aug 2, 2024
4cf0b37
Merge remote-tracking branch 'origin/main' into fix-columbia-merger
quevon24 Aug 22, 2024
f7e0e8b
Merge branch 'main' into fix-columbia-merger
quevon24 Sep 5, 2024
f7a9a2a
Merge branch 'main' into fix-columbia-merger
quevon24 Sep 11, 2024
6dc053c
Merge branch 'main' into fix-columbia-merger
quevon24 Sep 26, 2024
192783d
Merge branch 'main' into fix-columbia-merger
quevon24 Sep 28, 2024
97e8eca
Merge branch 'main' into fix-columbia-merger
quevon24 Dec 2, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 17 additions & 1 deletion cl/corpus_importer/import_columbia/columbia_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,22 @@
"affirmed and opinion filed",
"dismissed and opinion filed",
"decided and entered",
"memorandum opinion filed",
"memorandum opinion delivered and filed",
"granted",
"affirmed",
"submitted and decided",
"affirmed and memorandum opinion filed",
"memorandum filed",
"modified opinion filed",
"opinion modified and refiled",
"opinion filed on",
"opinion on merits filed",
"opinion delivered and filed on",
"order delivered and filed",
"date filed",
"opinion filed in",
"affirmed opinion filed",
]
DECIDED_TAGS = ["decided", "date decided", "decided on", "decided date"]
ARGUED_TAGS = [
Expand Down Expand Up @@ -538,7 +554,7 @@ def convert_columbia_html(text: str, opinion_index: int) -> str:
)

# We use opinion index to ensure that all footnotes are linked to the
# corresponding opinion
# corresponding opinion (when a case has multiple opinions)
for ref in foot_references:
if (match := re.search(r"[*\d]+", ref)) is not None:
f_num = match.group()
Expand Down
62 changes: 52 additions & 10 deletions cl/corpus_importer/management/commands/columbia_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@
import pandas as pd
from bs4 import BeautifulSoup
from django.db import transaction
from django.db.models import Q
from juriscraper.lib.string_utils import titlecase

from cl.corpus_importer.import_columbia.columbia_utils import (
Expand All @@ -43,6 +42,7 @@
)
from cl.corpus_importer.utils import (
AuthorException,
CitationException,
JudgeException,
OpinionMatchingException,
OpinionTypeException,
Expand Down Expand Up @@ -93,14 +93,26 @@ def clean_opinion_content(content: str, is_harvard: bool) -> str:
return prep_text


def get_cl_opinion_content(cluster_id: int) -> list[dict[Any, Any]]:
def get_cl_opinion_content(
cluster_id: int, columbia_single_opinion: bool = False
) -> list[dict[Any, Any]]:
"""Get the opinions content for a cluster object

:param cluster_id: Cluster ID for a set of opinions
:param columbia_single_opinion: True if xml file only has one opinion else False
:return: list with opinion content from cl
"""
cl_cleaned_opinions = []

# Get all opinions from cluster
opinions_from_cluster = Opinion.objects.filter(cluster_id=cluster_id)

if not columbia_single_opinion:
# The file has multiple opinions, so we can exclude combined opinions
opinions_from_cluster = opinions_from_cluster.exclude(
type="010combined"
)

is_harvard = False

for i, op in enumerate(opinions_from_cluster):
Expand Down Expand Up @@ -133,13 +145,17 @@ def get_cl_opinion_content(cluster_id: int) -> list[dict[Any, Any]]:


def update_matching_opinions(
matches: dict, cl_cleaned_opinions: list, columbia_opinions: list
matches: dict,
cl_cleaned_opinions: list,
columbia_opinions: list,
filepath: str,
) -> None:
"""Store matching opinion content in html_columbia field from Opinion object

:param matches: dict with matching position from cl and columbia opinions
:param cl_cleaned_opinions: list of cl opinions
:param cl_cleaned_opinions: list of cl opinions from a single cluster
:param columbia_opinions: list of columbia opinions
:param filepath: xml file from which the opinion was extracted
:return: None
"""
for columbia_pos, cl_pos in matches.items():
Expand All @@ -158,9 +174,10 @@ def update_matching_opinions(
if op.author_str == "":
# We have an empty author name
if author_str:
# Store the name extracted from the author tag
# Store the name extracted from the author tag of the xml file
op.author_str = author_str
else:
# opinion already has an author in cl
if author_str:
if (
find_just_name(op.author_str).lower()
Expand All @@ -185,21 +202,32 @@ def update_matching_opinions(
file_opinion["opinion"], columbia_pos
)
op.html_columbia = str(converted_text)
if not op.local_path:
# Store file path only if it is empty in the Opinion object
op.local_path = filepath
op.save()


def map_and_merge_opinions(
cluster_id: int,
columbia_opinions: list[dict],
filepath: str,
) -> None:
"""Map and merge opinion data

:param cluster_id: Cluster id
:param cluster_id: Cluster id to merge with
:param columbia_opinions: list of columbia opinions from file
:param filepath: xml file from which the opinion was extracted
:return: None
"""

cl_cleaned_opinions = get_cl_opinion_content(cluster_id)
# Check if columbia source only has one opinion
columbia_single_opinion = True if len(columbia_opinions) == 1 else False

# We exclude combined opinions only if we have more than one opinion in the xml
cl_cleaned_opinions = get_cl_opinion_content(
cluster_id, columbia_single_opinion
)

if len(columbia_opinions) == len(cl_cleaned_opinions):
# We need both lists to be cleaned, so we can have a more
Expand All @@ -212,13 +240,20 @@ def map_and_merge_opinions(
[op.get("opinion") for op in cl_cleaned_opinions],
)
if len(matches) == len(columbia_opinions):
# We were able to match all opinions, add opinion content to
# html_columbia field
update_matching_opinions(
matches, cl_cleaned_opinions, columbia_opinions
matches, cl_cleaned_opinions, columbia_opinions, filepath
)
else:
raise OpinionMatchingException("Failed to match opinions")

elif len(columbia_opinions) > len(cl_cleaned_opinions) == 1:
elif (len(columbia_opinions) > len(cl_cleaned_opinions)) and len(
cl_cleaned_opinions
) == 0:
# The file has more opinions than CL. When len(cl_cleaned_opinions) == 0
# we probably excluded the combined opinion, so we create each opinion
# from the file
for op in columbia_opinions:
opinion_type = op.get("type")
file = op.get("file")
Expand All @@ -235,6 +270,7 @@ def map_and_merge_opinions(
per_curiam=op["per_curiam"],
cluster_id=cluster_id,
type=opinion_type,
local_path=filepath,
author_str=(
titlecase(find_just_name(author.strip(":")))
if author
Expand Down Expand Up @@ -464,7 +500,9 @@ def process_cluster(

try:
with transaction.atomic():
map_and_merge_opinions(cluster_id, columbia_data["opinions"])
map_and_merge_opinions(
cluster_id, columbia_data["opinions"], filepath
)

merged_data = {}
for field in ["syllabus", "attorneys", "posture", "judges"]:
Expand Down Expand Up @@ -523,6 +561,10 @@ def process_cluster(
)
except JudgeException:
logger.warning(msg=f"Judge exception for cluster id: {cluster_id}")
except CitationException:
logger.warning(
msg=f"Invalid citation found in {filepath} while merging cluster id: {cluster_id}"
)


def merge_columbia_into_cl(options) -> None:
Expand Down
2 changes: 1 addition & 1 deletion cl/corpus_importer/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -3289,7 +3289,7 @@ def test_merger(self):
docket=DocketFactory(source=Docket.HARVARD),
sub_opinions__data=[
{
"type": "010combined",
"type": "020lead",
"xml_harvard": "<p>Lorem ipsum dolor sit amet, consectetur "
"adipiscing elit. Nullam quis elit sed dui "
"interdum feugiat.</p>",
Expand Down
7 changes: 7 additions & 0 deletions cl/corpus_importer/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,13 @@ def __init__(self, message: str) -> None:
self.message = message


class CitationException(Exception):
    """Error found in cite.

    :param message: description of the citation problem; also exposed as
    the ``message`` attribute
    """

    def __init__(self, message: str) -> None:
        # Forward the message to Exception so that str(exc) and exc.args
        # carry it even when it is passed as a keyword argument.
        super().__init__(message)
        self.message = message


async def mark_ia_upload_needed(d: Docket, save_docket: bool) -> None:
"""Mark the docket as needing upload if it's not already marked.

Expand Down
Loading