Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issue 1601 match partial page citations #2209

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 76 additions & 25 deletions cl/citations/match_citations.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,47 +139,98 @@ def search_db_for_fullcitation(
"fq": [
"status:Precedential", # Non-precedential documents aren't cited
],
"caller": "citation.match_citations.match_citation",
"caller": "citations.match_citations.search_db_for_fullcitation",
}

# Filter out self-cites
if full_citation.citing_opinion is not None:
# Eliminate self-cites.
main_params["fq"].append(f"-id:{full_citation.citing_opinion.pk}")
# Set up filter parameters

# Filter by court if possible
if full_citation.metadata.court:
main_params["fq"].append(f"court_exact:{full_citation.metadata.court}")

# Filter by citation's known year if possible
if full_citation.year:
start_year = end_year = full_citation.year
else:
start_year, end_year = get_years_from_reporter(full_citation)
start_year = end_year = None

# Take 1: If the citation is missing its page, the best we can do is try
# a case name query
if full_citation.groups["page"] is None:
# If we have a defendant and year, search
if (
full_citation.citing_opinion is not None
and full_citation.citing_opinion.cluster.date_filed
and full_citation.metadata.defendant
and (end_year or full_citation.citing_opinion.cluster.date_filed)
):
end_year = min(
end_year, full_citation.citing_opinion.cluster.date_filed.year
# Unless already known, set the date range to be within five years
# of the citing opinion's filing year to guard against false
# positives. (A citation that is missing its page most likely points
# to an opinion filed shortly before the citing opinion.)
if not end_year:
end_year = full_citation.citing_opinion.cluster.date_filed.year
start_year = end_year - 5

# Update the filter params
main_params["fq"].append(
f"dateFiled:{build_date_range(start_year, end_year)}"
)
main_params["fq"].append(
f"dateFiled:{build_date_range(start_year, end_year)}"
)

if full_citation.metadata.court:
main_params["fq"].append(f"court_exact:{full_citation.metadata.court}")
return case_name_query(
si, main_params, full_citation, full_citation.citing_opinion
)

# Take 1: Use a phrase query to search the citation field.
main_params["fq"].append(
f'citation:("{full_citation.corrected_citation()}")'
)
results = si.query().add_extra(**main_params).execute()
si.conn.http_connection.close()
if len(results) == 1:
return results
if len(results) > 1:
# If not, don't search because there will be too many false positives
# with only a volume and reporter to go off of
else:
return []

# Take 2: If the citation *does* have its full page information,
# we can use that information to perform a citation query first
else:
main_params["fq"].append(
f'citation:("{full_citation.corrected_citation()}")',
)

# Unless already known, set the date range to simply be the range
# of years covered by the reporter
if not end_year:
start_year, end_year = get_years_from_reporter(full_citation)

# Tighten the end year date if possible
if (
full_citation.citing_opinion is not None
and full_citation.metadata.defendant
): # Refine using defendant, if there is one
results = case_name_query(
si, main_params, full_citation, full_citation.citing_opinion
and full_citation.citing_opinion.cluster.date_filed
):
end_year = min(
end_year,
full_citation.citing_opinion.cluster.date_filed.year,
)

# Update the filter params
main_params["fq"].append(
f"dateFiled:{build_date_range(start_year, end_year)}",
)

results = si.query().add_extra(**main_params).execute()
si.conn.http_connection.close()
if len(results) == 1:
return results
if len(results) > 1:
# Refine using the citation's case name, if it at least knows who
# the defendant is
if (
full_citation.citing_opinion is not None
and full_citation.metadata.defendant
):
return case_name_query(
si,
main_params,
full_citation,
full_citation.citing_opinion,
)

# Give up.
return []
Expand Down
101 changes: 96 additions & 5 deletions cl/citations/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,12 @@ def test_make_html_from_plain_text(self) -> None:
'citation no-link">123\nU.S. 456</span><pre class="inline">, '
'upholding foo bar</pre>'),

# Full citation missing a page number
('asdf John v. Doe, 123 U.S. __, upholding foo bar',
'<pre class="inline">asdf John v. Doe, </pre><span class="'
'citation no-link">123 U.S. __</span><pre class="inline">, '
'upholding foo bar</pre>'),

# Basic short form citation
('existing text asdf, 515 U.S., at 240. foobar',
'<pre class="inline">existing text asdf, </pre><span class="'
Expand Down Expand Up @@ -328,6 +334,12 @@ def setUpTestData(cls) -> None:
), # Year must equal text in citation4
),
)
cls.citation1a = CitationWithParentsFactory.create( # Extra citation for same OpinionCluster as above
volume="2",
reporter="S.Ct.",
page="2",
cluster=OpinionCluster.objects.get(pk=cls.citation1.cluster_id),
)

# Citation 2
cls.citation2 = CitationWithParentsFactory.create(
Expand Down Expand Up @@ -355,6 +367,10 @@ def setUpTestData(cls) -> None:
cluster=OpinionClusterFactoryWithChildrenAndParents(
docket=DocketFactory(court=court_scotus),
case_name="Lorem v. Ipsum",
date_filed=date.today()
- timedelta(
days=365
), # Must be within 5 years of opinion5 for missing page test
),
)

Expand All @@ -366,6 +382,12 @@ def setUpTestData(cls) -> None:
cluster=OpinionClusterFactoryWithChildrenAndParents(
docket=DocketFactory(court=court_scotus),
case_name="Abcdef v. Ipsum",
date_filed=OpinionCluster.objects.get(
pk=cls.citation1.cluster_id
).date_filed
+ timedelta(
days=1
), # Must be after citation1 date for test_no_duplicate_parentheticals_from_parallel_cites test
sub_opinions=RelatedFactory(
OpinionWithChildrenFactory,
factory_related_name="cluster",
Expand All @@ -386,7 +408,7 @@ def setUpTestData(cls) -> None:
sub_opinions=RelatedFactory(
OpinionWithChildrenFactory,
factory_related_name="cluster",
plain_text="Blah blah Foo v. Bar 1 U.S. 1, 77 blah blah. Asdf asdf Qwerty v. Uiop 2 F.3d 2, 555. Also check out Foo, 1 U.S. at 99 (holding that crime is illegal). Then let's cite Qwerty, supra, at 666 (noting that CourtListener is a great tool and everyone should use it). See also Foo, supra, at 101 as well. Another full citation is Lorem v. Ipsum 1 U. S. 50. Quoting Qwerty, “something something”, 2 F.3d 2, at 59. This case is similar to Fake, supra, and Qwerty supra, as well. This should resolve to the foregoing. Ibid. This should also convert appropriately, see Id., at 57. This should fail to resolve because the reporter and citation is ambiguous, 1 U. S., at 51. However, this should succeed, Lorem, 1 U.S., at 52.",
plain_text="Blah blah Foo v. Bar 1 U.S. 1, 77 blah blah. Asdf asdf Qwerty v. Uiop 2 F.3d 2, 555. Also check out Foo, 1 U.S. at 99 (holding that crime is illegal). Then let's cite Qwerty, supra, at 666 (noting that CourtListener is a great tool and everyone should use it). See also Foo, supra, at 101 as well. Another full citation is Lorem v. Ipsum 1 U. S. 50. Same with missing page Lorem v. Ipsum 1 U.S. ___, __ blah blah. Quoting Qwerty, “something something”, 2 F.3d 2, at 59. This case is similar to Fake, supra, and Qwerty supra, as well. This should resolve to the foregoing. Ibid. This should also convert appropriately, see Id., at 57. This should fail to resolve because the reporter and citation is ambiguous, 1 U. S., at 51. However, this should succeed, Lorem, 1 U.S., at 52.",
),
),
)
Expand All @@ -410,6 +432,14 @@ def test_citation_resolution(self) -> None:
reporter_found="U.S.",
metadata={"court": "scotus"},
)
full1_without_page = case_citation(
volume="1",
reporter="U.S.",
page=None,
index=1,
reporter_found="U.S.",
metadata={"court": "scotus", "defendant": "Bar"},
)
full2 = case_citation(
volume="2",
reporter="F.3d",
Expand All @@ -426,6 +456,34 @@ def test_citation_resolution(self) -> None:
reporter_found="U.S.",
metadata={"court": "scotus"},
)
full3_without_page = case_citation(
volume="1",
reporter="U.S.",
page=None,
index=1,
reporter_found="U.S.",
metadata={"court": "scotus"},
)
full3_without_page_but_with_defendant = case_citation(
volume="1",
reporter="U.S.",
page=None,
index=1,
reporter_found="U.S.",
metadata={"court": "scotus", "defendant": "Ipsum"},
)
full3_without_page_but_with_year = case_citation(
volume="1",
reporter="U.S.",
page=None,
index=1,
reporter_found="U.S.",
metadata={
"court": "scotus",
"defendant": "Ipsum",
"year": opinion3.cluster.date_filed.year, # Must equal year of opinion3
},
)
full4 = case_citation(
volume="1",
reporter="U.S.",
Expand Down Expand Up @@ -515,8 +573,31 @@ def test_citation_resolution(self) -> None:
([full1], {opinion1: [full1]}),
# Test matching multiple full citations to different documents
([full1, full2], {opinion1: [full1], opinion2: [full2]}),
# Test matching an unmatchacble full citation
# Test matching an unmatchable full citation
([full_na], {NO_MATCH_RESOURCE: [full_na]}),
# Test matching a full citation with a missing page. We expect this
# to fail since there's not enough information.
([full3_without_page], {NO_MATCH_RESOURCE: [full3_without_page]}),
# Test matching a full citation with a missing page, but with a
# useful defendant, and the year given by the citing opinion
(
[full3_without_page_but_with_defendant],
{opinion3: [full3_without_page_but_with_defendant]},
),
# Test matching a full citation with a missing page, but with a
# useful defendant, and the year given by the citation itself
(
[full3_without_page_but_with_year],
{opinion3: [full3_without_page_but_with_year]},
),
# Test matching a full citation with a missing page when the cited
# opinion's date is not within 5 years of the citing opinion's
# date. We expect this match to fail since the years are too far
# apart and any match may simply be a false positive.
(
[full1_without_page],
{NO_MATCH_RESOURCE: [full1_without_page]},
),
# Test resolving a supra citation
([full1, supra1], {opinion1: [full1, supra1]}),
# Test resolving a supra citation when its antecedent guess matches
Expand Down Expand Up @@ -580,6 +661,15 @@ def test_citation_resolution(self) -> None:
[full1, full_na, id],
{opinion1: [full1], NO_MATCH_RESOURCE: [full_na, id]},
),
# Test resolving an Id. citation when the previous citation match
# failed because it was missing a page. We expect the Id. citation
# to fail.
(
[full3_without_page, id],
{
NO_MATCH_RESOURCE: [full3_without_page],
},
),
# Test resolving an Id. citation when the previous citation is to a
# non-opinion document. Since we can't match those documents (yet),
# we expect the Id. citation to also not be matched.
Expand Down Expand Up @@ -658,10 +748,11 @@ def test_opinionscited_creation(self) -> None:
# test all combinations, but this test case is made to be deliberately
# complex, in an effort to "trick" the algorithm. Cited opinions:
# opinion1: 1 FullCaseCitation, 1 ShortCaseCitation, 1 SupraCitation (depth=3)
# (case name Foo)
# (case name Foo)
# opinion2: 1 FullCaseCitation, 2 IdCitation (one Id. and one Ibid.),
# 1 ShortCaseCitation, 2 SupraCitation (depth=6) (case name Qwerty)
# opinion3: 1 FullCaseCitation, 1 ShortCaseCitation (depth=2) (case name Lorem)
# opinion3: 2 FullCaseCitation (one with missing page),
# 1 ShortCaseCitation (depth=3) (case name Lorem)
opinion1 = Opinion.objects.get(cluster__pk=self.citation1.cluster_id)
opinion2 = Opinion.objects.get(cluster__pk=self.citation2.cluster_id)
opinion3 = Opinion.objects.get(cluster__pk=self.citation3.cluster_id)
Expand All @@ -675,7 +766,7 @@ def test_opinionscited_creation(self) -> None:
citation_test_pairs = [
(opinion1, 3),
(opinion2, 6),
(opinion3, 2),
(opinion3, 3),
]

for cited, depth in citation_test_pairs:
Expand Down