From d871b4aae29ebad5be842df18e7bf93e85fec1fe Mon Sep 17 00:00:00 2001
From: Gianfranco Rossi
Date: Tue, 1 Oct 2024 11:09:28 -0500
Subject: [PATCH 001/143] feat(scrapers.update_from_text): new command

Helps solve: https://github.com/freelawproject/juriscraper/issues/858

- New command to re-run Site.extract_from_text over downloaded opinions
- Able to filter by Docket.court_id, OpinionCluster.date_filed and
  OpinionCluster.precedential_status
- Updates tasks.update_document_from_text to return change information
  for logging purposes
- Updates test_opinion_scraper to add a Site.extract_from_text method
---
 .../management/commands/update_from_text.py   | 159 ++++++++++++++++++
 cl/scrapers/tasks.py                          |  12 +-
 .../test_assets/test_opinion_scraper.py       |  21 +++
 cl/scrapers/tests.py                          | 110 +++++++++++-
 4 files changed, 297 insertions(+), 5 deletions(-)
 create mode 100644 cl/scrapers/management/commands/update_from_text.py

diff --git a/cl/scrapers/management/commands/update_from_text.py b/cl/scrapers/management/commands/update_from_text.py
new file mode 100644
index 0000000000..77fe5966af
--- /dev/null
+++ b/cl/scrapers/management/commands/update_from_text.py
@@ -0,0 +1,159 @@
+from datetime import datetime
+
+from django.db import transaction
+
+from cl.lib.command_utils import VerboseCommand, logger
+from cl.scrapers.tasks import update_document_from_text
+from cl.search.models import PRECEDENTIAL_STATUS, Opinion, OpinionCluster
+
+
+def update_from_text(
+    opinion: Opinion, juriscraper_module: str, stats: dict[str, int]
+):
+    """Calls `update_document_from_text` as used in the scraper flow
+    and calls the corresponding model's .save()
+
+    :param opinion: the Opinion on which to apply extract_from_text
+    :param juriscraper_module: the scraper module path
+    :param stats: dict to accumulate counts for reporting. Modified in place
+    :return: None
+    """
+    with transaction.atomic():
+        changes = update_document_from_text(opinion, juriscraper_module)
+        if not changes:
+            logger.info("Did not get any metadata for opinion %s", opinion.id)
+            return
+
+        logger.info("Processing opinion %s", opinion.id)
+
+        # Check if changes exist before saving, to prevent unnecessary
+        # DB queries
+        if changes.get("Docket"):
+            opinion.cluster.docket.save()
+            logger.debug(
+                "Docket %s updated with data %s",
+                opinion.cluster.docket.id,
+                changes["Docket"],
+            )
+            stats["Docket"] += 1
+
+        if changes.get("OpinionCluster"):
+            opinion.cluster.save()
+            logger.debug(
+                "OpinionCluster %s updated with data %s",
+                opinion.cluster.id,
+                changes["OpinionCluster"],
+            )
+            stats["OpinionCluster"] += 1
+
+        if changes.get("Opinion"):
+            opinion.save()
+            logger.debug("Opinion updated with data %s", changes["Opinion"])
+            stats["Opinion"] += 1
+
+        if changes.get("Citation"):
+            if changes["Citation"].get("citation_created"):
+                logger.info(
+                    "Citation created with data %s", changes["Citation"]
+                )
+                stats["Citation"] += 1
+            else:
+                logger.debug(
+                    "Citation not created. Data %s", changes["Citation"]
+                )
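For reference, the `changes` dict consumed above is the return value of
`update_document_from_text` (modified later in this patch): a dict keyed by
model name. A minimal sketch of its expected shape, using the Vermont
test-scraper values from the tests below as illustrative data only:

    # Illustrative only: keys mirror the model names handled above; values
    # are whatever Site.extract_from_text returned for each model.
    changes = {
        "Docket": {"docket_number": "2020-12"},
        "OpinionCluster": {"disposition": "Affirmed"},
        "Citation": {
            "volume": "2020",
            "reporter": "VT",
            "page": "11",
            "type": 8,
            "citation_created": True,
        },
    }

An empty dict means extract_from_text found nothing, and the helper returns
early without touching the database.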
+
+
+class Command(VerboseCommand):
+    help = """Updates objects by running Site.extract_from_text
+    over extracted content found on Opinion.plain_text or Opinion.html.
+
+    If `--opinion-ids` is used, all other filters will be ignored.
+    If it is not used, both date filters are required, to prevent
+    accidentally reprocessing a whole court's dataset.
+
+    Recommended use is to run over a sample of the target time period
+    and check that the updates to Docket, OpinionCluster, Opinion and
+    Citation are as expected
+    """
+    stats = {}  # assigned at the end of a command run, for testing
+
+    def add_arguments(self, parser):
+        parser.add_argument(
+            "--juriscraper-module",
+            help="""The Juriscraper file which contains the
+            `extract_from_text` method to be used. The `court_id`
+            will be deduced from this. Example:
+            juriscraper.opinions.united_states.federal_appellate.ca1
+            """,
+            required=True,
+        )
+        parser.add_argument(
+            "--opinion-ids",
+            nargs="+",
+            type=int,
+            help="""The Opinion ids to re-process.
+            May be more than one. If this argument is used,
+            other filters will be ignored""",
+        )
+        parser.add_argument(
+            "--date-filed-gte",
+            default="",
+            help=r"""A filter value in %Y/%m/%d format.
+            OpinionCluster.date_filed will have to be greater than or equal""",
+        )
+        parser.add_argument(
+            "--date-filed-lte",
+            default="",
+            help=r"""A filter value in %Y/%m/%d format.
+            OpinionCluster.date_filed will have to be less than or equal""",
+        )
+        parser.add_argument(
+            "--cluster-status",
+            default="",
+            choices=[value for value, name in PRECEDENTIAL_STATUS.NAMES],
+            help="""A value of OpinionCluster.precedential_status. To be
+            used for filtering the Opinions to be processed
+            """,
+        )
+
+    def handle(self, *args, **options):
+        super().handle(*args, **options)
+        juriscraper_module = options["juriscraper_module"]
+        # For aggregate reporting
+        stats = {"Docket": 0, "OpinionCluster": 0, "Opinion": 0, "Citation": 0}
+
+        if options["opinion_ids"]:
+            opinions = Opinion.objects.filter(id__in=options["opinion_ids"])
+            for op in opinions:
+                update_from_text(op, juriscraper_module, stats)
+
+            logger.info("Modified objects counts: %s", stats)
+            return
+
+        if not (options["date_filed_gte"] and options["date_filed_lte"]):
+            raise ValueError(
+                "Both `--date-filed-gte` and `--date-filed-lte` arguments should have values"
+            )
+
+        court_id = juriscraper_module.split(".")[-1].split("_")[0]
+        gte_date = datetime.strptime(options["date_filed_gte"], "%Y/%m/%d")
+        lte_date = datetime.strptime(options["date_filed_lte"], "%Y/%m/%d")
+        query = {
+            "docket__court_id": court_id,
+            "date_filed__gte": gte_date,
+            "date_filed__lte": lte_date,
+        }
+
+        if options["cluster_status"]:
+            query["precedential_status"] = options["cluster_status"]
+
+        qs = OpinionCluster.objects.filter(**query).prefetch_related(
+            "sub_opinions"
+        )
+        for cluster in qs:
+            opinions = cluster.sub_opinions.all()
+            for op in opinions:
+                update_from_text(op, juriscraper_module, stats)
+
+        logger.info("Modified objects counts: %s", stats)
+        self.stats = stats
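As a usage reference, the command runs like any other management command. A
minimal sketch of driving it from Python (the juriscraper module path and the
date window are placeholders; any court module exposing `extract_from_text`
works the same way):

    from django.core.management import call_command

    # Reprocess a sample window for one court; the court_id ("vt" here)
    # is deduced from the module path.
    call_command(
        "update_from_text",
        juriscraper_module="juriscraper.opinions.united_states.state.vt",
        date_filed_gte="2020/06/01",
        date_filed_lte="2021/06/01",
        cluster_status="Published",
    )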
diff --git a/cl/scrapers/tasks.py b/cl/scrapers/tasks.py
index c60971c572..15500e94bb 100644
--- a/cl/scrapers/tasks.py
+++ b/cl/scrapers/tasks.py
@@ -39,7 +39,7 @@

 def update_document_from_text(
     opinion: Opinion, juriscraper_module: str = ""
-) -> None:
+) -> dict:
     """Extract additional metadata from document text

     We use this code with BIA decisions. Previously Tax.
@@ -54,12 +54,13 @@ def update_document_from_text(

     :param opinion: Opinion object
     :param juriscraper_module: full module to get Site object
-    :return: None
+    :return: the extracted data dictionary
     """
     court = opinion.cluster.docket.court.pk
     site = get_scraper_object_by_name(court, juriscraper_module)
     if site is None:
-        return
+        logger.debug("No site found %s", juriscraper_module)
+        return {}

     metadata_dict = site.extract_from_text(opinion.plain_text or opinion.html)
     for model_name, data in metadata_dict.items():
@@ -70,7 +71,8 @@ def update_document_from_text(
             opinion.cluster.__dict__.update(data)
         elif model_name == "Citation":
             data["cluster_id"] = opinion.cluster_id
-            ModelClass.objects.get_or_create(**data)
+            _, citation_created = ModelClass.objects.get_or_create(**data)
+            metadata_dict["Citation"]["citation_created"] = citation_created
         elif model_name == "Opinion":
             opinion.__dict__.update(data)
         else:
@@ -78,6 +80,8 @@ def update_document_from_text(
             f"Object type of {model_name} not yet supported."
         )

+    return metadata_dict
+

 @app.task(
     bind=True,
diff --git a/cl/scrapers/test_assets/test_opinion_scraper.py b/cl/scrapers/test_assets/test_opinion_scraper.py
index 508be0dfec..18a28d71de 100644
--- a/cl/scrapers/test_assets/test_opinion_scraper.py
+++ b/cl/scrapers/test_assets/test_opinion_scraper.py
@@ -1,3 +1,4 @@
+import re
 from datetime import datetime
 from os.path import join
@@ -53,3 +54,23 @@ def _get_nature_of_suit(self):
     def _get_judges(self):
         path = "//judge/text()"
         return list(self.html.xpath(path))
+
+    def extract_from_text(self, scraped_text):
+        metadata = {}
+        docket_regex = r"Docket Number: (?P<docket>\d+-\d+)"
+        disposition_regex = r"Disposition: (?P<disposition>\w+)"
+        citation_regex = r"(?P<volume>20\d{2}) (?P<reporter>VT) (?P<page>\d+)"
+        if docket_match := re.search(docket_regex, scraped_text):
+            metadata["Docket"] = {
+                "docket_number": docket_match.group("docket")
+            }
+
+        if disposition_match := re.search(disposition_regex, scraped_text):
+            metadata["OpinionCluster"] = {
+                "disposition": disposition_match.group("disposition")
+            }
+
+        if citation_match := re.search(citation_regex, scraped_text):
+            metadata["Citation"] = {**citation_match.groupdict(), "type": 8}
+
+        return metadata
diff --git a/cl/scrapers/tests.py b/cl/scrapers/tests.py
index 375987426a..1d818d4f39 100644
--- a/cl/scrapers/tests.py
+++ b/cl/scrapers/tests.py
@@ -1,5 +1,5 @@
 import os
-from datetime import datetime, timedelta
+from datetime import date, datetime, timedelta
 from http import HTTPStatus
 from pathlib import Path
 from unittest import TestCase, mock
@@ -30,6 +30,7 @@
     cl_back_scrape_citations,
     cl_scrape_opinions,
     cl_scrape_oral_arguments,
+    update_from_text,
 )
 from cl.scrapers.models import UrlHash
 from cl.scrapers.tasks import extract_doc_content, process_audio_file
@@ -867,3 +868,110 @@ def test_federal_jurisdictions(self):
         self.assertEqual(
             docket, self.ca2_docket, "Should match using docket number core"
         )
+
+
+class UpdateFromTextCommandTest(TestCase):
+    """Test the input processing and DB querying for the command"""
+
+    def setUp(self):
+        self.vt = CourtFactory(id="vt")
+        self.sc = CourtFactory(id="sc")
+        self.docket_sc = DocketFactory(court=self.sc, docket_number="20")
+
+        # Different dates, status and courts to test command behaviour
+        self.opinion_2020 = OpinionFactory(
+            cluster=OpinionClusterFactory(
+                docket=DocketFactory(court=self.vt, docket_number="12"),
+                date_filed=date(2020, 6, 1),
+                precedential_status="Published",
+            ),
+            plain_text="""Docket Number: 2020-12
+            Disposition: Affirmed
+            2020 VT 11""",
+        )
+        self.opinion_2020_unpub =
OpinionFactory( + cluster=OpinionClusterFactory( + docket=DocketFactory(court=self.vt, docket_number="13"), + date_filed=date(2020, 7, 1), + precedential_status="Unpublished", + ), + plain_text="Docket Number: 2020-13\nDisposition: Affirmed", + ) + + self.opinion_sc = OpinionFactory( + cluster=OpinionClusterFactory( + docket=self.docket_sc, + date_filed=date(2021, 6, 1), + precedential_status="Published", + ), + plain_text="Some text with no matches", + id=101, + ) + + self.opinion_2022 = OpinionFactory( + cluster=OpinionClusterFactory( + docket=DocketFactory(court=self.vt, docket_number="13"), + date_filed=date(2022, 6, 1), + precedential_status="Unpublished", + ), + id=100, + plain_text="Docket Number: 2022-13\n2022 VT 11", + ) + + def test_inputs(self): + """Do all command inputs work properly?""" + + # will target a single opinion, for which extract_from_text + # extracts no metadata. No object should be updated + cmd = update_from_text.Command() + with mock.patch( + "cl.scrapers.tasks.get_scraper_object_by_name", + return_value=test_opinion_scraper.Site(), + ): + cmd.handle(juriscraper_module="somepath.sc", opinion_ids=[101]) + + self.assertFalse( + any(cmd.stats.values()), "No object should be modified" + ) + + # will target 1 opinion, there are 2 in the time period + # and 3 for the court + with mock.patch( + "cl.scrapers.tasks.get_scraper_object_by_name", + return_value=test_opinion_scraper.Site(), + ): + update_from_text.Command().handle( + juriscraper_module="somepath.vt", + opinion_ids=[], + date_filed_gte="2020/06/01", + date_filed_lte="2021/06/01", + cluster_status="Published", + ) + + # Test that objects were actually updated / created + self.assertEqual( + Citation.objects.filter(cluster=self.opinion_2020.cluster).count(), + 1, + "There should be a single citation for this cluster", + ) + self.opinion_2020.refresh_from_db() + self.opinion_2020.cluster.refresh_from_db() + self.opinion_2020.cluster.docket.refresh_from_db() + self.assertEqual( + self.opinion_2020.cluster.disposition, + "Affirmed", + "OpinionCluster.disposition was not updated", + ) + self.assertEqual( + self.opinion_2020.cluster.docket.docket_number, + "2020-12", + "Docket.docket_number was not updated", + ) + + # Check that other objects in the time period and court + # were not modified. 
Meaning, the filter worked + self.assertEqual( + self.opinion_2020_unpub.cluster.docket.docket_number, + "13", + "Unpublished docket should not be modified", + ) From 4cb004fe9fe82c34cd6061efc3908ff2bb5140b2 Mon Sep 17 00:00:00 2001 From: William Palin Date: Thu, 17 Oct 2024 15:30:11 -0400 Subject: [PATCH 002/143] feat(new_ui): NEW HTML and CSS and JS --- cl/assets/static-global/css/override.css | 706 +++++++++++++++++- cl/assets/static-global/js/base.js | 271 ++++++- .../includes/add_download_button.html | 46 ++ .../templates/includes/add_note_button.html | 2 +- .../templates/includes/opinion_tabs.html | 337 +++++++++ cl/opinion_page/templates/opinion.html | 2 +- cl/opinion_page/templates/opinions.html | 346 +++++++++ cl/search/models.py | 32 + 8 files changed, 1735 insertions(+), 7 deletions(-) create mode 100644 cl/opinion_page/templates/includes/add_download_button.html create mode 100644 cl/opinion_page/templates/includes/opinion_tabs.html create mode 100644 cl/opinion_page/templates/opinions.html diff --git a/cl/assets/static-global/css/override.css b/cl/assets/static-global/css/override.css index 7a27e9f08f..32c21672a1 100644 --- a/cl/assets/static-global/css/override.css +++ b/cl/assets/static-global/css/override.css @@ -155,7 +155,30 @@ header { /* Standard target color. */ *:target { - background-color: lightyellow; + -webkit-animation: target-fade 3s; + -moz-animation: target-fade 3s; + -o-animation: target-fade 3s; + animation: target-fade 3s; +} + +@-webkit-keyframes target-fade { + from { background-color: lightyellow; } + to { background-color: transparent; } +} + +@-moz-keyframes target-fade { + from { background-color: lightyellow; } + to { background-color: transparent; } +} + +@-o-keyframes target-fade { + from { background-color: lightyellow; } + to { background-color: transparent; } +} + +@keyframes target-fade { + from { background-color: lightyellow; } + to { background-color: transparent; } } .alt { @@ -1603,7 +1626,7 @@ textarea { /* Prevent images inside opinion from overflowing */ -#opinion-content img { +div.subopinion-content img { max-width: 100%; height: auto; } @@ -1723,3 +1746,682 @@ rect.series-segment { opacity 150ms 150ms ease-in; transform: translate3d(0, 0, 0); } + + + +/*Wrap all our changes around an opinion-body class we load up + in the opinion template*/ + +.opinion-body { + + #headmatter { + font-family: Merriweather, "Times New Roman", Times, serif; + font-size: 15px; + letter-spacing: 0.2px; + text-align: justify; + padding:0px; + margin: 0px; + background-color: white; + border: none; + + } + #headmatter > parties { + text-align: center; + font-style: initial; + font-size: 2em; + display: block; + } + #headmatter > div.footnotes > .footnote > p { + line-height: 1em; + } + + #headmatter > * { + text-indent: 2em; + } + + #headmatter docketnumber, + #headmatter court, + #headmatter parties, + #headmatter attorneys, + #headmatter syllabus, + #headmatter decisiondate { + display: block; + } + + #headmatter > div.footnotes { + border-top: None; + padding-top: 1em; + } + + .jump-links > a{ + position: relative; + margin: -8px 20px 0 0; + width: 140px; + line-height: 18px; + font-size: 14px; + cursor: pointer; + white-space: nowrap; + text-overflow: ellipsis; + opacity: 1; + } + + .hr-opinion { + border-top: 2px solid black; + } + + /*Clean up the Case Caption section to look large and clean*/ + .case-caption { + font-size: 3em; + font-weight: 500; + text-align: left; + line-height: 1.1em; + margin-top: 50px; + } + + + .case-court { + font-size: 25px; 
+    text-align: left;
+  }
+
+  /*Update sidebar jump links to look nice*/
+  .jump-links {
+    font-size: 12px;
+    padding-top: 5px;
+  }
+
+  li.jump-links {
+    height: 2.5em;
+    list-style-type: none;
+    padding-left: 0;
+    position: relative;
+  }
+
+  li.jump-links::before {
+    content: "";
+    border-left: 2px solid lightgrey;
+    height: 100%;
+    position: absolute;
+    left: 0;
+    top: 0;
+    padding-right: 8px;
+    display: inline-block;
+  }
+
+  li.jump-links.active {
+    color: #B53C2C;
+    font-weight: bold;
+  }
+
+  li.jump-links.active::before {
+    border-left: 2px solid #B53C2C;
+  }
+
+  /* Active link styles */
+  li.jump-links > a.active {
+    font-weight: 500;
+    color: black;
+  }
+
+  li.jump-links > a {
+    padding-left: 10px;
+    color: black;
+  }
+
+  div.footnote:first-of-type {
+    border-top: 1px solid black;
+    width: 100%;
+    display: block;
+  }
+
+  /*Columbia specific fix*/
+  /*Columbia/HTML Law box special footnotes data almost always starts with fn1*/
+  footnote_body sup#fn1 {
+    padding-top: 10px;
+    border-top: 1px solid black;
+    width: 100%;
+    display: block;
+  }
+
+  /*HTML law box page numbers*/
+  strong[data-ref] {
+    font-size: 0.8em;
+    font-style: italic;
+  }
+
+  strong[data-ref]::before {
+    content: attr(data-ref);
+    display: inline;
+    position: relative;
+    float: right;
+    left: -.5em;
+    font-size: 0.8em;
+    color: dimgray;
+    width: 0;
+  }
+
+  div.footnote {
+    padding-top: 10px;
+    display: block;
+    line-height: 1em;
+    font-size: 12px;
+  }
+
+  div.footnote > p {
+    display: inline;
+  }
+
+  div.footnote::before {
+    content: attr(label) " ";
+    font-weight: bold;
+    color: #000;
+    margin-right: 5px;
+    padding-top: 2em;
+  }
+
+  div.footnote > * {
+    padding-top: 10px;
+    font-size: 12px;
+  }
+
+  /*To help separate footnotes from the opinion document*/
+  footnote:first-of-type {
+    border-top: 1px solid black;
+    width: 100%;
+    display: block;
+  }
+
+  footnote {
+    padding-top: 10px;
+    display: block;
+    line-height: 1.5em;
+    padding-left: 40px;
+    font-size: 12px;
+  }
+
+  footnote > p {
+    display: inline;
+  }
+
+  footnote::before {
+    content: attr(label);
+    font-weight: bold;
+    color: #000;
+    margin-right: 26px;
+    padding-top: 2em;
+    margin-left: -35px;
+  }
+
+  /*Handle CSS in Columbia opinions*/
+  footnotemark {
+    font-weight: bold;
+    font-size: 0.8em;
+    vertical-align: super;
+    line-height: 0;
+    cursor: pointer;
+    color: blue;
+    text-decoration: underline;
+  }
+
+  #cited-by {
+    z-index: 1;
+  }
+
+  .jumpback {
+    color: blue;
+    cursor: pointer;
+    font-weight: bold;
+    margin-left: 5px;
+  }
+
+  footnote > * {
+    font-size: 12px;
+  }
+
+  author > page-number {
+    display: block;
+    font-size: 15px;
+  }
+
+  author {
+    display: inline;
+    margin: 0; /* Remove any default margin */
+    text-indent: 2em; /* Indents the first line by 2em */
+  }
+
+  /*Important for indenting harvard opinions correctly*/
+  opinion > p[id^="b"] {
+    text-indent: 2em;
+  }
+
+  opinion > [id^="p-"] {
+    padding-left: 2em;
+    text-indent: 2em;
+  }
+}
+
+[id^="A"] {
+  text-indent: 2em;
+  display: inline;
+}
+
+.opinion-body {
+  /* Hiding inactive tab panes is handled by Bootstrap's .tab-pane rules,
+     so only the active state needs styling here. */
+  .tab-pane.active {
+    display: block;
+  }
+
+  @media (min-width: 767px) {
+    #sidebar {
+      display: flex;
+      flex-direction: column;
+      height: 100vh;
+      justify-content: space-between; /* Push content apart */
+      padding: 20px;
+      padding-top: 3px;
+      overflow-y: auto;
+      position: -webkit-sticky; /* For Safari */
+      position: sticky;
+      top: 0; /* Stick to the top of the viewport */
+    }
+  }
+
+  @media (max-width: 767px) {
+    #sidebar {
+      height: auto;
+    }
+  }
+
+  .sidebar-bottom {
+    margin-top: auto;
+  }
+
+  .support-flp, .sponsored-by {
+    margin-bottom: 20px;
+    text-align: center;
+  }
+
+  #opinion > article > * > p {
+    text-indent: 2em;
+  }
+
+  .active > a {
+    border-bottom-color: #B53C2C;
+  }
+
+  #opinion p {
+    text-indent: 2em;
+  }
+
+  .nav-pills > li > a {
+    padding: 1px 15px;
+  }
+
+  blockquote > * {
+    text-indent: 0em;
+  }
+
+  sup {
+    font-size: .9em;
+  }
+
+  .main-document {
+    padding-bottom: 5em;
+  }
+
+  /*Case Caption CSS*/
+  #caption-square {
+    background-color: #F6F2EE;
+    margin-left: -15px;
+    margin-right: -15px;
+    margin-top: -20px;
+  }
+
+  #caption-square > ul > li {
+    background-color: #fcfaf9;
+    border-top-right-radius: 5px 5px; /* Rounds the corners */
+    border-top-left-radius: 5px 5px; /* Rounds the corners */
+    margin-left: 4px;
+  }
+
+  #caption-square > ul > li.active {
+    background-color: #ffffff;
+    border-bottom: 1px solid white;
+  }
+
+  #caption-square > ul > li.active > a {
+    border: 1px solid white;
+  }
+
+  /*Opinion Date Filed*/
+  .case-date-new {
+    border: 1px solid #B53C2C;
+    border-radius: 20px; /* Rounds the corners */
+    padding: 5px;
+    padding-left: 8px;
+    padding-right: 8px;
+    padding-top: 8px;
+    color: #B53C2C;
+  }
+
+  /*Buttons on Top of Page*/
+  .add-a-note {
+    margin-left: 5px;
+    border: 1px solid black;
+    border-radius: 10px;
+    padding-left: 8px;
+    padding-right: 8px;
+  }
+
+  .add-citation-alert {
+    border: 1px solid black;
+    border-radius: 10px;
+    padding-left: 8px;
+    padding-right: 8px;
+  }
+
+  cross_reference {
+    font-style: italic;
+  }
+
+  #opinion-caption {
+    margin-top: 20px;
+    font-family: Merriweather, "Times New Roman", Times, serif;
+    font-size: 15px;
+    letter-spacing: 0.2px;
+    line-height: 2.3em;
+    margin-bottom: 20px;
+    padding-left: 20px;
+    padding-top: 10px;
+    padding-right: 10px;
+  }
+
+  .case-details {
+    font-size: 16px;
+  }
+
+  .case-details li {
+    line-height: 1.5em;
+  }
+
+  span.citation.no-link {
+    font-style: italic;
+  }
+
+  .opinion-button-row {
+    padding-top: 40px;
+  }
+
+  #download-original {
+    color: black;
+    border-color: black;
+    background-color: white;
+    vertical-align: top;
+    float: right;
+    display: block;
+  }
+
+  #btn-group-download-original {
+    float: right;
+    margin-top: 0px;
+    margin-left: 10px;
+    padding-right: 10px;
+  }
+
+  #add-note-button {
+    color: black;
+    border-color: black;
+    background-color: white;
+    vertical-align: top;
+    float: right;
+  }
+
+  #get-citation-btn-group {
+    float: right;
+  }
+
+  #get-citation-btn-group > a {
+    color: black;
+    border-color: black;
+    background-color: white;
+    vertical-align: top;
+  }
+
+  p > span.star-pagination::after {
+    display: inline;
+    position: relative;
+    content: attr(label);
+    float: left;
+    left: -4.5em;
+    font-size: 1em;
+    color: dimgray;
+    width: 0;
+  }
+
+  div > span.star-pagination::after {
+    display: inline;
+    position: relative;
+    content: attr(label);
+    float: left;
+    left: -2.5em;
+    font-size: 1em;
+    color: dimgray;
+    width: 0;
+  }
+
+  div.subopinion-content > .harvard {
+    font-family: Merriweather, "Times New Roman", Times, serif;
+    font-size: 15px;
+    letter-spacing: 0.2px;
+    line-height: 2.3em;
+    text-align: justify;
+  }
+
+  #columbia-text {
+    font-family: Merriweather, "Times New Roman", Times, serif;
+    font-size: 15px;
+    letter-spacing: 0.2px;
+    line-height: 2.3em;
+    text-align: justify;
+  }
+
+  #columbia-text > div.subopinion-content > div > p > span.star-pagination {
+    color: #555555;
+  }
+
+  #columbia-text > div.subopinion-content > div > p > span.star-pagination::after {
+    display: inline;
+    position: relative;
+    content: attr(label);
+    float: left;
+    left: -4.5em;
+    font-size: 1em;
+    color: dimgray;
+    width: 0;
+  }
+
+  page-number::after {
+    display: inline;
+    position: relative;
+    content: attr(label);
+    float: right;
+    font-size: 1em;
+    color: dimgray;
+    width: 0;
+  }
+
+  page-number {
+    font-style: italic;
+    font-size: 0.8em;
+    margin-right: 4px;
+    margin-left: 2px;
+  }
+
+  a.page-label {
+    font-style: italic;
+    font-size: 0.8em;
+    margin-right: 4px;
+    margin-left: 2px;
+    color: #555555;
+  }
+
+  a.page-label::after {
+    display: inline;
+    position: relative;
+    content: attr(data-label);
+    float: right;
+    font-size: 1em;
+    color: dimgray;
+    width: 0;
+  }
+
+  footnote > blockquote > a.page-label::after {
+    right: -2.5em;
+  }
+
+  blockquote[id^="A"] > a.page-label::after {
+    right: -2.5em;
+  }
+
+  blockquote[id^="b"] > a.page-label::after {
+    right: -4.0em;
+  }
+
+  opinion > a.page-label::after {
+    right: -2.5em;
+  }
+
+  /* Adjust to move the entire blockquote to the right */
+  blockquote {
+    margin-left: 3em;
+  }
+
+  footnote > p > a.page-label::after {
+    display: none;
+  }
+
+  footnote > blockquote > a.page-label::after {
+    display: none;
+  }
+
+  /*Remove the header on the opinion page so it's flush*/
+  header {
+    margin-bottom: 0px;
+  }
+
+  .harvard > opinion > author {
+    line-height: inherit;
+    font-size: inherit;
+    display: inline-block;
+  }
+
+  .container > .content {
+    margin-bottom: 0em;
+  }
+
+  .meta-data-header {
+    font-size: 15px;
+  }
+
+  .case-details {
+    font-family: Merriweather, "Times New Roman", Times, serif;
+    letter-spacing: 0.2px;
+    line-height: 2.3em;
+  }
+
+  .opinion-section-title {
+    margin-top: 50px;
+    font-family: Merriweather, "Times New Roman", Times, serif;
+  }
+
+  /*Add style to align roman numerals */
+  .center-header {
+    text-align: center;
+    font-size: 2em;
+  }
+
+  /*If XS screen - remove the side page labels*/
+  @media (max-width: 768px) {
+    a.page-label::after {
+      display: none;
+    }
+    a.page-number::after {
+      display: none;
+    }
+  }
+}
+
+html {
+  scroll-behavior: smooth;
+}
diff --git a/cl/assets/static-global/js/base.js b/cl/assets/static-global/js/base.js
index 99355aa207..31713c0df5 100644
--- a/cl/assets/static-global/js/base.js
+++ b/cl/assets/static-global/js/base.js
@@ -307,11 +307,8 @@ $(document).ready(function () {
   if (modal_exist) {
     $('#open-modal-on-load').modal();
   }
-
 });
-
-
 // Debounce - rate limit a function
 // https://davidwalsh.name/javascript-debounce-function
 function debounce(func, wait, immediate) {
@@ -369,3 +366,271 @@ if (form && button) {
     button.disabled = true;
   });
 }
+
+//////////////////
+// SCOTUS STYLE //
+//////////////////
+
+document.querySelectorAll('p').forEach(function (element) {
+  // Center likely Roman-numeral headers; this improves SCOTUS opinions
+  if (element.textContent.trim().length < 5) {
+    element.classList.add('center-header');
+  }
+});
+
+////////////////
+// Pagination //
+////////////////
+
+$('.star-pagination').each(function (index, element) {
+  $(this).attr('label', this.textContent.trim().replace('*Page ', ''));
+});
+
+// Systematize page numbers
+$('page-number').each(function (index, element) {
+  // Get the label and citation index from the current element
+  const label = $(this).attr('label');
+  const citationIndex = $(this).attr('citation-index');
+
+  // Clean up the label (remove '*') and use it for the new href and id
+  const cleanLabel = label.replace('*', '').trim();
+
+  // Create the new anchor element
+  const $newAnchor = $('<a></a>')
+    .addClass('page-label')
+    .attr('data-citation-index', citationIndex)
+    .attr('data-label', cleanLabel)
+    .attr('href', '#' + cleanLabel)
+    .attr('id', cleanLabel)
+    .text('*' + cleanLabel);
+
+  // Replace the page-number element with the new anchor
+  $(this).replaceWith($newAnchor);
+});
+
+// Systematize page numbers
+$('span.star-pagination').each(function (index, element) {
+  // Get the label and citation index from the current element
+  const label = $(this).attr('label');
+  const citationIndex = $(this).attr('citation-index');
+
+  // Clean up the label (remove '*') and use it for the new href and id
+  const cleanLabel = label.replace('*', '').trim();
+
+  // Create the new anchor element
+  const $newAnchor = $('<a></a>')
+    .addClass('page-label')
+    .attr('data-citation-index', citationIndex)
+    .attr('data-label', cleanLabel)
+    .attr('href', '#' + cleanLabel)
+    .attr('id', cleanLabel)
+    .text('*' + cleanLabel);
+
+  // Replace the star-pagination span with the new anchor
+  $(this).replaceWith($newAnchor);
+});
+
+// Fix weird data-ref bug
+document.querySelectorAll('strong').forEach((el) => {
+  if (/\[\d+\]/.test(el.textContent)) {
+    // Check if the text matches the pattern [XXX]
+    const match = el.textContent.match(/\[\d+\]/)[0]; // Get the matched pattern
+    el.setAttribute('data-ref', match); // Set a data-ref attribute
+  }
+});
+
+///////////////
+// Footnotes //
+///////////////
+
+// We formatted the harvard footnotes oddly when they appeared inside the
+// pre-opinion content. This removes the excess anchor tags and allows us to
+// standardize footnotes across our contents.
+// Footnote cleanup in harvard: update and modify footnotes to enable linking.
+$('div.footnote > a').remove();
+const headfootnotemarks = $('a.footnote');
+const divfootnotes = $('div.footnote');
+
+if (headfootnotemarks.length === divfootnotes.length) {
+  headfootnotemarks.each(function (index) {
+    const footnoteMark = $(this);
+    const footnote = divfootnotes.eq(index);
+
+    const $newElement = $('<footnotemark></footnotemark>');
+    // Copy attributes from the old element
+    $.each(footnoteMark[0].attributes, function () {
+      if (this.specified) {
+        $newElement.attr(this.name, this.value);
+      }
+    });
+    $newElement.html(footnoteMark.html());
+    footnoteMark.replaceWith($newElement);
+
+    const $newFootnote = $('<footnote></footnote>');
+    $.each(footnote[0].attributes, function () {
+      if (this.specified) {
+        $newFootnote.attr(this.name, this.value);
+      }
+    });
+    $newFootnote.attr('label', footnote.attr('label'));
+    $newFootnote.html(footnote.html());
+    footnote.replaceWith($newFootnote);
+  });
+}
+
+// This fixes many of the harvard footnotes so that they can easily link back
+// and forth - we have a second set of harvard footnotes inside headnotes
+// that need to be parsed out now.
+
+const footnoteMarks = $('footnotemark');
+const footnotes = $('footnote').not('[orphan="true"]');
+
+if (footnoteMarks.length === footnotes.length) {
+  // we can make this work
+  footnoteMarks.each(function (index) {
+    const footnoteMark = $(this);
+    const $newElement = $('<a></a>');
+    // Copy attributes from the old element
+    $.each(footnoteMark[0].attributes, function () {
+      if (this.specified) {
+        $newElement.attr(this.name, this.value);
+      }
+    });
+    $newElement.html(footnoteMark.html());
+    const $supElement = $('<sup></sup>').append($newElement);
+    footnoteMark.replaceWith($supElement);
+    const footnote = footnotes.eq(index);
+    $newElement.attr('href', `#fn${index}`);
+    $newElement.attr('id', `fnref${index}`);
+    footnote.attr('id', `fn${index}`);
+
+    const $jumpback = $('<a class="jumpback">↵</a>');
+    $jumpback.attr('href', `#fnref${index}`);
+
+    footnote.append($jumpback);
+  });
+} else {
+  // If the number of footnotes and footnotemarks is inconsistent, use this
+  // method to scroll to the nearest one. We don't use it by default because
+  // many older opinions reuse *, ^ and other icons repeatedly on every page,
+  // so the label is not usable to identify the correct footnote.
+
+  footnotes.each(function (index) {
+    const $jumpback = $('<a class="jumpback">↵</a>');
+    $jumpback.attr('label', $(this).attr('label'));
+    $(this).append($jumpback);
+  });
+
+  // There is no silver bullet for footnotes
+  $('footnotemark').on('click', function () {
+    const markText = $(this).text().trim(); // Get the text of the clicked footnotemark
+    const currentScrollPosition = $(window).scrollTop(); // Get the current scroll position
+
+    // Find the first matching footnote below the current scroll position
+    const targetFootnote = $('footnote')
+      .filter(function () {
+        return $(this).attr('label') === markText && $(this).offset().top > currentScrollPosition;
+      })
+      .first();
+
+    // If a matching footnote is found, scroll to it
+    if (targetFootnote.length > 0) {
+      $('html, body').animate(
+        {
+          scrollTop: targetFootnote.offset().top,
+        },
+        500
+      ); // Adjust the animation duration as needed
+    } else {
+      console.warn('No matching footnote found below the current position for:', markText);
+    }
+  });
+
+  /////////////
+  // Sidebar //
+  /////////////
+
+  $('.jumpback').on('click', function () {
+    const footnoteLabel = $(this).attr('label').trim(); // Get the label attribute of the clicked footnote
+    const currentScrollPosition = $(window).scrollTop(); // Get the current scroll position
+
+    // Find the last matching footnotemark above the current scroll position
+    const targetFootnotemark = $('footnotemark')
+      .filter(function () {
+        return $(this).text().trim() === footnoteLabel && $(this).offset().top < currentScrollPosition;
+      })
+      .last();
+
+    // If a matching footnotemark is found, scroll to it
+    if (targetFootnotemark.length > 0) {
+      $('html, body').animate(
+        {
+          scrollTop: targetFootnotemark.offset().top,
+        },
+        500
+      ); // Adjust the animation duration as needed
+    } else {
+      console.warn('No matching footnotemark found above the current position for label:', footnoteLabel);
+    }
+  });
+}
+
+$(document).ready(function () {
+  function adjustSidebarHeight() {
+    if ($(window).width() > 767) {
+      // Only apply the height adjustment for screens wider than 767px
+      var scrollTop = $(window).scrollTop();
+      if (scrollTop <= 175) {
+        $('.opinion-sidebar').css('height', 'calc(100vh - ' + (175 -
scrollTop) + 'px)'); + // $('.main-document').css('height', 'calc(100vh + ' + (scrollTop) + 'px)'); + } else { + $('.opinion-sidebar').css('height', '100vh'); + } + } else { + $('.opinion-sidebar').css('height', 'auto'); // Reset height for mobile view + } + } + + // Adjust height on document ready and when window is scrolled or resized + adjustSidebarHeight(); + $(window).on('scroll resize', adjustSidebarHeight); +}); + +// Update sidebar to show where we are on the page +document.addEventListener('scroll', function () { + let sections = document.querySelectorAll('.jump-link'); + let links = document.querySelectorAll('.jump-links > a'); + let currentSection = ''; + + // Determine which section is currently in view + sections.forEach((section) => { + let sectionTop = section.offsetTop; + let sectionHeight = section.offsetHeight; + if (window.scrollY >= sectionTop - sectionHeight / 3) { + currentSection = section.getAttribute('id'); + } + }); + + // Remove the active class from all links and their parent elements + links.forEach((link) => { + link.classList.remove('active'); + if (link.parentElement) { + link.parentElement.classList.remove('active'); + } + }); + + // Add the active class to the link and its parent that corresponds to the current section + links.forEach((link) => { + if (link.getAttribute('href') === `#${currentSection}`) { + link.classList.add('active'); + if (link.parentElement) { + link.parentElement.classList.add('active'); + } + } + }); +}); diff --git a/cl/opinion_page/templates/includes/add_download_button.html b/cl/opinion_page/templates/includes/add_download_button.html new file mode 100644 index 0000000000..1d7a4d828e --- /dev/null +++ b/cl/opinion_page/templates/includes/add_download_button.html @@ -0,0 +1,46 @@ +
+ + +
diff --git a/cl/opinion_page/templates/includes/add_note_button.html b/cl/opinion_page/templates/includes/add_note_button.html index c5392897e8..fb5fdaac40 100644 --- a/cl/opinion_page/templates/includes/add_note_button.html +++ b/cl/opinion_page/templates/includes/add_note_button.html @@ -3,4 +3,4 @@ data-toggle="modal" data-target="#modal-save-note, #modal-logged-out" title="{% if form_instance_id %}Edit this note{% else %}Save this record as a note in your profile{% endif %}"> - {% if form_instance_id %}Edit Note{% else %}Add Note{% endif %} + diff --git a/cl/opinion_page/templates/includes/opinion_tabs.html b/cl/opinion_page/templates/includes/opinion_tabs.html new file mode 100644 index 0000000000..9a5334847d --- /dev/null +++ b/cl/opinion_page/templates/includes/opinion_tabs.html @@ -0,0 +1,337 @@ +{% load humanize %} +{% load text_filters %} + +{% if tab == "authorities" %} +{# Table of Authorities #} +
+ +
+
+ {% for authority in authorities_with_data %} +
+

+ + {{ authority.caption|safe|v_wrapper }} + +

+
+
+ + +
+
+ + +
+
+ + +
+
+
+
+ +
+
+
+
+ {% endfor %} +
+
+{#{% elif tab == "details" %}#} +{# {% include "includes/tab_details.html" %}#} +{% elif tab == "summaries" %} + {# Summaries #} +
+ +
+
+
    + {% for group in parenthetical_groups %} + {% with representative=group.representative %} + {% with representative_cluster=representative.describing_opinion.cluster %} +
    +

    + + {{ representative_cluster|best_case_name|safe }} + +

    +
    +
    + + +
    +
    + + +
    +
    + + +
    +
    +
    +
  • + {{ representative.text|capfirst }} -- +
    + +
  • +
      + {% for summary in group.parentheticals.all %} + {% with describing_cluster=summary.describing_opinion.cluster %} + {% if summary != representative %} +
    • + {{ summary.text|capfirst }} +
      + {{ describing_cluster.date_filed }} + + + {{ describing_cluster|best_case_name|safe }} + + + {{ describing_cluster.docket.court }} +
    • + {% endif %} + {% endwith %} + {% endfor %} +
    + {% endwith %} + {% endwith %} + {% endfor %} +
+
+
+{% elif tab == "cited-by" %} + {# Cited By #} +
+ +
+ + {% if citing_cluster_count > 0 %} + {% for citing_cluster in citing_clusters %} + + {% endfor %} + {% else %} +

This case has not yet been cited in our system.

+ {% endif %} + +
+

+ View Citing Opinions +

+
+ +{% elif tab == "related-cases" %} + {# Related Cases #} + + +{% elif tab == "pdf" %} + {# PDF #} +
+
+ +
+
+
+ +
+
+
+

Oops! Your browser does not support embedded PDF viewing.

+
+ {% include "includes/rd_download_button.html" %} +
+
+
+
+
+
+{% else %} + + {# The section of the document I refer to as headmatter goes here #} +
+
+ {% with opinion_count=cluster.sub_opinions.all.count %} + {% if cluster.headnotes %} + +
+

{{ cluster.headnotes|safe }}

+ {% endif %} + + {% if cluster.headmatter %} + +
+
+ {{ cluster.headmatter|safe }} +
+ {% endif %} + + {% for sub_opinion in cluster.ordered_opinions %} + +
+ + {% if 'U' in cluster.source %} +
+ {% elif 'Z' in cluster.source %} +
+ {% elif 'L' in cluster.source %} +
+ {% elif 'R' in cluster.source %} +
+ {% else %} +
+ {% endif %} + +
+ {% if sub_opinion.xml_harvard and sub_opinion.html_with_citations %} +
{{ sub_opinion.html_with_citations|safe }}
+ {% elif sub_opinion.xml_harvard %} +
{{ sub_opinion.xml_harvard|safe }}
+ {% elif sub_opinion.html_with_citations %} + {% if cluster.source == "C" %} + {# It's a PDF with no HTML enrichment#} +
{{ sub_opinion.html_with_citations|safe|linebreaksbr }}
+ {% else %} +
{{ sub_opinion.html_with_citations|safe }}
+ {% endif %} + {% elif sub_opinion.html_columbia %} +
{{ sub_opinion.html_columbia|safe }}
+ {% elif sub_opinion.html_lawbox %} +
{{ sub_opinion.html_lawbox|safe }}
+ {% elif sub_opinion.html_anon_2020 %} +
{{ sub_opinion.html_anon_2020|safe }}
+ {% elif sub_opinion.html %} +
{{sub_opinion.html|safe}}
+ {% else %} +
{{sub_opinion.plain_text}}
+ {% endif %} +
+ + {% endfor %} + {% endwith %} +
+
+ +{% endif %} \ No newline at end of file diff --git a/cl/opinion_page/templates/opinion.html b/cl/opinion_page/templates/opinion.html index 16a33820fd..a0c4c797c7 100644 --- a/cl/opinion_page/templates/opinion.html +++ b/cl/opinion_page/templates/opinion.html @@ -100,7 +100,7 @@

Summaries ({{ summaries_count|intcomma }})

{% endfor %}

- View All Summaries diff --git a/cl/opinion_page/templates/opinions.html b/cl/opinion_page/templates/opinions.html new file mode 100644 index 0000000000..a32f1d0042 --- /dev/null +++ b/cl/opinion_page/templates/opinions.html @@ -0,0 +1,346 @@ +{% extends "base.html" %} +{% load extras %} +{% load humanize %} +{% load static %} +{% load text_filters %} + + +{% block canonical %}{% get_canonical_element %}{% endblock %} +{% block title %}{{ title }} – CourtListener.com{% endblock %} +{% block og_title %}{{ title }} – CourtListener.com{% endblock %} +{% block description %}{{ title }} — Brought to you by Free Law Project, a non-profit dedicated to creating high quality open legal information.{% endblock %} +{% block og_description %}{{ cluster|best_case_name }}{% if summaries_count > 0 %} — {{ top_parenthetical_groups.0.representative.text|capfirst }}{% else %} — Brought to you by Free Law Project, a non-profit dedicated to creating high quality open legal information.{% endif %} +{% endblock %} + +{% block head %} + +{% endblock %} + +{% block navbar-o %}active{% endblock %} + + +{% block sidebar %} + {% with sponsored_logo=STATIC_URL|add:'img/vlex-logo-150-75.png' %} + + + + {% endwith %} +{% endblock %} + +{% block body-classes %}opinion-body{% endblock %} + +{% block content %} + +
+
+ +
+ {{ cluster.date_filed }} + {% include "includes/add_note_button.html" with form_instance_id=note_form.instance.cluster_id %} + + {% if pdf_path %} + {% include "includes/add_download_button.html" %} + {% endif %} + + + + + +

{{ cluster.docket.court }}

+
+
+
    +
  • Citations: {{ cluster.citation_string|default:"None known" }}
  • + + {% if cluster.case_name_full != cluster.case_name and cluster.case_name_full != "" %} +
  • Full Case Name: + {{ cluster.case_name_full }} +
  • + {% endif %} + + {% if cluster.docket.court_id != "olc" %} +
  • Docket Number: {{ cluster.docket.docket_number|default:"Unknown" }}
  • + {% endif %} + + {% if cluster.get_precedential_status_display != "Precedential" %} +
  • Precedential Status: {{ cluster.get_precedential_status_display|default:"Unknown" }}
  •
+      {% endif %}
+
+      {% if cluster.docket.court_id == 'scotus' and cluster.scdb_id %}
+
  • Supreme Court DB ID: + + {{ cluster.scdb_id }} + +
  • + {% endif %} + + {% if cluster.panel.all.count > 0 %} +
  • Panel: + {% for p in cluster.panel.all %} + {{ p.name_full }}{% if not forloop.last %}, {% endif %} + {% endfor %} +
  • + {% endif %} + + {% if cluster.judges %} +
  • Judges: {{ cluster.judges }}
  • + {% endif %} + + {% if opinion.author %} +
  • Author: {{ opinion.author.name_full }}
  • + {% endif %} + + {% if opinion.joined_by.all.count > 0 %} +
  • Joined By: + {% for p in opinion.joined_by.all %} + {{ p.name_full }}{% if not forloop.last %}, {% endif %} + {% endfor %} +
  • + {% endif %} + + {% if cluster.nature_of_suit %} +
  • Nature of Suit: {{ cluster.nature_of_suit }}
  •
+      {% endif %}
+
+      {% if cluster.posture %}
+
  • Posture: {{ cluster.posture }}
  •
+      {% endif %}
+
+      {% if cluster.other_dates %}
+
  • Other Dates: {{ cluster.other_dates }}
  • + {% endif %} + + {% if cluster.disposition %} +
  • Disposition: {{ cluster.disposition }}
  • + {% endif %} +
+
+
+ + +
+ {% include "includes/opinion_tabs.html" %} + {% include "includes/notes_modal.html" %} + +
+{% endblock %} + + +{% block footer-scripts %} + + + {% if request.user.is_staff %} + + {% if DEBUG %} + + {% else %} + + {% endif %} + {% endif %} +{% endblock %} diff --git a/cl/search/models.py b/cl/search/models.py index a0c808f3d3..3bacd929ab 100644 --- a/cl/search/models.py +++ b/cl/search/models.py @@ -2852,6 +2852,26 @@ def caption(self): caption += f" ({court} {year})" return caption + @property + def display_citation(self): + citation_list = [citation for citation in self.citations.all()] + citations = sorted(citation_list, key=sort_cites) + citation = "" + if not citations: + return "" + else: + if citations[0].type == Citation.NEUTRAL: + return citations[0] + elif ( + len(citations) >= 2 + and citations[0].type == Citation.WEST + and citations[1].type == Citation.LEXIS + ): + citation += f"{citations[0]}, {citations[1]}" + else: + citation += f"{citations[0]}" + return citation + @property def citation_string(self): """Make a citation string, joined by commas""" @@ -2991,6 +3011,18 @@ def __str__(self) -> str: def get_absolute_url(self) -> str: return reverse("view_case", args=[self.pk, self.slug]) + def ordered_opinions(self): + # Fetch all sub-opinions ordered by ordering_key + sub_opinions = self.sub_opinions.all().order_by("ordering_key") + + # Check if there is more than one sub-opinion + if sub_opinions.count() > 1: + # Return only sub-opinions with an ordering key + return sub_opinions.exclude(ordering_key__isnull=True) + + # If there's only one or no sub-opinions, return the main opinion + return sub_opinions + def save( self, update_fields=None, From 2ac21c6b3033d1ba97b3daa00d4c4c36bb514683 Mon Sep 17 00:00:00 2001 From: William Palin Date: Thu, 17 Oct 2024 15:31:12 -0400 Subject: [PATCH 003/143] feat(opinion.urls): Add/remove new endpoints Add multiple tab specific endpoints --- cl/opinion_page/urls.py | 48 ++++++++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 12 deletions(-) diff --git a/cl/opinion_page/urls.py b/cl/opinion_page/urls.py index 5e7a9e1a54..be8c9214d8 100644 --- a/cl/opinion_page/urls.py +++ b/cl/opinion_page/urls.py @@ -12,14 +12,18 @@ download_docket_entries_csv, redirect_docket_recap, redirect_og_lookup, - view_authorities, view_docket, view_docket_feed, view_opinion, + view_opinion_authorities, + view_opinion_cited_by, + view_opinion_details, + view_opinion_pdf, + view_opinion_related_cases, + view_opinion_summaries, view_parties, view_recap_authorities, view_recap_document, - view_summaries, ) urlpatterns = [ @@ -31,16 +35,6 @@ name="court_publish_page", ), # Opinion pages - path( - "opinion///summaries/", - view_summaries, # type: ignore[arg-type] - name="view_summaries", - ), - path( - "opinion///authorities/", - view_authorities, # type: ignore[arg-type] - name="view_authorities", - ), path( "opinion///visualizations/", cluster_visualizations, # type: ignore[arg-type] @@ -52,6 +46,36 @@ name="docket_feed", ), path("opinion///", view_opinion, name="view_case"), # type: ignore[arg-type] + path( + "opinion///details/", + view_opinion_details, + name="view_case_details", + ), # with the tab + path( + "opinion///authorities/", + view_opinion_authorities, + name="view_case_authorities", + ), # with the tab + path( + "opinion///cited-by/", + view_opinion_cited_by, + name="view_case_cited_by", + ), # with the tab + path( + "opinion///summaries/", + view_opinion_summaries, + name="view_case_summaries", + ), # with the tab + path( + "opinion///related-cases/", + view_opinion_related_cases, + name="view_case_related_cases", + ), # 
with the tab + path( + "opinion///pdf/", + view_opinion_pdf, + name="view_case_pdf", + ), # with the tab path( "docket//download/", download_docket_entries_csv, # type: ignore[arg-type] From c260b60bd654a55e58c33b81d1c5044da2e6f4b2 Mon Sep 17 00:00:00 2001 From: William Palin Date: Thu, 17 Oct 2024 15:32:59 -0400 Subject: [PATCH 004/143] feat(opinion.views): Create new view methods Rewrite and waffle the new UI changes Added a number of methods to fetch and/or store related and cited by data quickly Implemented new view opinion with waffles --- cl/opinion_page/utils.py | 305 ++++++++++++++++++++++++++++++++++++- cl/opinion_page/views.py | 314 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 609 insertions(+), 10 deletions(-) diff --git a/cl/opinion_page/utils.py b/cl/opinion_page/utils.py index 160453bb1f..b8d5e581dc 100644 --- a/cl/opinion_page/utils.py +++ b/cl/opinion_page/utils.py @@ -157,8 +157,19 @@ async def build_cites_clusters_query( cluster_cites_query = cluster_search.query(cites_query) search_query = ( cluster_cites_query.sort({"citeCount": {"order": "desc"}}) - .source(includes=["absolute_url", "caseName", "dateFiled"]) - .extra(size=5, track_total_hits=True) + .source( + includes=[ + "absolute_url", + "caseName", + "cluster_id", + "docketNumber", + "citation", + "status", + "dateFiled", + ] + ) + .extra(size=20, track_total_hits=True) + .collapse(field="cluster_id") ) return search_query @@ -192,8 +203,18 @@ async def build_related_clusters_query( cluster_related_query = cluster_search.query(main_query) search_query = ( cluster_related_query.sort({"_score": {"order": "desc"}}) - .source(includes=["absolute_url", "caseName", "cluster_id"]) - .extra(size=5) + .source( + includes=[ + "absolute_url", + "caseName", + "cluster_id", + "docketNumber", + "citations", + "status", + "dateFiled", + ] + ) + .extra(size=20) .collapse(field="cluster_id") ) return search_query @@ -211,6 +232,202 @@ class RelatedCitingResults: timeout: bool = False +@dataclass +class RelatedClusterResults: + related_clusters: list[OpinionClusterDocument] = field( + default_factory=list + ) + sub_opinion_pks: list[int] = field(default_factory=list) + url_search_params: dict[str, str] = field(default_factory=dict) + timeout: bool = False + has_related_cases: bool = False + + +async def es_get_related_clusters_with_cache( + cluster: OpinionCluster, + request: HttpRequest, +) -> RelatedClusterResults: + """Elastic Related Clusters Search or Cache + + :param cluster:The cluster to use + :param request:The user request + :return:Related Cluster Data + """ + cache = caches["db_cache"] + mlt_cache_key = f"clusters-mlt-es:{cluster.pk}" + # By default, all statuses are included. Retrieve the PRECEDENTIAL_STATUS + # attributes (since they're indexed in ES) instead of the NAMES values. 
search_params: CleanData = {}
+    url_search_params = {
+        f"stat_{v[0]}": "on" for v in PRECEDENTIAL_STATUS.NAMES
+    }
+    sub_opinion_pks = [
+        str(pk)
+        async for pk in cluster.sub_opinions.values_list("pk", flat=True)
+    ]
+    if settings.RELATED_FILTER_BY_STATUS:
+        # Filter results by status (e.g., Precedential)
+        # Update URL parameters accordingly
+        search_params[
+            f"stat_{PRECEDENTIAL_STATUS.get_status_value(settings.RELATED_FILTER_BY_STATUS)}"
+        ] = True
+        url_search_params = {
+            f"stat_{PRECEDENTIAL_STATUS.get_status_value(settings.RELATED_FILTER_BY_STATUS)}": "on"
+        }
+
+    related_cluster_result = RelatedClusterResults(
+        url_search_params=url_search_params
+    )
+
+    if is_bot(request) or not sub_opinion_pks:
+        return related_cluster_result
+
+    cached_related_clusters, timeout_related = (
+        await cache.aget(mlt_cache_key) or (None, False)
+        if settings.RELATED_USE_CACHE
+        else (None, False)
+    )
+
+    if cached_related_clusters is not None:
+        related_cluster_result.related_clusters = cached_related_clusters
+        related_cluster_result.timeout = timeout_related
+        related_cluster_result.has_related_cases = bool(
+            cached_related_clusters
+        )
+        return related_cluster_result
+
+    # No cached results: prepare and run the related clusters query.
+    cluster_search = OpinionClusterDocument.search()
+    related_query = await build_related_clusters_query(
+        cluster_search, sub_opinion_pks, search_params
+    )
+
+    related_query = related_query.params(
+        timeout=f"{settings.ELASTICSEARCH_FAST_QUERIES_TIMEOUT}s"
+    )
+    related_query = related_query.extra(
+        size=settings.RELATED_COUNT, track_total_hits=False
+    )
+    try:
+        response = related_query.execute()
+        timeout_related = False
+    except (ConnectionError, RequestError, ApiError) as e:
+        logger.warning("Error getting related clusters: %s", e)
+        if settings.DEBUG is True:
+            traceback.print_exc()
+        return related_cluster_result
+    except ConnectionTimeout as e:
+        logger.warning("ConnectionTimeout getting related clusters: %s", e)
+        response = None
+        timeout_related = True
+
+    related_cluster_result.related_clusters = (
+        response if response is not None else []
+    )
+    related_cluster_result.timeout = timeout_related
+    related_cluster_result.sub_opinion_pks = list(map(int, sub_opinion_pks))
+    related_cluster_result.has_related_cases = bool(response)
+
+    if not timeout_related:
+        # Cache the same two-tuple shape that is read back above.
+        await cache.aset(
+            mlt_cache_key,
+            (related_cluster_result.related_clusters, timeout_related),
+            settings.RELATED_CACHE_TIMEOUT,
+        )
+    return related_cluster_result
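The cache entry holds the same two-tuple the function reads back, so a warm
cache short-circuits the ES query entirely. A sketch of how a caller might
consume the helper (the context keys here are illustrative, not part of this
patch):

    from urllib.parse import urlencode

    # Inside an async view: a cache hit or a fast ES query either way.
    related = await es_get_related_clusters_with_cache(cluster, request)
    context = {
        "related_clusters": related.related_clusters,
        "related_search_params": "&" + urlencode(related.url_search_params),
        "queries_timeout": related.timeout,
    }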
+
+
+async def es_get_cited_clusters_with_cache(
+    cluster: OpinionCluster,
+    request: HttpRequest,
+):
+    """Elastic cited by cluster search or cache
+
+    :param cluster: The cluster to check
+    :param request: The user request
+    :return: A three-tuple: the citing clusters, their count, and whether
+        the query timed out
+    """
+    cache = caches["db_cache"]
+    cache_citing_key = f"clusters-cited-es:{cluster.pk}"
+
+    sub_opinion_pks = [
+        str(pk)
+        async for pk in cluster.sub_opinions.values_list("pk", flat=True)
+    ]
+    if is_bot(request) or not sub_opinion_pks:
+        return [], 0, False
+
+    cached_citing_results, cached_citing_clusters_count, timeout_cited = (
+        await cache.aget(cache_citing_key) or (None, 0, False)
+        if settings.RELATED_USE_CACHE
+        else (None, 0, False)
+    )
+
+    if cached_citing_results is not None:
+        return (
+            cached_citing_results,
+            cached_citing_clusters_count,
+            timeout_cited,
+        )
+
+    cluster_search = OpinionClusterDocument.search()
+    cited_query = await build_cites_clusters_query(
+        cluster_search, sub_opinion_pks
+    )
+    try:
+        # Execute the cited-by query
+        response = cited_query.execute()
+        timeout_cited = False
+    except (ConnectionError, RequestError, ApiError) as e:
+        logger.warning("Error getting cited clusters: %s", e)
+        if settings.DEBUG is True:
+            traceback.print_exc()
+        return [], 0, False
+    except ConnectionTimeout as e:
+        logger.warning("ConnectionTimeout getting cited clusters: %s", e)
+        response = None
+        timeout_cited = True
+
+    citing_clusters = list(response) if response is not None else []
+    citing_clusters_count = (
+        response.hits.total.value if response is not None else 0
+    )
+
+    if not timeout_cited:
+        await cache.aset(
+            cache_citing_key,
+            (
+                citing_clusters,
+                citing_clusters_count,
+                timeout_cited,
+            ),
+            settings.RELATED_CACHE_TIMEOUT,
+        )
+    return citing_clusters, citing_clusters_count, timeout_cited
+
+
 async def es_get_citing_and_related_clusters_with_cache(
     cluster: OpinionCluster,
     request: HttpRequest,
@@ -251,9 +468,11 @@ async def es_get_citing_and_related_clusters_with_cache(
     if is_bot(request) or not sub_opinion_pks:
         return RelatedCitingResults(url_search_params=url_search_params)

-    cached_citing_results, cached_citing_cluster_count, timeout_cited = (
-        await cache.aget(cache_citing_key) or (None, 0, False)
-    )
+    (
+        cached_citing_results,
+        cached_citing_cluster_count,
+        timeout_cited,
+    ) = await cache.aget(cache_citing_key) or (None, 0, False)
     cached_related_clusters, timeout_related = (
         await cache.aget(mlt_cache_key) or (None, False)
         if settings.RELATED_USE_CACHE
@@ -340,3 +559,75 @@ async def es_get_citing_and_related_clusters_with_cache(
     results.timeout = any([timeout_cited, timeout_related])
     results.sub_opinion_pks = list(map(int, sub_opinion_pks))
     return results
+
+
+async def es_cited_case_count(cluster_id, sub_opinion_pks: list[int]):
+    """Elastic quick cited by count query
+
+    :param cluster_id: The cluster id to search with
+    :param sub_opinion_pks: The sub-opinion ids of the cluster
+    :return: The number of cases citing this cluster
+    """
+    cache = caches["db_cache"]
+    cache_cited_by_key = f"cited-by-count-es:{cluster_id}"
+    cached_cited_by_count = await cache.aget(cache_cited_by_key) or None
+    if cached_cited_by_count is not None:
+        return cached_cited_by_count
+
+    cluster_search = OpinionClusterDocument.search()
+    cites_query = Q(
+        "bool",
+        filter=[
+            Q("match", cluster_child="opinion"),
+            Q("terms", **{"cites": sub_opinion_pks}),
+        ],
+    )
+    cluster_cites_query = cluster_search.query(cites_query)
+    cited_by_count = cluster_cites_query.count()
+
+    await cache.aset(
+        cache_cited_by_key,
+        cited_by_count,
+        settings.RELATED_CACHE_TIMEOUT,
+    )
+
+    return cited_by_count
+
+
+async def es_related_case_count(cluster_id, sub_opinion_pks: list[int]):
+    """Elastic quick related cases count
+
+    :param cluster_id: The cluster id of the object
+    :param sub_opinion_pks: The sub-opinion ids of the cluster
+    :return: The count of related cases in elastic
+    """
+    cache = caches["db_cache"]
+    cache_related_cases_key = f"related-cases-count-es:{cluster_id}"
+    cached_related_cases_count = (
+        await
cache.aget(cache_related_cases_key) or None + ) + if cached_related_cases_count is not None: + return cached_related_cases_count + + cluster_search = OpinionClusterDocument.search() + mlt_query = await build_more_like_this_query(sub_opinion_pks) + parent_filters = await sync_to_async(build_join_es_filters)( + {"type": SEARCH_TYPES.OPINION, "stat_published": True} + ) + default_parent_filter = [Q("match", cluster_child="opinion")] + parent_filters.extend(default_parent_filter) + main_query = Q( + "bool", + filter=default_parent_filter, + should=mlt_query, + minimum_should_match=1, + ) + cluster_related_query = cluster_search.query(main_query) + related_cases_count = cluster_related_query.count() + await cache.aset( + cache_related_cases_key, + related_cases_count, + settings.RELATED_CACHE_TIMEOUT, + ) + + return related_cases_count diff --git a/cl/opinion_page/views.py b/cl/opinion_page/views.py index c96cc3af85..e3f774945a 100644 --- a/cl/opinion_page/views.py +++ b/cl/opinion_page/views.py @@ -72,7 +72,11 @@ from cl.opinion_page.types import AuthoritiesContext from cl.opinion_page.utils import ( core_docket_data, + es_cited_case_count, + es_get_cited_clusters_with_cache, es_get_citing_and_related_clusters_with_cache, + es_get_related_clusters_with_cache, + es_related_case_count, generate_docket_entries_csv_data, get_case_title, ) @@ -352,7 +356,6 @@ async def fetch_docket_entries(docket): async def view_docket( request: HttpRequest, pk: int, slug: str ) -> HttpResponse: - sort_order_asc = True form = DocketEntryFilterForm(request.GET, request=request) docket, context = await core_docket_data(request, pk) @@ -770,7 +773,9 @@ async def view_recap_authorities( @never_cache -async def view_opinion(request: HttpRequest, pk: int, _: str) -> HttpResponse: +async def view_opinion_old( + request: HttpRequest, pk: int, _: str +) -> HttpResponse: """Using the cluster ID, return the cluster of opinions. 
We also test if the cluster ID has a user note, and send data
@@ -855,7 +860,7 @@ async def view_opinion(request: HttpRequest, pk: int, _: str) -> HttpResponse:
 sponsored = True

 view_authorities_url = reverse(
- "view_authorities", args=[cluster.pk, cluster.slug]
+ "view_case_authorities", args=[cluster.pk, cluster.slug]
 )
 authorities_context: AuthoritiesContext = AuthoritiesContext(
 citation_record=cluster,
@@ -896,6 +901,151 @@ async def view_opinion(request: HttpRequest, pk: int, _: str) -> HttpResponse:
 )


+async def setup_opinion_context(
+ cluster: OpinionCluster, request: HttpRequest, tab: str
+):
+ """Generate the basic page information we need to load the page
+
+ :param cluster: The opinion cluster
+ :param request: The HTTP request from the user
+ :param tab: The tab to load
+ :return: The shared context dict for the opinion tabs
+ """
+ title = ", ".join(
+ [
+ s
+ for s in [
+ trunc(best_case_name(cluster), 100, ellipsis="..."),
+ await cluster.acitation_string(),
+ ]
+ if s.strip()
+ ]
+ )
+ has_downloads = False
+ pdf_path = None
+ if cluster.filepath_pdf_harvard:
+ has_downloads = True
+ pdf_path = cluster.filepath_pdf_harvard
+ else:
+ async for sub_opinion in cluster.sub_opinions.all():
+ if str(sub_opinion.local_path).endswith(".pdf"):
+ has_downloads = True
+ pdf_path = sub_opinion.local_path.url
+ break
+ elif sub_opinion.download_url:
+ has_downloads = True
+ pdf_path = sub_opinion.local_path.url
+
+ get_string = make_get_string(request)
+
+ sub_opinion_pks = [
+ str(pk)
+ async for pk in cluster.sub_opinions.values_list("pk", flat=True)
+ ]
+
+ es_has_cited_opinions = await es_cited_case_count(
+ cluster.id, sub_opinion_pks
+ )
+ es_has_related_opinions = await es_related_case_count(
+ cluster.id, sub_opinion_pks
+ )
+
+ try:
+ note = await Note.objects.aget(
+ cluster_id=cluster.pk,
+ user=await request.auser(), # type: ignore[attr-defined]
+ )
+ except (ObjectDoesNotExist, TypeError):
+ # No note or anonymous user
+ note_form = NoteForm(
+ initial={
+ "cluster_id": cluster.pk,
+ "name": trunc(best_case_name(cluster), 100, ellipsis="..."),
+ }
+ )
+ else:
+ note_form = NoteForm(instance=note)
+
+ # Identify opinions updated/added in partnership with v|lex for 3 years
+ sponsored = False
+ if (
+ cluster.date_created.date() > datetime.datetime(2022, 6, 1).date()
+ and cluster.filepath_json_harvard
+ ):
+ sponsored = True
+
+ context = {
+ "tab": tab,
+ "title": title,
+ "caption": await cluster.acaption(),
+ "cluster": cluster,
+ "has_downloads": has_downloads,
+ "pdf_path": pdf_path,
+ "note_form": note_form,
+ "get_string": get_string,
+ "private": cluster.blocked,
+ "sponsored": sponsored,
+ "summaries_count": await cluster.parentheticals.acount(),
+ "authorities_count": await cluster.aauthority_count(),
+ "related_cases_count": es_has_related_opinions,
+ "cited_by_count": es_has_cited_opinions,
+ }
+
+ return context
+
+
+async def render_opinion_view(
+ request: HttpRequest, pk: int, tab: str, additional_context: dict = None
+) -> HttpResponse:
+ """Helper function to render opinion views with common context.
+
+ :param request: The HttpRequest object
+ :param pk: The primary key for the OpinionCluster
+ :param tab: The tab name to display
+ :param additional_context: Any additional context to be passed to the template
+ :return: HttpResponse
+ """
+ cluster: OpinionCluster = await aget_object_or_404(OpinionCluster, pk=pk)
+
+ ui_flag_for_o = await sync_to_async(waffle.flag_is_active)(
+ request, "ui_flag_for_o"
+ )
+ user_flag_active = await sync_to_async(waffle.flag_is_active)(
+ request.user, "ui_flag_for_o"
+ )
+ if not any([ui_flag_for_o, user_flag_active]):
+ return await view_opinion_old(request, pk, "str")
+
+ context = await setup_opinion_context(cluster, request, tab=tab)
+
+ if additional_context:
+ context.update(additional_context)
+
+ # Just redirect if people attempt to URL hack to pages without content
+ tab_count_mapping = {
+ "pdf": "has_downloads",
+ "authorities": "authorities_count",
+ "cited-by": "cited_by_count",
+ "related-cases": "related_cases_count",
+ "summaries": "summaries_count",
+ }
+
+ # Check if the current tab needs a redirect based on the mapping
+ if context["tab"] in tab_count_mapping:
+ count_key = tab_count_mapping[context["tab"]]
+ if not context[count_key]:
+ return HttpResponseRedirect(
+ reverse("view_case", args=[cluster.pk, cluster.slug])
+ )
+
+ return TemplateResponse(
+ request,
+ "opinions.html",
+ context,
+ )
+
+
 async def view_summaries(
 request: HttpRequest, pk: int, slug: str
 ) -> HttpResponse:
@@ -948,6 +1098,164 @@ async def view_authorities(
 )


+async def check_flag_exists(flag_name: str) -> bool:
+ return await sync_to_async(
+ waffle.get_waffle_flag_model().objects.filter(name=flag_name).exists
+ )()
+
+
+@never_cache
+async def view_opinion(request: HttpRequest, pk: int, _: str) -> HttpResponse:
+ """View for displaying opinions."""
+
+ flag_exists = await check_flag_exists("ui_flag_for_o")
+ if flag_exists:
+ ui_flag_for_o = await sync_to_async(waffle.flag_is_active)(
+ request, "ui_flag_for_o"
+ )
+ user_flag_active = await sync_to_async(waffle.flag_is_active)(
+ request.user, "ui_flag_for_o"
+ )
+ if ui_flag_for_o or user_flag_active:
+ return await render_opinion_view(request, pk, "opinions")
+ return await view_opinion_old(request, pk, "str")
+
+
+async def view_opinion_details(
+ request: HttpRequest, pk: int, _: str
+) -> HttpResponse:
+ """View for displaying opinion case details."""
+
+ return await render_opinion_view(request, pk, "details")
+
+
+async def view_opinion_pdf(
+ request: HttpRequest, pk: int, _: str
+) -> HttpResponse:
+ """View for displaying the opinion PDF."""
+ return await render_opinion_view(request, pk, "pdf")
+
+
+async def view_opinion_authorities(
+ request: HttpRequest, pk: int, _: str
+) -> HttpResponse:
+ """View for displaying opinion authorities."""
+ cluster: OpinionCluster = await aget_object_or_404(OpinionCluster, pk=pk)
+
+ authorities_context: AuthoritiesContext = AuthoritiesContext(
+ citation_record=cluster,
+ query_string=request.META["QUERY_STRING"],
+ total_authorities_count=await cluster.aauthority_count(),
+ view_all_url="view_authorities_url",
+ doc_type="opinion",
+ )
+ await authorities_context.post_init()
+
+ additional_context = {
+ "authorities_context": authorities_context,
+ "authorities_with_data": await cluster.aauthorities_with_data(),
+ }
+ ui_flag_for_o = await sync_to_async(waffle.flag_is_active)(
+ request, "ui_flag_for_o"
+ )
+ user_flag_active = await
sync_to_async(waffle.flag_is_active)( + request.user, "ui_flag_for_o" + ) + + if ui_flag_for_o or user_flag_active: + return await render_opinion_view( + request, pk, "authorities", additional_context + ) + else: + # Old page to load for people outside the flag + return await view_authorities( + request=request, pk=pk, slug="authorities" + ) + + +async def view_opinion_cited_by( + request: HttpRequest, pk: int, _: str +) -> HttpResponse: + """""" + cluster: OpinionCluster = await aget_object_or_404(OpinionCluster, pk=pk) + + ( + citing_clusters, + citing_cluster_count, + _, + ) = await es_get_cited_clusters_with_cache(cluster, request) + additional_context = { + "citing_clusters": citing_clusters, + "citing_cluster_count": citing_cluster_count, + } + return await render_opinion_view( + request, pk, "cited-by", additional_context + ) + + +async def view_opinion_summaries( + request: HttpRequest, pk: int, _: str +) -> HttpResponse: + """""" + cluster: OpinionCluster = await aget_object_or_404(OpinionCluster, pk=pk) + parenthetical_groups_qs = await get_or_create_parenthetical_groups(cluster) + parenthetical_groups = [ + parenthetical_group + async for parenthetical_group in parenthetical_groups_qs.prefetch_related( + Prefetch( + "parentheticals", + queryset=Parenthetical.objects.order_by("-score"), + ), + "parentheticals__describing_opinion__cluster__citations", + "parentheticals__describing_opinion__cluster__docket__court", + "representative__describing_opinion__cluster__citations", + "representative__describing_opinion__cluster__docket__court", + ) + ] + ui_flag_for_o = await sync_to_async(waffle.flag_is_active)( + request, "ui_flag_for_o" + ) + user_flag_active = await sync_to_async(waffle.flag_is_active)( + request.user, "ui_flag_for_o" + ) + + if ui_flag_for_o or user_flag_active: + additional_context = { + "parenthetical_groups": parenthetical_groups, + "ui_flag_for_o": ui_flag_for_o, + "user_flag_active": user_flag_active, + } + return await render_opinion_view( + request, pk, "summaries", additional_context + ) + else: + # Old page to load for people outside the flag + return await view_summaries(request=request, pk=pk, slug="summaries") + + +async def view_opinion_related_cases( + request: HttpRequest, pk: int, _: str +) -> HttpResponse: + """""" + cluster: OpinionCluster = await aget_object_or_404(OpinionCluster, pk=pk) + related_cluster_object = await es_get_related_clusters_with_cache( + cluster, request + ) + additional_context = { + "related_algorithm": "mlt", + "related_clusters": related_cluster_object.related_clusters, + "sub_opinion_ids": related_cluster_object.sub_opinion_pks, + "related_search_params": f"&{urlencode(related_cluster_object.url_search_params)}", + "queries_timeout": related_cluster_object.timeout, + "has_related_cases": related_cluster_object.has_related_cases, + } + return await render_opinion_view( + request, pk, "related-cases", additional_context + ) + + async def cluster_visualizations( request: HttpRequest, pk: int, slug: str ) -> HttpResponse: From 08a4e8624cc012b0a7741bd1b227d472b1353ed6 Mon Sep 17 00:00:00 2001 From: William Palin Date: Thu, 17 Oct 2024 15:40:16 -0400 Subject: [PATCH 005/143] feat(tests): Update to tests Generally just override flags to avoid testing old view opinion page against the new ui changes. 
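For context, a minimal sketch (not taken from this patch) of how the waffle
override used throughout these tests behaves; the test class name, URL, and
assertions below are illustrative assumptions only:

    from django.test import TestCase
    from waffle.testutils import override_flag


    @override_flag("ui_flag_for_o", active=False)  # every test in the class sees the old page
    class OpinionPageFlagSketch(TestCase):
        def test_old_view_renders(self) -> None:
            # Hypothetical cluster URL; the real tests build it with reverse()
            response = self.client.get("/opinion/1/case-name/")
            self.assertEqual(response.status_code, 200)

        def test_new_view_renders(self) -> None:
            # The same helper also works as a context manager for one block
            with override_flag("ui_flag_for_o", active=True):
                response = self.client.get("/opinion/1/case-name/")
            self.assertEqual(response.status_code, 200)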
--- cl/favorites/tests.py | 3 +++ cl/opinion_page/tests.py | 3 +++ cl/search/tests/tests.py | 3 +++ cl/search/tests/tests_es_opinion.py | 2 ++ cl/tests/test_feeds.py | 3 +++ cl/tests/test_visualizations.py | 2 ++ 6 files changed, 16 insertions(+) diff --git a/cl/favorites/tests.py b/cl/favorites/tests.py index 61d549477b..bdde7f8393 100644 --- a/cl/favorites/tests.py +++ b/cl/favorites/tests.py @@ -11,6 +11,7 @@ from django.utils.timezone import now from selenium.webdriver.common.by import By from timeout_decorator import timeout_decorator +from waffle.testutils import override_flag from cl.favorites.factories import NoteFactory, PrayerFactory from cl.favorites.models import DocketTag, Note, Prayer, UserTag @@ -96,6 +97,7 @@ def setUp(self) -> None: super().setUp() @timeout_decorator.timeout(SELENIUM_TIMEOUT) + @override_flag("ui_flag_for_o", False) def test_anonymous_user_is_prompted_when_favoriting_an_opinion( self, ) -> None: @@ -156,6 +158,7 @@ def test_anonymous_user_is_prompted_when_favoriting_an_opinion( modal_title = self.browser.find_element(By.ID, "save-note-title") self.assertIn("Save Note", modal_title.text) + @override_flag("ui_flag_for_o", False) @timeout_decorator.timeout(SELENIUM_TIMEOUT) def test_logged_in_user_can_save_note(self) -> None: # Meta: assure no Faves even if part of fixtures diff --git a/cl/opinion_page/tests.py b/cl/opinion_page/tests.py index c77afc5ee9..59fc9038b6 100644 --- a/cl/opinion_page/tests.py +++ b/cl/opinion_page/tests.py @@ -19,6 +19,7 @@ from django.urls import reverse from django.utils.text import slugify from factory import RelatedFactory +from waffle.models import Flag from waffle.testutils import override_flag from cl.lib.models import THUMBNAIL_STATUSES @@ -111,6 +112,7 @@ async def test_simple_rd_page(self) -> None: self.assertEqual(response.status_code, HTTPStatus.OK) +@override_flag("ui_flag_for_o", False) class OpinionPageLoadTest( ESIndexTestCase, CourtTestCase, @@ -649,6 +651,7 @@ async def test_volume_pagination(self) -> None: self.assertEqual(volume_next, None) @override_flag("o-es-active", False) + @override_flag("ui_flag_for_o", False) def test_full_citation_redirect(self) -> None: """Do we get redirected to the correct URL when we pass in a full citation?""" diff --git a/cl/search/tests/tests.py b/cl/search/tests/tests.py index b8f85f719d..8fdd6fbd88 100644 --- a/cl/search/tests/tests.py +++ b/cl/search/tests/tests.py @@ -25,6 +25,7 @@ from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.wait import WebDriverWait from timeout_decorator import timeout_decorator +from waffle.testutils import override_flag from cl.audio.factories import AudioFactory from cl.lib.elasticsearch_utils import simplify_estimated_count @@ -1120,6 +1121,7 @@ def test_pagerank_calculation(self) -> None: ) +@override_flag("ui_flag_for_o", False) class OpinionSearchFunctionalTest(AudioTestCase, BaseSeleniumTest): """ Test some of the primary search functionality of CL: searching opinions. 
@@ -1260,6 +1262,7 @@ def test_search_and_facet_docket_numbers(self) -> None:
 for result in search_results.find_elements(By.TAG_NAME, "article"):
 self.assertIn("1337", result.text)

+ @override_flag("ui_flag_for_o", False)
 @timeout_decorator.timeout(SELENIUM_TIMEOUT)
 def test_opinion_search_result_detail_page(self) -> None:
 # Dora navigates to CL and does a simple wild card search
diff --git a/cl/search/tests/tests_es_opinion.py b/cl/search/tests/tests_es_opinion.py
index 60c72aa8d9..6a493aa478 100644
--- a/cl/search/tests/tests_es_opinion.py
+++ b/cl/search/tests/tests_es_opinion.py
@@ -19,6 +19,7 @@
 from elasticsearch_dsl import Q
 from factory import RelatedFactory
 from lxml import etree, html
+from waffle.models import Flag
 from waffle.testutils import override_flag

 from cl.custom_filters.templatetags.text_filters import html_decode
@@ -2247,6 +2248,7 @@ def test_uses_exact_version_for_case_name_field(self) -> None:
 cluster_2.delete()


+@override_flag("ui_flag_for_o", False)
 class RelatedSearchTest(
 ESIndexTestCase, CourtTestCase, PeopleTestCase, SearchTestCase, TestCase
 ):
diff --git a/cl/tests/test_feeds.py b/cl/tests/test_feeds.py
index a9fb9c8c7c..90bac42ae5 100644
--- a/cl/tests/test_feeds.py
+++ b/cl/tests/test_feeds.py
@@ -10,6 +10,7 @@
 from django.urls import reverse
 from selenium.webdriver.common.by import By
 from timeout_decorator import timeout_decorator
+from waffle.testutils import override_flag

 from cl.search.models import Court
 from cl.tests.base import SELENIUM_TIMEOUT, BaseSeleniumTest
@@ -28,6 +29,7 @@ class FeedsFunctionalTest(BaseSeleniumTest):
 "functest_audio.json",
 ]

+ @override_flag("ui_flag_for_o", False)
 @timeout_decorator.timeout(SELENIUM_TIMEOUT)
 def test_can_get_to_feeds_from_homepage(self) -> None:
 """Can we get to the feeds/podcasts page from the homepage?"""
@@ -49,6 +51,7 @@ def test_can_get_to_feeds_from_homepage(self) -> None:
 self.assert_text_in_node("Podcasts", "body")

 @timeout_decorator.timeout(SELENIUM_TIMEOUT)
+ @override_flag("ui_flag_for_o", False)
 def test_feeds_page_shows_jurisdiction_links(self) -> None:
 """
 Does the feeds page show all the proper links for each jurisdiction?
diff --git a/cl/tests/test_visualizations.py b/cl/tests/test_visualizations.py index 0e5acb46f7..a0962ede8c 100644 --- a/cl/tests/test_visualizations.py +++ b/cl/tests/test_visualizations.py @@ -5,6 +5,7 @@ from django.contrib.auth.hashers import make_password from selenium.webdriver.common.by import By from timeout_decorator import timeout_decorator +from waffle.testutils import override_flag from cl.tests.base import SELENIUM_TIMEOUT, BaseSeleniumTest from cl.users.factories import UserProfileWithParentsFactory @@ -30,6 +31,7 @@ def tearDown(self) -> None: SCOTUSMap.objects.all().delete() JSONVersion.objects.all().delete() + @override_flag("ui_flag_for_o", False) @timeout_decorator.timeout(SELENIUM_TIMEOUT) def test_creating_new_visualization(self) -> None: """Test if a user can create a new Visualization""" From bc92162addf4c5b532cbef32897de94d753c2646 Mon Sep 17 00:00:00 2001 From: William Palin Date: Fri, 18 Oct 2024 11:06:02 -0400 Subject: [PATCH 006/143] fix(tests): Fix tests Remove decorator for selenium tests unaffected And modify css to only affect scrolling on opinion page --- cl/assets/static-global/css/override.css | 7 ++++--- cl/assets/static-global/js/base.js | 8 ++++++++ cl/tests/test_feeds.py | 5 ++--- cl/tests/test_visualizations.py | 1 - 4 files changed, 14 insertions(+), 7 deletions(-) diff --git a/cl/assets/static-global/css/override.css b/cl/assets/static-global/css/override.css index 32c21672a1..822b799e09 100644 --- a/cl/assets/static-global/css/override.css +++ b/cl/assets/static-global/css/override.css @@ -2420,8 +2420,9 @@ div.footnote:first-of-type { display: none; } } -} -html { - scroll-behavior: smooth; } + +html.smooth-scroll { + scroll-behavior: smooth; +} \ No newline at end of file diff --git a/cl/assets/static-global/js/base.js b/cl/assets/static-global/js/base.js index 31713c0df5..149e42a7f8 100644 --- a/cl/assets/static-global/js/base.js +++ b/cl/assets/static-global/js/base.js @@ -367,6 +367,14 @@ if (form && button) { }); } + +////////////////////////////////// +// Smooth Scrolling on Opinions // +///////////////////////////////// +if (document.body.classList.contains('opinion-body')) { + document.documentElement.classList.add('smooth-scroll'); +} + ////////////////// // SCOTUS STYLE // ////////////////// diff --git a/cl/tests/test_feeds.py b/cl/tests/test_feeds.py index 90bac42ae5..7a67cd7e6d 100644 --- a/cl/tests/test_feeds.py +++ b/cl/tests/test_feeds.py @@ -29,7 +29,6 @@ class FeedsFunctionalTest(BaseSeleniumTest): "functest_audio.json", ] - @override_flag("ui_flag_for_o", False) @timeout_decorator.timeout(SELENIUM_TIMEOUT) def test_can_get_to_feeds_from_homepage(self) -> None: """Can we get to the feeds/podcasts page from the homepage?""" @@ -51,7 +50,6 @@ def test_can_get_to_feeds_from_homepage(self) -> None: self.assert_text_in_node("Podcasts", "body") @timeout_decorator.timeout(SELENIUM_TIMEOUT) - @override_flag("ui_flag_for_o", False) def test_feeds_page_shows_jurisdiction_links(self) -> None: """ Does the feeds page show all the proper links for each jurisdiction? 
@@ -67,7 +65,8 @@ def test_feeds_page_shows_jurisdiction_links(self) -> None:
 link.get_attribute("href"),
 f"{self.live_server_url}/feed/court/{court.pk}/",
 )
- link.click()
+ with self.wait_for_page_load(timeout=10):
+ link.click()
 print("clicked...", end=" ")
 self.assertIn(
 'feed xml:lang="en-us" xmlns="http://www.w3.org/2005/Atom"',
diff --git a/cl/tests/test_visualizations.py b/cl/tests/test_visualizations.py
index a0962ede8c..d6760944d4 100644
--- a/cl/tests/test_visualizations.py
+++ b/cl/tests/test_visualizations.py
@@ -31,7 +31,6 @@ def tearDown(self) -> None:
 SCOTUSMap.objects.all().delete()
 JSONVersion.objects.all().delete()

- @override_flag("ui_flag_for_o", False)
 @timeout_decorator.timeout(SELENIUM_TIMEOUT)
 def test_creating_new_visualization(self) -> None:
 """Test if a user can create a new Visualization"""

From be333b364225bf4524a6c51f25344a6100b8079b Mon Sep 17 00:00:00 2001
From: William Palin
Date: Fri, 18 Oct 2024 11:11:16 -0400
Subject: [PATCH 007/143] fix(opinion_page): Remove comments and fix lint

Remove print statement and fix return for bot or scraping detection
---
 cl/opinion_page/utils.py | 17 ++++++-----------
 cl/opinion_page/views.py | 6 +++---
 2 files changed, 9 insertions(+), 14 deletions(-)

diff --git a/cl/opinion_page/utils.py b/cl/opinion_page/utils.py
index b8d5e581dc..3bd0eb6144 100644
--- a/cl/opinion_page/utils.py
+++ b/cl/opinion_page/utils.py
@@ -3,7 +3,7 @@
 import traceback
 from dataclasses import dataclass, field
 from io import StringIO
-from typing import Dict, Tuple, Union
+from typing import Dict, List, Tuple, Union

 from asgiref.sync import sync_to_async
 from django.conf import settings
@@ -334,14 +334,9 @@ async def es_get_related_clusters_with_cache(
 )
 related_cluster_result.timeout = False
 related_cluster_result.sub_opinion_pks = list(map(int, sub_opinion_pks))
 related_cluster_result.has_related_cases = True if response else False

 if timeout_related == False:
- # print("SETTING", (
- # related_cluster_result.related_clusters,
- # timeout_related,
- # related_cluster_result.has_related_cases,
- # ))
 await cache.aset(
 mlt_cache_key,
- (results.related_clusters, timeout_related),
+ (related_cluster_result.related_clusters, timeout_related),
 settings.RELATED_CACHE_TIMEOUT,
 )

@@ -375,7 +370,7 @@ async def es_get_cited_clusters_with_cache(
 async for pk in cluster.sub_opinions.values_list("pk", flat=True)
 ]
 if is_bot(request) or not sub_opinion_pks:
- return related_cluster_result
+ return (None, False, False)

 cached_citing_results, cached_citing_clusters_count, timeout_cited = (
 await cache.aget(cache_citing_key) or (None, False, False)
@@ -402,7 +397,7 @@ async def es_get_cited_clusters_with_cache(
 logger.warning("Error getting cited and related clusters: %s", e)
 if settings.DEBUG is True:
 traceback.print_exc()
- return related_cluster_result
+ return (None, False, False)
 except ConnectionTimeout as e:
 logger.warning(
 "ConnectionTimeout getting cited and related clusters: %s", e
@@ -561,7 +556,7 @@ async def es_get_citing_and_related_clusters_with_cache(
 return results


-async def es_cited_case_count(cluster_id, sub_opinion_pks: [int]):
+async def es_cited_case_count(cluster_id: int, sub_opinion_pks: List[str]):
 """Elastic quick cited by count query

 :param cluster_id: The cluster id to search with
@@ -594,7 +589,7 @@ async def es_cited_case_count(cluster_id, sub_opinion_pks: [int]):
 return cited_by_count


-async def es_related_case_count(cluster_id, sub_opinion_pks: [int]):
+async def es_related_case_count(cluster_id, sub_opinion_pks: List[str]):
 """Elastic quick related cases count

 :param cluster_id: The cluster id of the object
 :param sub_opinion_pks: The sub opinion ids of the cluster
diff --git
a/cl/opinion_page/views.py b/cl/opinion_page/views.py index e3f774945a..fe7e93bc33 100644 --- a/cl/opinion_page/views.py +++ b/cl/opinion_page/views.py @@ -996,14 +996,14 @@ async def setup_opinion_context( async def render_opinion_view( - request: HttpRequest, pk: int, tab: str, additional_context: dict = None + request: HttpRequest, pk: int, tab: str, additional_context: dict = {} ) -> HttpResponse: """Helper function to render opinion views with common context. :param request: The HttpRequest object :param pk: The primary key for the OpinionCluster - :param tab: The tab name to display - :param additional_context: Any additional context to be passed to the template + :param tab: The selected tab + :param additional_context: Any additional context to be passed to the view :return: HttpResponse """ cluster: OpinionCluster = await aget_object_or_404(OpinionCluster, pk=pk) From 5b0cf27610ebd8607cc08a0526f47e4722a56bf4 Mon Sep 17 00:00:00 2001 From: William Palin Date: Fri, 18 Oct 2024 14:04:48 -0400 Subject: [PATCH 008/143] feat(printing): Prettify Printing Hide unwanted content during printing --- cl/opinion_page/templates/includes/add_download_button.html | 2 +- cl/opinion_page/templates/opinions.html | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cl/opinion_page/templates/includes/add_download_button.html b/cl/opinion_page/templates/includes/add_download_button.html index 1d7a4d828e..a4844bc075 100644 --- a/cl/opinion_page/templates/includes/add_download_button.html +++ b/cl/opinion_page/templates/includes/add_download_button.html @@ -1,4 +1,4 @@ -
+
+
+ +
\ No newline at end of file diff --git a/cl/opinion_page/templates/opinions.html b/cl/opinion_page/templates/opinions.html index bf2cf23ebc..320dbb40d9 100644 --- a/cl/opinion_page/templates/opinions.html +++ b/cl/opinion_page/templates/opinions.html @@ -61,64 +61,58 @@

Admin

{% endif %} - {% if cluster.sub_opinions.all.first.extracted_by_ocr or "U" in cluster.source and tab == "opinions" %}
@@ -196,30 +190,31 @@

+
{{ cluster.date_filed }} +
+ + {% if pdf_path %} + {% include "includes/add_download_button.html" %} + {% endif %} {% include "includes/add_note_button.html" with form_instance_id=note_form.instance.cluster_id %} - - {% if pdf_path %} - {% include "includes/add_download_button.html" %} - {% endif %} - - - +
+

{{ cluster.docket.court }}

diff --git a/cl/opinion_page/utils.py b/cl/opinion_page/utils.py index d199bb395c..9fc779b37f 100644 --- a/cl/opinion_page/utils.py +++ b/cl/opinion_page/utils.py @@ -330,7 +330,6 @@ async def es_get_related_clusters_with_cache( ) related_cluster_result.timeout = False related_cluster_result.sub_opinion_pks = list(map(int, sub_opinion_pks)) - # related_cluster_result.has_related_cases = True if response else False if timeout_related == False: await cache.aset( diff --git a/cl/search/models.py b/cl/search/models.py index b7c4d808b4..94275afdbe 100644 --- a/cl/search/models.py +++ b/cl/search/models.py @@ -16,6 +16,7 @@ from django.urls import NoReverseMatch, reverse from django.utils import timezone from django.utils.encoding import force_str +from django.utils.functional import cached_property from django.utils.text import slugify from eyecite import get_citations from eyecite.tokenizers import HyperscanTokenizer From abaa31a1a7680ac9ac3dc6a2818182f6e0c69fe7 Mon Sep 17 00:00:00 2001 From: William Palin Date: Fri, 22 Nov 2024 13:47:53 -0500 Subject: [PATCH 082/143] fix(search.models): Update aauthorities with data Add prefetch related objects along with authorities data query --- cl/search/models.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/cl/search/models.py b/cl/search/models.py index 94275afdbe..1b0197bdd2 100644 --- a/cl/search/models.py +++ b/cl/search/models.py @@ -2989,7 +2989,13 @@ async def aauthorities_with_data(self): The returned list is sorted by that citation count field. """ authorities_with_data = [] - async for authority in await self.aauthorities(): + authorities_base = await self.aauthorities() + authorities_qs = ( + authorities_base.prefetch_related("citations") + .select_related("docket__court") + .order_by("-citation_count", "-date_filed") + ) + async for authority in authorities_qs: authority.citation_depth = ( await get_citation_depth_between_clusters( citing_cluster_pk=self.pk, cited_cluster_pk=authority.pk From fe1ee4f5c5d4103ef58f70b5fe2a83d6a3780c01 Mon Sep 17 00:00:00 2001 From: William Palin Date: Fri, 22 Nov 2024 13:51:59 -0500 Subject: [PATCH 083/143] fix(search.models): Update acaption Remove extra cluster query Make docket and court async --- cl/search/models.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cl/search/models.py b/cl/search/models.py index 1b0197bdd2..ab875f7661 100644 --- a/cl/search/models.py +++ b/cl/search/models.py @@ -2799,9 +2799,8 @@ async def acaption(self): else: caption += f", {citations[0]}" - cluster = await OpinionCluster.objects.aget(pk=self.pk) - docket = await Docket.objects.aget(id=cluster.docket_id) - court = await Court.objects.aget(pk=docket.court_id) + docket = await sync_to_async(lambda: self.docket)() + court = await sync_to_async(lambda: docket.court)() if docket.court_id != "scotus": court = re.sub(" ", " ", court.citation_string) # Strftime fails before 1900. Do it this way instead. 
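The sync_to_async pattern in the acaption change above deserves a note. A
minimal sketch (assuming only a cluster object with docket and court foreign
keys, as in the models shown here) of why the lambda wrapping replaces the
per-object .aget() calls:

    from asgiref.sync import sync_to_async

    async def court_citation_string(cluster) -> str:
        # Touching a lazy foreign key directly in async code raises
        # SynchronousOnlyOperation, and re-fetching each object with
        # .aget() costs one extra query per hop. Wrapping the attribute
        # access runs the lookup in a worker thread and keeps Django's
        # per-instance FK cache warm for later accesses.
        docket = await sync_to_async(lambda: cluster.docket)()
        court = await sync_to_async(lambda: docket.court)()
        return court.citation_string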
From 70f562524fd6e8dc8441004b5601a0861eb7a33a Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Fri, 22 Nov 2024 15:00:58 -0400 Subject: [PATCH 084/143] feat(alerts): Refines logic to re-run aux queries --- cl/lib/elasticsearch_utils.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/cl/lib/elasticsearch_utils.py b/cl/lib/elasticsearch_utils.py index 4c561b0166..92acd4ca61 100644 --- a/cl/lib/elasticsearch_utils.py +++ b/cl/lib/elasticsearch_utils.py @@ -3217,14 +3217,20 @@ def do_es_sweep_alert_query( if parent_query: parent_search = search_query.query(parent_query) # Ensure accurate tracking of total hit count for up to 10,001 query results - parent_search = parent_search.extra(from_=0, track_total_hits=10_001) + parent_search = parent_search.extra( + from_=0, + track_total_hits=settings.ELASTICSEARCH_MAX_RESULT_COUNT + 1, + ) parent_search = parent_search.source(includes=["docket_id"]) multi_search = multi_search.add(parent_search) if child_query: child_search = child_search_query.query(child_query) # Ensure accurate tracking of total hit count for up to 10,001 query results - child_search = child_search.extra(from_=0, track_total_hits=10_001) + child_search = child_search.extra( + from_=0, + track_total_hits=settings.ELASTICSEARCH_MAX_RESULT_COUNT + 1, + ) child_search = child_search.source(includes=["id"]) multi_search = multi_search.add(child_search) @@ -3240,7 +3246,9 @@ def do_es_sweep_alert_query( # Re-run parent query to fetch potentially missed docket IDs due to large # result sets. should_repeat_parent_query = ( - docket_results and docket_results.hits.total.value >= 10_000 + docket_results + and docket_results.hits.total.value + >= settings.ELASTICSEARCH_MAX_RESULT_COUNT ) if should_repeat_parent_query: docket_ids = [int(d.docket_id) for d in main_results] @@ -3259,18 +3267,20 @@ def do_es_sweep_alert_query( # from the main results and refines the child query filter with these IDs. # Finally, it re-executes the child search. 
should_repeat_child_query = ( - rd_results and rd_results.hits.total.value >= 10_000 + rd_results + and rd_results.hits.total.value + >= settings.ELASTICSEARCH_MAX_RESULT_COUNT ) if should_repeat_child_query: rd_ids = [ - int(rd.to_dict()["id"]) + int(rd["_source"]["id"]) for docket in main_results if hasattr(docket, "child_docs") for rd in docket.child_docs ] child_query.filter.append(Q("terms", id=rd_ids)) child_search = child_search_query.query(child_query) - child_search = child_search.source(includes=["docket_id"]) + child_search = child_search.source(includes=["id"]) rd_results = child_search.execute() return main_results, docket_results, rd_results From 3b60523ce90c06dc623ad49e4bc081e980c37beb Mon Sep 17 00:00:00 2001 From: William Palin Date: Fri, 22 Nov 2024 14:35:50 -0500 Subject: [PATCH 085/143] fix(search.models): Optimize opinion views Optimize opinion view rendering by removing redundant cluster query --- cl/opinion_page/views.py | 84 +++++++++++++++++++++++++++------------- 1 file changed, 58 insertions(+), 26 deletions(-) diff --git a/cl/opinion_page/views.py b/cl/opinion_page/views.py index 6e266bf362..5b059bb0c5 100644 --- a/cl/opinion_page/views.py +++ b/cl/opinion_page/views.py @@ -10,7 +10,7 @@ from django.contrib import messages from django.core.exceptions import ObjectDoesNotExist, PermissionDenied from django.core.paginator import EmptyPage, PageNotAnInteger, Paginator -from django.db.models import IntegerField, Prefetch +from django.db.models import IntegerField, Prefetch, QuerySet from django.db.models.functions import Cast from django.http import HttpRequest, HttpResponseRedirect from django.http.response import ( @@ -994,8 +994,17 @@ async def setup_opinion_context( return context +async def get_opinions_base_queryset() -> QuerySet: + return OpinionCluster.objects.prefetch_related( + "sub_opinions__opinions_cited", "citations" + ).select_related("docket__court") + + async def render_opinion_view( - request: HttpRequest, pk: int, tab: str, additional_context: dict = {} + request: HttpRequest, + cluster: OpinionCluster, + tab: str, + additional_context: dict = {}, ) -> HttpResponse: """Helper function to render opinion views with common context. 
@@ -1005,15 +1014,15 @@ async def render_opinion_view( :param additional_context: Any additional context to be passed to the view :return: HttpResponse """ - queryset = OpinionCluster.objects.prefetch_related("sub_opinions") - cluster: OpinionCluster = await aget_object_or_404(queryset, pk=pk) - ui_flag_for_o = await sync_to_async(waffle.flag_is_active)( request, "ui_flag_for_o" ) if not ui_flag_for_o: return await view_opinion_old(request, pk, "str") + if not any([ui_flag_for_o]): + return await view_opinion_old(request, cluster.pk, "str") + context = await setup_opinion_context(cluster, request, tab=tab) if additional_context: @@ -1107,9 +1116,13 @@ async def view_opinion(request: HttpRequest, pk: int, _: str) -> HttpResponse: ui_flag_for_o = await sync_to_async(waffle.flag_is_active)( request, "ui_flag_for_o" ) - if ui_flag_for_o: - return await render_opinion_view(request, pk, "opinions") - return await view_opinion_old(request, pk, "str") + if not ui_flag_for_o: + return await view_opinion_old(request, pk, "str") + + cluster: OpinionCluster = await aget_object_or_404( + await get_opinions_base_queryset(), pk=pk + ) + return await render_opinion_view(request, cluster, "opinions") async def view_opinion_pdf( @@ -1122,7 +1135,10 @@ async def view_opinion_pdf( :param _: url slug :return: Opinion PDF tab """ - return await render_opinion_view(request, pk, "pdf") + cluster: OpinionCluster = await aget_object_or_404( + await get_opinions_base_queryset(), pk=pk + ) + return await render_opinion_view(request, cluster, "pdf") async def view_opinion_authorities( @@ -1135,22 +1151,25 @@ async def view_opinion_authorities( :param _: url slug :return: Table of Authorities tab """ - cluster: OpinionCluster = await aget_object_or_404(OpinionCluster, pk=pk) - - additional_context = { - "authorities_with_data": await cluster.aauthorities_with_data(), - } - ui_flag_for_o = await sync_to_async(waffle.flag_is_active)( request, "ui_flag_for_o" ) - if ui_flag_for_o: - return await render_opinion_view( - request, pk, "authorities", additional_context + if not ui_flag_for_o: + # Old page to load for people outside the flag + return await view_authorities( + request=request, pk=pk, slug="authorities" ) - # Old page to load for people outside the flag - return await view_authorities(request=request, pk=pk, slug="authorities") + cluster: OpinionCluster = await aget_object_or_404( + await get_opinions_base_queryset(), pk=pk + ) + + additional_context = { + "authorities_with_data": await cluster.aauthorities_with_data(), + } + return await render_opinion_view( + request, cluster, "authorities", additional_context + ) async def view_opinion_cited_by( @@ -1163,14 +1182,16 @@ async def view_opinion_cited_by( :param _: url slug :return: Cited By tab """ - cluster: OpinionCluster = await aget_object_or_404(OpinionCluster, pk=pk) + cluster: OpinionCluster = await aget_object_or_404( + await get_opinions_base_queryset(), pk=pk + ) cited_query = await es_get_cited_clusters_with_cache(cluster, request) additional_context = { "citing_clusters": cited_query.citing_clusters, "citing_cluster_count": cited_query.citing_cluster_count, } return await render_opinion_view( - request, pk, "cited-by", additional_context + request, cluster, "cited-by", additional_context ) @@ -1184,7 +1205,16 @@ async def view_opinion_summaries( :param _: url slug :return: Summaries tab """ - cluster: OpinionCluster = await aget_object_or_404(OpinionCluster, pk=pk) + ui_flag_for_o = await sync_to_async(waffle.flag_is_active)( + request, 
"ui_flag_for_o" + ) + if not ui_flag_for_o: + # Old page to load for people outside the flag + return await view_summaries(request=request, pk=pk, slug="summaries") + + cluster: OpinionCluster = await aget_object_or_404( + await get_opinions_base_queryset(), pk=pk + ) parenthetical_groups_qs = await get_or_create_parenthetical_groups(cluster) parenthetical_groups = [ parenthetical_group @@ -1210,7 +1240,7 @@ async def view_opinion_summaries( "ui_flag_for_o": ui_flag_for_o, } return await render_opinion_view( - request, pk, "summaries", additional_context + request, cluster, "summaries", additional_context ) @@ -1224,7 +1254,9 @@ async def view_opinion_related_cases( :param _: url slug :return: Related Cases tab """ - cluster: OpinionCluster = await aget_object_or_404(OpinionCluster, pk=pk) + cluster: OpinionCluster = await aget_object_or_404( + await get_opinions_base_queryset(), pk=pk + ) related_cluster_object = await es_get_related_clusters_with_cache( cluster, request ) @@ -1236,7 +1268,7 @@ async def view_opinion_related_cases( "queries_timeout": related_cluster_object.timeout, } return await render_opinion_view( - request, pk, "related-cases", additional_context + request, cluster, "related-cases", additional_context ) From ebfbcb9ba9543de22e81ed30bbf6859522a3d4a4 Mon Sep 17 00:00:00 2001 From: William Palin Date: Fri, 22 Nov 2024 16:15:41 -0500 Subject: [PATCH 086/143] fix(opinion-page.views): Remove extra code --- cl/opinion_page/views.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/cl/opinion_page/views.py b/cl/opinion_page/views.py index 5b059bb0c5..df3e7fdbd1 100644 --- a/cl/opinion_page/views.py +++ b/cl/opinion_page/views.py @@ -1017,8 +1017,6 @@ async def render_opinion_view( ui_flag_for_o = await sync_to_async(waffle.flag_is_active)( request, "ui_flag_for_o" ) - if not ui_flag_for_o: - return await view_opinion_old(request, pk, "str") if not any([ui_flag_for_o]): return await view_opinion_old(request, cluster.pk, "str") From b580beacc9f928104a89b85f52ff3e09898584fa Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Fri, 22 Nov 2024 15:31:54 -0600 Subject: [PATCH 087/143] fix(elasticsearch): Improved set_results_child_docs helper method --- cl/lib/elasticsearch_utils.py | 40 +++++++++++++---------------------- 1 file changed, 15 insertions(+), 25 deletions(-) diff --git a/cl/lib/elasticsearch_utils.py b/cl/lib/elasticsearch_utils.py index a3c2019095..92594a0eb1 100644 --- a/cl/lib/elasticsearch_utils.py +++ b/cl/lib/elasticsearch_utils.py @@ -3289,33 +3289,23 @@ def set_results_child_docs( """ for result in results: - child_result_objects = [] - child_docs = None - - # Get child_docs based on result type - if isinstance(result, dict): - child_docs = result.get("child_docs") - elif hasattr(result, "child_docs"): - child_docs = result.child_docs - - # Process child documents if they exist - if child_docs: - for child_doc in child_docs: - if isinstance(result, dict): - child_result_objects.append(child_doc) - else: - child_result_objects.append( - defaultdict( - lambda: None, - child_doc["_source"].to_dict(), - ) - ) - - # Set processed child docs back to result - result["child_docs"] = child_result_objects + result_is_dict = isinstance(result, dict) + if result_is_dict: + # If the result is a dictionary, do nothing, or assign [] to + # child_docs if it is not present. + child_docs = result.get("child_docs", []) + result["child_docs"] = child_docs + else: + # Process child hits if the result is an ES AttrDict instance, + # so they can be properly serialized. 
+ child_docs = getattr(result, "child_docs", []) + result["child_docs"] = [ + defaultdict(lambda: None, doc["_source"].to_dict()) + for doc in child_docs + ] # Optionally merges highlights. Used for integrating percolator # highlights into the percolated document. - if merge_highlights and isinstance(result, dict): + if merge_highlights and result_is_dict: meta_hl = result.get("meta", {}).get("highlight", {}) merge_highlights_into_result(meta_hl, result) From c05c40a4061f5d360daec4f11abe078a247501ab Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Fri, 22 Nov 2024 16:14:26 -0600 Subject: [PATCH 088/143] feat(casenames): refactor code to parse and add citations --- .../commands/update_casenames_wl_dataset.py | 101 +++++++++++------- 1 file changed, 65 insertions(+), 36 deletions(-) diff --git a/cl/corpus_importer/management/commands/update_casenames_wl_dataset.py b/cl/corpus_importer/management/commands/update_casenames_wl_dataset.py index 49c51b1582..8f203ad8ba 100644 --- a/cl/corpus_importer/management/commands/update_casenames_wl_dataset.py +++ b/cl/corpus_importer/management/commands/update_casenames_wl_dataset.py @@ -5,11 +5,13 @@ import pandas as pd from django.core.management.base import BaseCommand, CommandError +from django.db import transaction from django.db.models import Q, QuerySet from eyecite import get_citations from eyecite.models import FullCaseCitation from eyecite.tokenizers import HyperscanTokenizer +from cl.citations.utils import map_reporter_db_cite_type from cl.corpus_importer.utils import add_citations_to_cluster from cl.search.models import Citation, OpinionCluster @@ -127,25 +129,34 @@ def parse_citations(citation_strings: list[str]) -> list[dict]: for cite_str in citation_strings: # Get citations from the string found_cites = get_citations(cite_str, tokenizer=HYPERSCAN_TOKENIZER) + if not found_cites: + continue + + citation = found_cites[0] # Ensure we have valid citations to process - for citation in found_cites: - if isinstance(citation, FullCaseCitation): - volume = citation.groups.get("volume") - - # Validate the volume - if volume and volume.isdigit(): - # Append the validated citation as a dictionary - validated_citations.append( - { - "volume": citation.groups["volume"], - "reporter": citation.corrected_reporter(), - "page": citation.groups["page"], - } - ) - else: - # If volume is invalid, skip this citation - continue + if isinstance(citation, FullCaseCitation): + volume = citation.groups.get("volume") + + # Validate the volume + if not volume or not volume.isdigit(): + continue + + if not citation.corrected_reporter(): + reporter_type = Citation.STATE + else: + cite_type_str = citation.all_editions[0].reporter.cite_type + reporter_type = map_reporter_db_cite_type(cite_type_str) + + # Append the validated citation as a dictionary + validated_citations.append( + { + "volume": citation.groups["volume"], + "reporter": citation.corrected_reporter(), + "page": citation.groups["page"], + "type": reporter_type, + } + ) return validated_citations @@ -305,28 +316,46 @@ def process_csv(filepath: str, delay: float, dry_run: bool) -> None: # Dry run, don't save anything continue - # Update case names - cluster_updated, docket_updated = update_matched_case_name( - matches[0].cluster, west_case_name - ) - - if cluster_updated: - total_clusters_updated = +1 + with transaction.atomic(): + matched_cluster = matches[0].cluster - if docket_updated: - total_dockets_updated = +1 + # Update case names + cluster_updated, docket_updated = update_matched_case_name( + 
matched_cluster, west_case_name
+ )
+
+ if cluster_updated:
+ total_clusters_updated += 1
+
+ if docket_updated:
+ total_dockets_updated += 1
+
+ # Add any of the citations if possible
+ for citation in valid_citations:
+ if Citation.objects.filter(
+ cluster_id=matched_cluster.id,
+ reporter=citation.get("reporter"),
+ ).exists():
+ # Avoid adding a citation if we already have a citation from the
+ # citation's reporter.
+ logger.info(
+ f"Can't add: {citation.get('volume')} {citation.get('reporter')} {citation.get('page')} to cluster id: {matched_cluster.id}. There is already "
+ f"a citation from that reporter."
+ )
+ continue
+ citation["cluster_id"] = matched_cluster.id
+ Citation.objects.get_or_create(**citation)
+
+ add_citations_to_cluster(
+ [
+ f"{cite.get('volume')} {cite.get('reporter')} {cite.get('page')}"
+ for cite in valid_citations
+ ],
+ matches[0].cluster_id,
+ )

- # Wait between each processed row to avoid sending to many indexing tasks
- time.sleep(delay)
+ # Wait between each processed row to avoid sending too many indexing tasks
+ time.sleep(delay)

 if not dry_run:
 logger.info(f"Clusters updated: {total_clusters_updated}")

From a6b05f14b60acdf69649100dac0861e432feea5d Mon Sep 17 00:00:00 2001
From: Alberto Islas
Date: Fri, 22 Nov 2024 16:59:26 -0600
Subject: [PATCH 089/143] fix(elasticsearch): Solved merge conflicts

- Removed score from Opinion Search Alert webhooks
---
 .../management/commands/cl_send_alerts.py | 4 ++--
 cl/alerts/tests/tests.py | 18 ++++++++++++++++++
 cl/api/tasks.py | 4 ++--
 cl/api/webhooks.py | 4 ++--
 cl/lib/elasticsearch_utils.py | 12 ++++++++++--
 cl/search/api_serializers.py | 16 +++++++++++++++-
 cl/search/api_utils.py | 7 ++-----
 cl/tests/cases.py | 5 +++--
 8 files changed, 54 insertions(+), 16 deletions(-)

diff --git a/cl/alerts/management/commands/cl_send_alerts.py b/cl/alerts/management/commands/cl_send_alerts.py
index 19eeb511fd..4792e78fd2 100644
--- a/cl/alerts/management/commands/cl_send_alerts.py
+++ b/cl/alerts/management/commands/cl_send_alerts.py
@@ -26,7 +26,7 @@
 from cl.lib.elasticsearch_utils import (
 do_es_api_query,
 limit_inner_hits,
- set_results_child_docs,
+ set_child_docs_and_score,
 set_results_highlights,
 )
 from cl.lib.scorched_utils import ExtraSolrInterface
@@ -161,7 +161,7 @@ def query_alerts_es(
 results = responses[0]
 limit_inner_hits({}, results, cd["type"])
 set_results_highlights(results, cd["type"])
- set_results_child_docs(results)
+ set_child_docs_and_score(results)
 if v1_webhook:
 v1_results = responses[1]
 return results, v1_results
diff --git a/cl/alerts/tests/tests.py b/cl/alerts/tests/tests.py
index b048ed9507..990658c6a3 100644
--- a/cl/alerts/tests/tests.py
+++ b/cl/alerts/tests/tests.py
@@ -963,6 +963,24 @@ def test_send_search_alert_webhooks(self):
 "opinions",
 )

+ # Assert HL content in V2 webhooks.
+ self._assert_webhook_hit_hl( + webhook_events, + self.search_alert.name, + "caseName", + "California vs Lorem", + child_field=False, + nested_field="opinions", + ) + self._assert_webhook_hit_hl( + webhook_events, + self.search_alert.name, + "snippet", + "Lorem dolor california sit amet, consectetur adipiscing elit.", + child_field=True, + nested_field="opinions", + ) + # Assert V1 Opinion Search Alerts Webhook self._count_webhook_hits_and_child_hits( list(webhook_events), diff --git a/cl/api/tasks.py b/cl/api/tasks.py index c236367351..39c5fe7533 100644 --- a/cl/api/tasks.py +++ b/cl/api/tasks.py @@ -11,7 +11,7 @@ from cl.api.webhooks import send_webhook_event from cl.celery_init import app from cl.corpus_importer.api_serializers import DocketEntrySerializer -from cl.lib.elasticsearch_utils import set_results_child_docs +from cl.lib.elasticsearch_utils import set_child_docs_and_score from cl.search.api_serializers import ( RECAPESWebhookResultSerializer, V3OAESResultSerializer, @@ -152,7 +152,7 @@ def send_search_alert_webhook_es( es_results, many=True ).data case SEARCH_TYPES.RECAP: - set_results_child_docs(results, merge_highlights=True) + set_child_docs_and_score(results, merge_highlights=True) serialized_results = RECAPESWebhookResultSerializer( results, many=True ).data diff --git a/cl/api/webhooks.py b/cl/api/webhooks.py index b46c106b86..15f1d3cabf 100644 --- a/cl/api/webhooks.py +++ b/cl/api/webhooks.py @@ -28,7 +28,7 @@ from cl.recap.api_serializers import PacerFetchQueueSerializer from cl.recap.models import PROCESSING_STATUS, PacerFetchQueue from cl.search.api_serializers import ( - OpinionClusterESResultSerializer, + OpinionClusterWebhookResultSerializer, SearchResultSerializer, V3OpinionESResultSerializer, ) @@ -205,7 +205,7 @@ def send_search_alert_webhook( many=True, ).data case WebhookVersions.v2: - serialized_results = OpinionClusterESResultSerializer( + serialized_results = OpinionClusterWebhookResultSerializer( results, many=True, ).data diff --git a/cl/lib/elasticsearch_utils.py b/cl/lib/elasticsearch_utils.py index 7b7cd6fc7e..119f78a754 100644 --- a/cl/lib/elasticsearch_utils.py +++ b/cl/lib/elasticsearch_utils.py @@ -3316,8 +3316,10 @@ def simplify_estimated_count(search_count: int) -> int: return search_count -def set_results_child_docs( - results: list[Hit] | list[dict[str, Any]], merge_highlights: bool = False +def set_child_docs_and_score( + results: list[Hit] | list[dict[str, Any]] | Response, + merge_highlights: bool = False, + merge_score: bool = False, ) -> None: """Process and attach child documents to the main search results. @@ -3325,6 +3327,8 @@ def set_results_child_docs( or a list of dicts. :param merge_highlights: A boolean indicating whether to merge highlight data into the results. + :param merge_score: A boolean indicating whether to merge + the BM25 score into the results. :return: None. Results are modified in place. """ @@ -3349,3 +3353,7 @@ def set_results_child_docs( if merge_highlights and result_is_dict: meta_hl = result.get("meta", {}).get("highlight", {}) merge_highlights_into_result(meta_hl, result) + + # Optionally merges the BM25 score for display in the API. 
+ if merge_score and isinstance(result, Response): + result["bm25_score"] = result.meta.score diff --git a/cl/search/api_serializers.py b/cl/search/api_serializers.py index f27053e95d..31752a79af 100644 --- a/cl/search/api_serializers.py +++ b/cl/search/api_serializers.py @@ -619,7 +619,7 @@ class Meta: ) -class OpinionClusterESResultSerializer(MainMetaMixin, DocumentSerializer): +class OpinionClusterBaseESResultSerializer(DocumentSerializer): """The serializer for OpinionCluster Search results.""" opinions = OpinionDocumentESResultSerializer( @@ -649,6 +649,20 @@ class Meta: ) +class OpinionClusterESResultSerializer( + OpinionClusterBaseESResultSerializer, MainMetaMixin +): + """The serializer for OpinionCluster Search results.""" + + +class OpinionClusterWebhookResultSerializer( + OpinionClusterBaseESResultSerializer +): + """The serializer class for OpinionCluster search Webhooks results.""" + + meta = BaseMetaDataSerializer(source="*", read_only=True) + + class PositionESResultSerializer(ChildMetaMixin, DocumentSerializer): """The serializer for Positions Search results.""" diff --git a/cl/search/api_utils.py b/cl/search/api_utils.py index f7f9585148..f5c22e388a 100644 --- a/cl/search/api_utils.py +++ b/cl/search/api_utils.py @@ -19,7 +19,7 @@ do_es_api_query, limit_inner_hits, merge_unavailable_fields_on_parent_document, - set_results_child_docs, + set_child_docs_and_score, set_results_highlights, ) from cl.lib.scorched_utils import ExtraSolrInterface @@ -475,10 +475,7 @@ def process_results(self, results: Response) -> None: "v4", self.clean_data["highlight"], ) - set_results_child_docs(results) - for result in results: - # Include the ES main document score as bm25_score. - result["bm25_score"] = result.meta.score + set_child_docs_and_score(results, merge_score=True) if self.reverse: # If doing backward pagination, reverse the results of the current diff --git a/cl/tests/cases.py b/cl/tests/cases.py index 2b549f741a..0b95c311e1 100644 --- a/cl/tests/cases.py +++ b/cl/tests/cases.py @@ -663,6 +663,7 @@ def _assert_webhook_hit_hl( field_name, hl_expected, child_field, + nested_field="recap_documents", ): """Assert Hl in webhook fields.""" for webhook in webhooks: @@ -671,10 +672,10 @@ def _assert_webhook_hit_hl( if child_field: self.assertNotIn( "score", - hit["recap_documents"][0]["meta"], + hit[nested_field][0]["meta"], msg="score shouldn't be present on webhook nested documents", ) - child_field_content = hit["recap_documents"][0][field_name] + child_field_content = hit[nested_field][0][field_name] self.assertIn( hl_expected, child_field_content, From d12e30b648ca3d41777726181c940fc964fdc952 Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Fri, 22 Nov 2024 17:20:44 -0600 Subject: [PATCH 090/143] fix(api): Fixed score merge and improved related test --- cl/lib/elasticsearch_utils.py | 2 +- cl/tests/cases.py | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/cl/lib/elasticsearch_utils.py b/cl/lib/elasticsearch_utils.py index 119f78a754..1b9366f713 100644 --- a/cl/lib/elasticsearch_utils.py +++ b/cl/lib/elasticsearch_utils.py @@ -3355,5 +3355,5 @@ def set_child_docs_and_score( merge_highlights_into_result(meta_hl, result) # Optionally merges the BM25 score for display in the API. 
- if merge_score and isinstance(result, Response): + if merge_score and isinstance(result, AttrDict): result["bm25_score"] = result.meta.score diff --git a/cl/tests/cases.py b/cl/tests/cases.py index 0b95c311e1..8b23dea418 100644 --- a/cl/tests/cases.py +++ b/cl/tests/cases.py @@ -270,6 +270,11 @@ async def _compare_field( set(meta_expected_value.keys()), f"The keys in field '{meta_field}' do not match.", ) + for score_value in meta_value.values(): + self.assertIsNotNone( + score_value, f"The score value can't be None." + ) + else: self.assertEqual( meta_value, From 9bef8df75773e891b7994c710df33a7b2d4f9ef0 Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Fri, 22 Nov 2024 17:53:33 -0600 Subject: [PATCH 091/143] feat(casenames): add new date format found in dataset --- .../management/commands/update_casenames_wl_dataset.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/cl/corpus_importer/management/commands/update_casenames_wl_dataset.py b/cl/corpus_importer/management/commands/update_casenames_wl_dataset.py index 8f203ad8ba..fe7eeeefe6 100644 --- a/cl/corpus_importer/management/commands/update_casenames_wl_dataset.py +++ b/cl/corpus_importer/management/commands/update_casenames_wl_dataset.py @@ -41,7 +41,14 @@ "for", } -DATE_FORMATS = ("%B %d, %Y", "%d-%b-%y", "%m/%d/%Y", "%m/%d/%y", "%b. %d, %Y") +DATE_FORMATS = ( + "%B %d, %Y", + "%d-%b-%y", + "%m/%d/%Y", + "%m/%d/%y", + "%b. %d, %Y", + "%Y-%m-%d", +) def tokenize_case_name(case_name: str) -> set[str]: @@ -105,6 +112,7 @@ def parse_date(date_str: str) -> date | None: 12/1/1960 26-Sep-00 Feb. 28, 2001 + 2007-01-24 :param date_str: date string :return: date object or none From 9c1b7cc3d61fecd295c128eb7d11df3ecbd4048b Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Fri, 22 Nov 2024 18:37:11 -0600 Subject: [PATCH 092/143] fix(api): Fixed people V4 API test to return scores --- cl/search/tests/tests_es_person.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cl/search/tests/tests_es_person.py b/cl/search/tests/tests_es_person.py index 6c59b01cf6..eb82285286 100644 --- a/cl/search/tests/tests_es_person.py +++ b/cl/search/tests/tests_es_person.py @@ -616,6 +616,7 @@ async def test_results_api_fields(self) -> None: search_params = { "type": SEARCH_TYPES.PEOPLE, "q": f"id:{self.person_2.pk} AND nomination_process:(U.S. Senate)", + "order_by": "score desc", } # API r = await self._test_api_results_count(search_params, 1, "API fields") @@ -662,6 +663,7 @@ def test_results_api_empty_fields(self) -> None: search_params = { "type": SEARCH_TYPES.PEOPLE, "q": f"id:{person.pk}", + "order_by": "score desc", } # API r = async_to_sync(self._test_api_results_count)( @@ -869,6 +871,7 @@ async def test_results_api_highlighted_fields(self) -> None: "q": f"id:{self.person_2.pk} name:Sheindlin dob_city:Brookyln nomination_process:(U.S. Senate) political_affiliation:Democratic", "school": "New York Law School", "dob_state": "NY", + "order_by": "score desc", } # Judged Search type HL disabled. 
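A standalone sketch of the multi-format loop the DATE_FORMATS tuple feeds
(the real parse_date lives in update_casenames_wl_dataset.py; this copy and
its assertion are illustrative only):

    from datetime import date, datetime

    DATE_FORMATS = (
        "%B %d, %Y",
        "%d-%b-%y",
        "%m/%d/%Y",
        "%m/%d/%y",
        "%b. %d, %Y",
        "%Y-%m-%d",
    )

    def parse_date(date_str: str) -> date | None:
        for fmt in DATE_FORMATS:
            try:
                return datetime.strptime(date_str.strip(), fmt).date()
            except ValueError:
                continue  # not this format; try the next one
        return None

    # Exercises the %Y-%m-%d format added in PATCH 091
    assert parse_date("2007-01-24") == date(2007, 1, 24)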
From 73fa6e5d19c24dd7196695fc965ff882af89ad66 Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Mon, 25 Nov 2024 10:22:15 -0600 Subject: [PATCH 093/143] fix(pacer_free_opinions): add ReadError to get_and_process_free_pdf decorator reduce to one second the sleep between courts cycle --- .../management/commands/scrape_pacer_free_opinions.py | 4 ++-- cl/corpus_importer/tasks.py | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/cl/corpus_importer/management/commands/scrape_pacer_free_opinions.py b/cl/corpus_importer/management/commands/scrape_pacer_free_opinions.py index 43611d240f..08b2de837d 100644 --- a/cl/corpus_importer/management/commands/scrape_pacer_free_opinions.py +++ b/cl/corpus_importer/management/commands/scrape_pacer_free_opinions.py @@ -331,10 +331,10 @@ def get_pdfs( throttle.update_min_items(min_items) logger.info( - f"Court cycle completed for: {row.court_id}. Current iteration: {cycle_checker.current_iteration}. Sleep 2 seconds " + f"Court cycle completed for: {row.court_id}. Current iteration: {cycle_checker.current_iteration}. Sleep 1 second " f"before starting the next cycle." ) - time.sleep(2) + time.sleep(1) logger.info(f"Processing row id: {row.id} from {row.court_id}") c = chain( process_free_opinion_result.si( diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index 8ed46333f7..bfa21e43b5 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -25,6 +25,7 @@ from httpx import ( HTTPStatusError, NetworkError, + ReadError, RemoteProtocolError, TimeoutException, ) @@ -598,6 +599,7 @@ def process_free_opinion_result( ConnectionError, ReadTimeout, RedisConnectionError, + ReadError, ), max_retries=15, interval_start=5, From 249d8fcb6627e727a154aec2181c3ba0ea306125 Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Mon, 25 Nov 2024 10:22:59 -0600 Subject: [PATCH 094/143] fix(api): Prioritize DjangoModelPermissionsOrAnonReadOnly when checking V3APIPermission --- cl/api/api_permissions.py | 15 +++++++++++---- cl/api/tests.py | 23 +++++++++++++++++++++++ cl/disclosures/api_views.py | 1 + 3 files changed, 35 insertions(+), 4 deletions(-) diff --git a/cl/api/api_permissions.py b/cl/api/api_permissions.py index 6562da789f..b73fc5fcae 100644 --- a/cl/api/api_permissions.py +++ b/cl/api/api_permissions.py @@ -2,9 +2,9 @@ from django.conf import settings from django.contrib.auth.models import AnonymousUser, User -from django.http import HttpRequest from rest_framework import permissions from rest_framework.exceptions import PermissionDenied +from rest_framework.request import Request from rest_framework.views import APIView from cl.lib.redis_utils import get_redis_interface @@ -19,7 +19,10 @@ def has_object_permission(self, request, view, obj): return obj.user == request.user -class V3APIPermission(permissions.BasePermission): +class V3APIPermission( + permissions.DjangoModelPermissionsOrAnonReadOnly, + permissions.BasePermission, +): r = get_redis_interface("STATS") v3_blocked_message = ( @@ -52,7 +55,7 @@ def is_user_v3_blocked(self, user: User) -> bool: return is_blocked_user @staticmethod - def is_v3_api_request(request: HttpRequest) -> bool: + def is_v3_api_request(request: Request) -> bool: return getattr(request, "version", None) == "v3" @staticmethod @@ -62,7 +65,7 @@ def check_request() -> bool: return True return False - def has_permission(self, request: HttpRequest, view: APIView) -> bool: + def has_permission(self, request: Request, view: APIView) -> bool: """Check if the user has permission to access the V3 
API. :param request: The HTTPRequest object. @@ -70,6 +73,10 @@ def has_permission(self, request: HttpRequest, view: APIView) -> bool: :return: True if the user has permission to access V3, False if not. """ + # Prioritize DjangoModelPermissionsOrAnonReadOnly permissions + if not super().has_permission(request, view): + return False + if ( not self.is_v3_api_request(request) or not settings.BLOCK_NEW_V3_USERS # type: ignore diff --git a/cl/api/tests.py b/cl/api/tests.py index 63a8e14aa5..238c0d04a7 100644 --- a/cl/api/tests.py +++ b/cl/api/tests.py @@ -484,6 +484,8 @@ def setUpTestData(cls) -> None: cls.audio_path_v3 = reverse("audio-list", kwargs={"version": "v3"}) cls.audio_path_v4 = reverse("audio-list", kwargs={"version": "v4"}) + cls.debt_path_v4 = reverse("debt-list", kwargs={"version": "v4"}) + cls.debt_path_v3 = reverse("debt-list", kwargs={"version": "v3"}) def setUp(self) -> None: self.r = get_redis_interface("STATS") @@ -595,6 +597,27 @@ async def test_allow_v4_for_anonymous_users(self, mock_api_prefix) -> None: response = await self.async_client.get(self.audio_path_v4) self.assertEqual(response.status_code, HTTPStatus.OK) + async def test_confirm_v4_post_requests_are_not_allowed( + self, mock_api_prefix + ) -> None: + """Confirm V4 users are not allowed to POST requests.""" + response = await self.client_2.post(self.debt_path_v4, {}) + self.assertEqual(response.status_code, HTTPStatus.FORBIDDEN) + + async def test_confirm_v3_post_requests_are_not_allowed( + self, mock_api_prefix + ) -> None: + """Confirm V3 users are not allowed to POST requests.""" + response = await self.client_2.post(self.debt_path_v3, {}) + self.assertEqual(response.status_code, HTTPStatus.FORBIDDEN) + + async def test_confirm_anonymous_post_requests_are_not_allowed( + self, mock_api_prefix + ) -> None: + """Confirm anonymous users are not allowed to POST requests.""" + response = await self.async_client.post(self.debt_path_v4, {}) + self.assertEqual(response.status_code, HTTPStatus.UNAUTHORIZED) + class DRFOrderingTests(TestCase): """Does ordering work generally and specifically?""" diff --git a/cl/disclosures/api_views.py b/cl/disclosures/api_views.py index 1c1be6f3a4..98f03e67fc 100644 --- a/cl/disclosures/api_views.py +++ b/cl/disclosures/api_views.py @@ -1,4 +1,5 @@ from rest_framework import viewsets +from rest_framework.permissions import DjangoModelPermissionsOrAnonReadOnly from cl.api.api_permissions import V3APIPermission from cl.api.utils import LoggingMixin From c218189b347d8ecaef57a2e32f85bdd005d42a1b Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Mon, 25 Nov 2024 10:47:16 -0600 Subject: [PATCH 095/143] fix(api): Added DjangoModelPermissionsOrAnonReadOnly to required api views --- cl/api/api_permissions.py | 9 +------- cl/audio/api_views.py | 6 ++++- cl/disclosures/api_views.py | 45 ++++++++++++++++++++++++++++-------- cl/people_db/api_views.py | 46 +++++++++++++++++++++++++++++-------- cl/recap/views.py | 10 ++++++-- cl/search/api_views.py | 31 ++++++++++++++++++++----- 6 files changed, 112 insertions(+), 35 deletions(-) diff --git a/cl/api/api_permissions.py b/cl/api/api_permissions.py index b73fc5fcae..c5af0d0696 100644 --- a/cl/api/api_permissions.py +++ b/cl/api/api_permissions.py @@ -19,10 +19,7 @@ def has_object_permission(self, request, view, obj): return obj.user == request.user -class V3APIPermission( - permissions.DjangoModelPermissionsOrAnonReadOnly, - permissions.BasePermission, -): +class V3APIPermission(permissions.BasePermission): r = get_redis_interface("STATS") 
v3_blocked_message = ( @@ -73,10 +70,6 @@ def has_permission(self, request: Request, view: APIView) -> bool: :return: True if the user has permission to access V3, False if not. """ - # Prioritize DjangoModelPermissionsOrAnonReadOnly permissions - if not super().has_permission(request, view): - return False - if ( not self.is_v3_api_request(request) or not settings.BLOCK_NEW_V3_USERS # type: ignore diff --git a/cl/audio/api_views.py b/cl/audio/api_views.py index a444db4a98..fa6d518ec9 100644 --- a/cl/audio/api_views.py +++ b/cl/audio/api_views.py @@ -1,4 +1,5 @@ from rest_framework import viewsets +from rest_framework.permissions import DjangoModelPermissionsOrAnonReadOnly from cl.api.api_permissions import V3APIPermission from cl.api.utils import LoggingMixin @@ -10,7 +11,10 @@ class AudioViewSet(LoggingMixin, viewsets.ModelViewSet): serializer_class = AudioSerializer filterset_class = AudioFilter - permission_classes = [V3APIPermission] + permission_classes = [ + DjangoModelPermissionsOrAnonReadOnly, + V3APIPermission, + ] ordering_fields = ( "id", "date_created", diff --git a/cl/disclosures/api_views.py b/cl/disclosures/api_views.py index 98f03e67fc..64ce52bac4 100644 --- a/cl/disclosures/api_views.py +++ b/cl/disclosures/api_views.py @@ -41,7 +41,10 @@ class AgreementViewSet(LoggingMixin, viewsets.ModelViewSet): queryset = Agreement.objects.all().order_by("-id") serializer_class = AgreementSerializer - permission_classes = [V3APIPermission] + permission_classes = [ + DjangoModelPermissionsOrAnonReadOnly, + V3APIPermission, + ] ordering_fields = ("id", "date_created", "date_modified") filterset_class = AgreementFilter # Default cursor ordering key @@ -57,7 +60,10 @@ class AgreementViewSet(LoggingMixin, viewsets.ModelViewSet): class DebtViewSet(LoggingMixin, viewsets.ModelViewSet): queryset = Debt.objects.all().order_by("-id") serializer_class = DebtSerializer - permission_classes = [V3APIPermission] + permission_classes = [ + DjangoModelPermissionsOrAnonReadOnly, + V3APIPermission, + ] ordering_fields = ("id", "date_created", "date_modified") filterset_class = DebtFilter # Default cursor ordering key @@ -88,7 +94,10 @@ class FinancialDisclosureViewSet(LoggingMixin, viewsets.ModelViewSet): ) serializer_class = FinancialDisclosureSerializer filterset_class = FinancialDisclosureFilter - permission_classes = [V3APIPermission] + permission_classes = [ + DjangoModelPermissionsOrAnonReadOnly, + V3APIPermission, + ] ordering_fields = ("id", "date_created", "date_modified") # Default cursor ordering key ordering = "-id" @@ -104,7 +113,10 @@ class GiftViewSet(LoggingMixin, viewsets.ModelViewSet): queryset = Gift.objects.all().order_by("-id") serializer_class = GiftSerializer filterset_class = GiftFilter - permission_classes = [V3APIPermission] + permission_classes = [ + DjangoModelPermissionsOrAnonReadOnly, + V3APIPermission, + ] ordering_fields = ("id", "date_created", "date_modified") # Default cursor ordering key ordering = "-id" @@ -120,7 +132,10 @@ class InvestmentViewSet(LoggingMixin, viewsets.ModelViewSet): queryset = Investment.objects.all().order_by("-id") serializer_class = InvestmentSerializer filterset_class = InvestmentFilter - permission_classes = [V3APIPermission] + permission_classes = [ + DjangoModelPermissionsOrAnonReadOnly, + V3APIPermission, + ] ordering_fields = ("id", "date_created", "date_modified") # Default cursor ordering key ordering = "-id" @@ -136,7 +151,10 @@ class NonInvestmentIncomeViewSet(LoggingMixin, viewsets.ModelViewSet): queryset = 
NonInvestmentIncome.objects.all().order_by("-id") serializer_class = NonInvestmentIncomeSerializer filterset_class = NonInvestmentIncomeFilter - permission_classes = [V3APIPermission] + permission_classes = [ + DjangoModelPermissionsOrAnonReadOnly, + V3APIPermission, + ] ordering_fields = ("id", "date_created", "date_modified") # Default cursor ordering key ordering = "-id" @@ -152,7 +170,10 @@ class PositionViewSet(LoggingMixin, viewsets.ModelViewSet): queryset = Position.objects.all().order_by("-id") serializer_class = PositionSerializer filterset_class = PositionFilter - permission_classes = [V3APIPermission] + permission_classes = [ + DjangoModelPermissionsOrAnonReadOnly, + V3APIPermission, + ] ordering_fields = ("id", "date_created", "date_modified") # Default cursor ordering key ordering = "-id" @@ -168,7 +189,10 @@ class ReimbursementViewSet(LoggingMixin, viewsets.ModelViewSet): queryset = Reimbursement.objects.all().order_by("-id") serializer_class = ReimbursementSerializer filterset_class = ReimbursementFilter - permission_classes = [V3APIPermission] + permission_classes = [ + DjangoModelPermissionsOrAnonReadOnly, + V3APIPermission, + ] ordering_fields = ("id", "date_created", "date_modified") # Default cursor ordering key ordering = "-id" @@ -184,7 +208,10 @@ class SpouseIncomeViewSet(LoggingMixin, viewsets.ModelViewSet): queryset = SpouseIncome.objects.all().order_by("-id") serializer_class = SpouseIncomeSerializer filterset_class = SpouseIncomeFilter - permission_classes = [V3APIPermission] + permission_classes = [ + DjangoModelPermissionsOrAnonReadOnly, + V3APIPermission, + ] ordering_fields = ("id", "date_created", "date_modified") # Default cursor ordering key ordering = "-id" diff --git a/cl/people_db/api_views.py b/cl/people_db/api_views.py index c593c1789a..7675ef32b6 100644 --- a/cl/people_db/api_views.py +++ b/cl/people_db/api_views.py @@ -1,5 +1,6 @@ from django.db.models import Exists, OuterRef, Prefetch from rest_framework import viewsets +from rest_framework.permissions import DjangoModelPermissionsOrAnonReadOnly from cl.api.api_permissions import V3APIPermission from cl.api.pagination import TinyAdjustablePagination @@ -90,7 +91,10 @@ class PersonDisclosureViewSet(viewsets.ModelViewSet): serializer_class = PersonDisclosureSerializer filterset_class = PersonDisclosureFilter pagination_class = TinyAdjustablePagination - permission_classes = [V3APIPermission] + permission_classes = [ + DjangoModelPermissionsOrAnonReadOnly, + V3APIPermission, + ] ordering_fields = ( "id", "date_created", @@ -122,7 +126,10 @@ class PersonViewSet(LoggingMixin, viewsets.ModelViewSet): ) serializer_class = PersonSerializer filterset_class = PersonFilter - permission_classes = [V3APIPermission] + permission_classes = [ + DjangoModelPermissionsOrAnonReadOnly, + V3APIPermission, + ] ordering_fields = ( "id", "date_created", @@ -145,7 +152,10 @@ class PositionViewSet(LoggingMixin, viewsets.ModelViewSet): queryset = Position.objects.all().order_by("-id") serializer_class = PositionSerializer filterset_class = PositionFilter - permission_classes = [V3APIPermission] + permission_classes = [ + DjangoModelPermissionsOrAnonReadOnly, + V3APIPermission, + ] ordering_fields = ( "id", "date_created", @@ -175,7 +185,10 @@ class RetentionEventViewSet(LoggingMixin, viewsets.ModelViewSet): queryset = RetentionEvent.objects.all().order_by("-id") serializer_class = RetentionEventSerializer filterset_class = RetentionEventFilter - permission_classes = [V3APIPermission] + permission_classes = [ + 
DjangoModelPermissionsOrAnonReadOnly, + V3APIPermission, + ] ordering_fields = ("id", "date_created", "date_modified", "date_retention") # Default cursor ordering key ordering = "-id" @@ -191,7 +204,10 @@ class EducationViewSet(LoggingMixin, viewsets.ModelViewSet): queryset = Education.objects.all().order_by("-id") serializer_class = EducationSerializer filterset_class = EducationFilter - permission_classes = [V3APIPermission] + permission_classes = [ + DjangoModelPermissionsOrAnonReadOnly, + V3APIPermission, + ] ordering_fields = ("id", "date_created", "date_modified") # Default cursor ordering key ordering = "-id" @@ -207,7 +223,10 @@ class SchoolViewSet(LoggingMixin, viewsets.ModelViewSet): queryset = School.objects.all().order_by("-id") serializer_class = SchoolSerializer filterset_class = SchoolFilter - permission_classes = [V3APIPermission] + permission_classes = [ + DjangoModelPermissionsOrAnonReadOnly, + V3APIPermission, + ] ordering_fields = ("id", "date_created", "date_modified", "name") # Default cursor ordering key ordering = "-id" @@ -223,7 +242,10 @@ class PoliticalAffiliationViewSet(LoggingMixin, viewsets.ModelViewSet): queryset = PoliticalAffiliation.objects.all().order_by("-id") serializer_class = PoliticalAffiliationSerializer filterset_class = PoliticalAffiliationFilter - permission_classes = [V3APIPermission] + permission_classes = [ + DjangoModelPermissionsOrAnonReadOnly, + V3APIPermission, + ] ordering_fields = ( "id", "date_created", @@ -245,7 +267,10 @@ class SourceViewSet(LoggingMixin, viewsets.ModelViewSet): queryset = Source.objects.all().order_by("-id") serializer_class = SourceSerializer filterset_class = SourceFilter - permission_classes = [V3APIPermission] + permission_classes = [ + DjangoModelPermissionsOrAnonReadOnly, + V3APIPermission, + ] ordering_fields = ( "id", "date_modified", @@ -261,7 +286,10 @@ class ABARatingViewSet(LoggingMixin, viewsets.ModelViewSet): queryset = ABARating.objects.all().order_by("-id") serializer_class = ABARatingSerializer filterset_class = ABARatingFilter - permission_classes = [V3APIPermission] + permission_classes = [ + DjangoModelPermissionsOrAnonReadOnly, + V3APIPermission, + ] ordering_fields = ( "id", "date_created", diff --git a/cl/recap/views.py b/cl/recap/views.py index 9bb70cb6cf..f2383b8868 100644 --- a/cl/recap/views.py +++ b/cl/recap/views.py @@ -3,7 +3,10 @@ from asgiref.sync import async_to_sync, sync_to_async from django.contrib.auth.models import User from rest_framework.exceptions import ValidationError -from rest_framework.permissions import IsAuthenticatedOrReadOnly +from rest_framework.permissions import ( + DjangoModelPermissionsOrAnonReadOnly, + IsAuthenticatedOrReadOnly, +) from rest_framework.viewsets import ModelViewSet from cl.api.api_permissions import V3APIPermission @@ -179,7 +182,10 @@ class FjcIntegratedDatabaseViewSet(LoggingMixin, ModelViewSet): queryset = FjcIntegratedDatabase.objects.all().order_by("-id") serializer_class = FjcIntegratedDatabaseSerializer filterset_class = FjcIntegratedDatabaseFilter - permission_classes = [V3APIPermission] + permission_classes = [ + DjangoModelPermissionsOrAnonReadOnly, + V3APIPermission, + ] ordering_fields = ( "id", "date_created", diff --git a/cl/search/api_views.py b/cl/search/api_views.py index 2a2ca2eeeb..df798edd6f 100644 --- a/cl/search/api_views.py +++ b/cl/search/api_views.py @@ -4,6 +4,7 @@ from rest_framework import pagination, permissions, response, viewsets from rest_framework.exceptions import NotFound from rest_framework.pagination import 
PageNumberPagination +from rest_framework.permissions import DjangoModelPermissionsOrAnonReadOnly from cl.api.api_permissions import V3APIPermission from cl.api.pagination import ESCursorPagination @@ -65,7 +66,10 @@ class OriginatingCourtInformationViewSet(viewsets.ModelViewSet): serializer_class = OriginalCourtInformationSerializer - permission_classes = [V3APIPermission] + permission_classes = [ + DjangoModelPermissionsOrAnonReadOnly, + V3APIPermission, + ] # Default cursor ordering key ordering = "-id" # Additional cursor ordering fields @@ -80,7 +84,10 @@ class OriginatingCourtInformationViewSet(viewsets.ModelViewSet): class DocketViewSet(LoggingMixin, viewsets.ModelViewSet): serializer_class = DocketSerializer filterset_class = DocketFilter - permission_classes = [V3APIPermission] + permission_classes = [ + DjangoModelPermissionsOrAnonReadOnly, + V3APIPermission, + ] ordering_fields = ( "id", "date_created", @@ -171,7 +178,10 @@ class RECAPDocumentViewSet( class CourtViewSet(LoggingMixin, viewsets.ModelViewSet): serializer_class = CourtSerializer filterset_class = CourtFilter - permission_classes = [V3APIPermission] + permission_classes = [ + DjangoModelPermissionsOrAnonReadOnly, + V3APIPermission, + ] ordering_fields = ( "id", "date_modified", @@ -191,7 +201,10 @@ class CourtViewSet(LoggingMixin, viewsets.ModelViewSet): class OpinionClusterViewSet(LoggingMixin, viewsets.ModelViewSet): serializer_class = OpinionClusterSerializer filterset_class = OpinionClusterFilter - permission_classes = [V3APIPermission] + permission_classes = [ + DjangoModelPermissionsOrAnonReadOnly, + V3APIPermission, + ] ordering_fields = ( "id", "date_created", @@ -216,7 +229,10 @@ class OpinionClusterViewSet(LoggingMixin, viewsets.ModelViewSet): class OpinionViewSet(LoggingMixin, viewsets.ModelViewSet): serializer_class = OpinionSerializer filterset_class = OpinionFilter - permission_classes = [V3APIPermission] + permission_classes = [ + DjangoModelPermissionsOrAnonReadOnly, + V3APIPermission, + ] ordering_fields = ( "id", "date_created", @@ -240,7 +256,10 @@ class OpinionViewSet(LoggingMixin, viewsets.ModelViewSet): class OpinionsCitedViewSet(LoggingMixin, viewsets.ModelViewSet): serializer_class = OpinionsCitedSerializer filterset_class = OpinionsCitedFilter - permission_classes = [V3APIPermission] + permission_classes = [ + DjangoModelPermissionsOrAnonReadOnly, + V3APIPermission, + ] # Default cursor ordering key ordering = "-id" # Additional cursor ordering fields From 3337f5bfb273cc2ba3846310f828e3962eeae602 Mon Sep 17 00:00:00 2001 From: Gianfranco Rossi Date: Mon, 25 Nov 2024 12:21:51 -0500 Subject: [PATCH 096/143] feat(scrapers.admin): create admin page for a scraper status page - Includes a migration file for the materialized view - MV will have to be refreshed manually or via a cronjob - MV considers only courts that have an active scraper, and that have no updates in a week --- cl/scrapers/admin.py | 59 ++++------------ .../0004_create_mv_latest_opinion.py | 69 +++++++++++++++++++ .../0004_create_mv_latest_opinion.sql | 49 +++++++++++++ ...004_create_mv_latest_opinion_customers.sql | 49 +++++++++++++ 4 files changed, 181 insertions(+), 45 deletions(-) create mode 100644 cl/scrapers/migrations/0004_create_mv_latest_opinion.py create mode 100644 cl/scrapers/migrations/0004_create_mv_latest_opinion.sql create mode 100644 cl/scrapers/migrations/0004_create_mv_latest_opinion_customers.sql diff --git a/cl/scrapers/admin.py b/cl/scrapers/admin.py index c7cc689a6e..56ec54df03 100644 --- 
a/cl/scrapers/admin.py
+++ b/cl/scrapers/admin.py
@@ -32,68 +32,37 @@ class PACERFreeDocumentRowAdmin(admin.ModelAdmin):
 admin.site.register(UrlHash)
 
 
-class MVLatestOpinions(models.Model):
+class MVLatestOpinion(models.Model):
     """
     Model linked to materialized view for monitoring scrapers
 
+    The SQL for creating the view is in its migration file.
+
+    Must use `REFRESH MATERIALIZED VIEW scrapers_mv_latest_opinion`
+    periodically
     """
 
-    query = """
-    CREATE MATERIALIZED VIEW
-    scrapers_mv_latest_opinion
-    AS
-    (
-    SELECT
-        court_id,
-        max(so.date_created) as latest_creation_date,
-        (now() - max(so.date_created))::text as time_since
-    FROM
-        (
-        SELECT id, court_id
-        FROM search_docket
-        WHERE court_id IN (
-            SELECT id
-            FROM search_court
-            /*
-            Only check courts with scrapers in use
-            */
-            WHERE
-                has_opinion_scraper
-                AND in_use
-        )
-        ) sd
-    INNER JOIN
-        (SELECT id, docket_id FROM search_opinioncluster) soc ON soc.docket_id = sd.id
-    INNER JOIN
-        search_opinion so ON so.cluster_id = soc.id
-    GROUP BY
-        sd.court_id
-    HAVING
-        /*
-        Only return results for courts with no updates in a week
-        */
-        now() - max(so.date_created) > interval '7 days'
-    ORDER BY
-        2 DESC
-    )
-    """
     # a django model must have a primary key
     court_id = models.TextField(primary_key=True)
-    latest_creation_date = models.DateField()
+    latest_creation_date = models.DateTimeField()
     time_since = models.TextField()
+    view_last_updated = models.DateTimeField()
 
     class Meta:
-        managed = False  # ignore this model in migrations
+        managed = False
         db_table = "scrapers_mv_latest_opinion"
 
 
-@admin.register(MVLatestOpinions)
-class MVLatestOpinionsAdmin(admin.ModelAdmin):
+@admin.register(MVLatestOpinion)
+class MVLatestOpinionAdmin(admin.ModelAdmin):
     """Admin page to look at the latest opinion for each court
 
     Use this to monitor silently failing scrapers
     """
 
-    list_display = ["court_id", "latest_creation_date", "time_since"]
+    list_display = [
+        "court_id",
+        "latest_creation_date",
+        "time_since",
+        "view_last_updated",
+    ]
diff --git a/cl/scrapers/migrations/0004_create_mv_latest_opinion.py b/cl/scrapers/migrations/0004_create_mv_latest_opinion.py
new file mode 100644
index 0000000000..4570c75d97
--- /dev/null
+++ b/cl/scrapers/migrations/0004_create_mv_latest_opinion.py
@@ -0,0 +1,69 @@
+# Generated by Django 5.1.2 on 2024-11-25 15:27
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("scrapers", "0003_delete_errorlog"),
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name="MVLatestOpinion",
+            fields=[
+                (
+                    "court_id",
+                    models.TextField(primary_key=True, serialize=False),
+                ),
+                ("latest_creation_date", models.DateTimeField()),
+                ("time_since", models.TextField()),
+                ("view_last_updated", models.DateTimeField()),
+            ],
+            options={
+                "db_table": "scrapers_mv_latest_opinion",
+                "managed": False,
+            },
+        ),
+        migrations.RunSQL("""
+        CREATE MATERIALIZED VIEW IF NOT EXISTS
+        scrapers_mv_latest_opinion
+        AS
+        (
+        SELECT
+            court_id,
+            max(so.date_created) as latest_creation_date,
+            DATE_TRUNC('minutes', (now() - max(so.date_created)))::text as time_since,
+            now() as view_last_updated
+        FROM
+            (
+            SELECT id, court_id
+            FROM search_docket
+            WHERE court_id IN (
+                SELECT id
+                FROM search_court
+                /*
+                Only check courts with scrapers in use
+                */
+                WHERE
+                    has_opinion_scraper
+                    AND in_use
+            )
+            ) sd
+        INNER JOIN
+            (SELECT id, docket_id FROM search_opinioncluster) soc ON soc.docket_id = sd.id
+        INNER JOIN
+            search_opinion so ON so.cluster_id = soc.id
+        GROUP BY
+            sd.court_id
+        HAVING
+            /*
+            Only
return results for courts with no updates in a week + */ + now() - max(so.date_created) > interval '7 days' + ORDER BY + 2 DESC + ) + """) + ] diff --git a/cl/scrapers/migrations/0004_create_mv_latest_opinion.sql b/cl/scrapers/migrations/0004_create_mv_latest_opinion.sql new file mode 100644 index 0000000000..45c212298e --- /dev/null +++ b/cl/scrapers/migrations/0004_create_mv_latest_opinion.sql @@ -0,0 +1,49 @@ +BEGIN; +-- +-- Create model MVLatestOpinion +-- +-- (no-op) +-- +-- Raw SQL operation +-- + + CREATE MATERIALIZED VIEW IF NOT EXISTS + scrapers_mv_latest_opinion + AS + ( + SELECT + court_id, + max(so.date_created) as latest_creation_date, + DATE_TRUNC('minutes', (now() - max(so.date_created)))::text as time_since, + now() as view_last_updated + FROM + ( + SELECT id, court_id + FROM search_docket + WHERE court_id IN ( + SELECT id + FROM search_court + /* + Only check courts with scrapers in use + */ + WHERE + has_opinion_scraper + AND in_use + ) + ) sd + INNER JOIN + (SELECT id, docket_id FROM search_opinioncluster) soc ON soc.docket_id = sd.id + INNER JOIN + search_opinion so ON so.cluster_id = soc.id + GROUP BY + sd.court_id + HAVING + /* + Only return results for courts with no updates in a week + */ + now() - max(so.date_created) > interval '7 days' + ORDER BY + 2 DESC + ) + ; +COMMIT; diff --git a/cl/scrapers/migrations/0004_create_mv_latest_opinion_customers.sql b/cl/scrapers/migrations/0004_create_mv_latest_opinion_customers.sql new file mode 100644 index 0000000000..45c212298e --- /dev/null +++ b/cl/scrapers/migrations/0004_create_mv_latest_opinion_customers.sql @@ -0,0 +1,49 @@ +BEGIN; +-- +-- Create model MVLatestOpinion +-- +-- (no-op) +-- +-- Raw SQL operation +-- + + CREATE MATERIALIZED VIEW IF NOT EXISTS + scrapers_mv_latest_opinion + AS + ( + SELECT + court_id, + max(so.date_created) as latest_creation_date, + DATE_TRUNC('minutes', (now() - max(so.date_created)))::text as time_since, + now() as view_last_updated + FROM + ( + SELECT id, court_id + FROM search_docket + WHERE court_id IN ( + SELECT id + FROM search_court + /* + Only check courts with scrapers in use + */ + WHERE + has_opinion_scraper + AND in_use + ) + ) sd + INNER JOIN + (SELECT id, docket_id FROM search_opinioncluster) soc ON soc.docket_id = sd.id + INNER JOIN + search_opinion so ON so.cluster_id = soc.id + GROUP BY + sd.court_id + HAVING + /* + Only return results for courts with no updates in a week + */ + now() - max(so.date_created) > interval '7 days' + ORDER BY + 2 DESC + ) + ; +COMMIT; From 2136db8414dd17e927c873df27ba5d476f49ea3e Mon Sep 17 00:00:00 2001 From: Gianfranco Rossi Date: Mon, 25 Nov 2024 12:23:45 -0500 Subject: [PATCH 097/143] refactor(scrapers.update_from_text): "C" to SOURCES.COURT_WEBSITE --- cl/scrapers/management/commands/update_from_text.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/cl/scrapers/management/commands/update_from_text.py b/cl/scrapers/management/commands/update_from_text.py index 399e49e1b1..ee093d9e01 100644 --- a/cl/scrapers/management/commands/update_from_text.py +++ b/cl/scrapers/management/commands/update_from_text.py @@ -5,7 +5,12 @@ from cl.lib.command_utils import ScraperCommand, logger from cl.scrapers.tasks import update_document_from_text -from cl.search.models import PRECEDENTIAL_STATUS, Opinion, OpinionCluster +from cl.search.models import ( + PRECEDENTIAL_STATUS, + SOURCES, + Opinion, + OpinionCluster, +) def rerun_extract_from_text( @@ -169,7 +174,7 @@ def handle(self, *args, **options): "docket__court_id": 
court_id, "date_filed__gte": options["date_filed_gte"], "date_filed__lte": options["date_filed_lte"], - "source__contains": "C", + "source__contains": SOURCES.COURT_WEBSITE, } if options["cluster_status"]: From 71dc92a6f8b62a608aa2b7cb41874b37546e28d5 Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Mon, 25 Nov 2024 13:58:56 -0600 Subject: [PATCH 098/143] fix(elasticsearch): Applied suggestion in set_child_docs_and_score --- cl/lib/elasticsearch_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cl/lib/elasticsearch_utils.py b/cl/lib/elasticsearch_utils.py index 1b9366f713..2c1f1053c9 100644 --- a/cl/lib/elasticsearch_utils.py +++ b/cl/lib/elasticsearch_utils.py @@ -3337,8 +3337,7 @@ def set_child_docs_and_score( if result_is_dict: # If the result is a dictionary, do nothing, or assign [] to # child_docs if it is not present. - child_docs = result.get("child_docs", []) - result["child_docs"] = child_docs + result["child_docs"] = result.get("child_docs", []) else: # Process child hits if the result is an ES AttrDict instance, # so they can be properly serialized. From 1bbb82e16ac43d64de32bf5a16e09b78a06b5d68 Mon Sep 17 00:00:00 2001 From: Elisa Anguita Date: Mon, 25 Nov 2024 19:24:53 -0300 Subject: [PATCH 099/143] feat(webhook_logs): Display timezone in all datetimes --- cl/users/templates/includes/webhook-event-detail.html | 4 ++-- .../templates/includes/webhooks_htmx/webhook-logs-list.html | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cl/users/templates/includes/webhook-event-detail.html b/cl/users/templates/includes/webhook-event-detail.html index 20f631fb53..2510ac729c 100644 --- a/cl/users/templates/includes/webhook-event-detail.html +++ b/cl/users/templates/includes/webhook-event-detail.html @@ -13,11 +13,11 @@

 Webhook Event Details{% if webhook_event.debug %} (
 {% if webhook_event.webhook.enabled %} Enabled {% else %} Disabled {% endif %}
 {{ webhook_event.webhook.get_event_type_display }}
 {{ webhook_event.event_id }}
-{{ webhook_event.date_created }}
+{{ webhook_event.date_created|date:"M. j, Y, h:m a T" }}
 {% if webhook_event.status_code %}{{ webhook_event.status_code }} {{ webhook_event.get_status_code_display }} {% else %}-{% endif %}
 {{ webhook_event.get_event_status_display }}
 {{ webhook_event.retry_counter }}
-{% if not webhook_event.debug %}{% if webhook_event.next_retry_date %}{{ webhook_event.next_retry_date }}{% else %}-{% endif %}{% else %}Test events will not be retried{% endif %}
+{% if not webhook_event.debug %}{% if webhook_event.next_retry_date %}{{ webhook_event.next_retry_date|date:"M. j, Y, h:m a T" }}{% else %}-{% endif %}{% else %}Test events will not be retried{% endif %}
diff --git a/cl/users/templates/includes/webhooks_htmx/webhook-logs-list.html b/cl/users/templates/includes/webhooks_htmx/webhook-logs-list.html
index dc022dff94..a43f9eeb85 100644
--- a/cl/users/templates/includes/webhooks_htmx/webhook-logs-list.html
+++ b/cl/users/templates/includes/webhooks_htmx/webhook-logs-list.html
@@ -21,11 +21,11 @@ {% endif %}
-{{ webhook.date_created }}
+{{ webhook.date_created|date:"M. j, Y, h:m a T" }}
 {% if not webhook.debug %}
 {% if webhook.next_retry_date %}
-{{ webhook.next_retry_date }}
+{{ webhook.next_retry_date|date:"M. j, Y, h:m a T" }}
 {% else %}
 -
 {% endif %}

From 39761ebc4f0e943b1ec1841f56590ce07dcac311 Mon Sep 17 00:00:00 2001
From: Elisa Anguita
Date: Mon, 25 Nov 2024 19:44:40 -0300
Subject: [PATCH 100/143] fix(webhook_logs): Fix format string character

---
 cl/users/templates/includes/webhook-event-detail.html              | 4 ++--
 .../templates/includes/webhooks_htmx/webhook-logs-list.html        | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/cl/users/templates/includes/webhook-event-detail.html b/cl/users/templates/includes/webhook-event-detail.html
index 2510ac729c..63c5ed7920 100644
--- a/cl/users/templates/includes/webhook-event-detail.html
+++ b/cl/users/templates/includes/webhook-event-detail.html
@@ -13,11 +13,11 @@

 Webhook Event Details{% if webhook_event.debug %} (
 {% if webhook_event.webhook.enabled %} Enabled {% else %} Disabled {% endif %}
 {{ webhook_event.webhook.get_event_type_display }}
 {{ webhook_event.event_id }}
-{{ webhook_event.date_created|date:"M. j, Y, h:m a T" }}
+{{ webhook_event.date_created|date:"M. j, Y, h:i a T" }}
 {% if webhook_event.status_code %}{{ webhook_event.status_code }} {{ webhook_event.get_status_code_display }} {% else %}-{% endif %}
 {{ webhook_event.get_event_status_display }}
 {{ webhook_event.retry_counter }}
-{% if not webhook_event.debug %}{% if webhook_event.next_retry_date %}{{ webhook_event.next_retry_date|date:"M. j, Y, h:m a T" }}{% else %}-{% endif %}{% else %}Test events will not be retried{% endif %}
+{% if not webhook_event.debug %}{% if webhook_event.next_retry_date %}{{ webhook_event.next_retry_date|date:"M. j, Y, h:i a T" }}{% else %}-{% endif %}{% else %}Test events will not be retried{% endif %}
diff --git a/cl/users/templates/includes/webhooks_htmx/webhook-logs-list.html b/cl/users/templates/includes/webhooks_htmx/webhook-logs-list.html
index a43f9eeb85..18b19f641a 100644
--- a/cl/users/templates/includes/webhooks_htmx/webhook-logs-list.html
+++ b/cl/users/templates/includes/webhooks_htmx/webhook-logs-list.html
@@ -21,11 +21,11 @@ {% endif %}
-{{ webhook.date_created|date:"M. j, Y, h:m a T" }}
+{{ webhook.date_created|date:"M. j, Y, h:i a T" }}
 {% if not webhook.debug %}
 {% if webhook.next_retry_date %}
-{{ webhook.next_retry_date|date:"M. j, Y, h:m a T" }}
+{{ webhook.next_retry_date|date:"M. j, Y, h:i a T" }}
 {% else %}
 -
 {% endif %}

From e8c71e779126dcd74a98fce90ab8e73b85a61267 Mon Sep 17 00:00:00 2001
From: Elisa Anguita
Date: Mon, 25 Nov 2024 20:58:00 -0300
Subject: [PATCH 101/143] fix(webhook_logs): Always display datetimes in UTC in
 webhook logs

---
 cl/custom_filters/templatetags/extras.py               | 17 ++++++++++++++++-
 .../includes/webhook-event-detail.html                  |  5 +++--
 .../webhooks_htmx/webhook-logs-list.html                |  5 +++--
 3 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/cl/custom_filters/templatetags/extras.py b/cl/custom_filters/templatetags/extras.py
index 39d535b2df..6532ca2881 100644
--- a/cl/custom_filters/templatetags/extras.py
+++ b/cl/custom_filters/templatetags/extras.py
@@ -1,7 +1,7 @@
 import random
 import re
 import urllib.parse
-from datetime import datetime
+from datetime import datetime, timezone
 
 import waffle
 from django import template
@@ -337,6 +337,21 @@ def format_date(date_str: str) -> str:
     return date_str
 
 
+@register.filter
+def datetime_in_utc(date_obj) -> str:
+    """Formats a datetime object in UTC with timezone displayed.
+    For example: 'Nov. 25, 2024, 01:28 p.m. UTC'"""
+    if date_obj is None:
+        return ""
+    try:
+        return date_filter(
+            date_obj.astimezone(timezone.utc),
+            "M. j, Y, h:i a T",
+        )
+    except (ValueError, TypeError):
+        return date_obj
+
+
 @register.filter
 def build_docket_id_q_param(request_q: str, docket_id: str) -> str:
     """Build a query string that includes the docket ID and any existing query
diff --git a/cl/users/templates/includes/webhook-event-detail.html b/cl/users/templates/includes/webhook-event-detail.html
index 63c5ed7920..9f70262daa 100644
--- a/cl/users/templates/includes/webhook-event-detail.html
+++ b/cl/users/templates/includes/webhook-event-detail.html
@@ -1,4 +1,5 @@
 {% extends "profile/webhooks_base.html" %}
+{% load extras %}
 {% load static %}
 {% load waffle_tags %}
 {% load humanize %}
@@ -13,11 +14,11 @@

 Webhook Event Details{% if webhook_event.debug %} (
 {% if webhook_event.webhook.enabled %} Enabled {% else %} Disabled {% endif %}
 {{ webhook_event.webhook.get_event_type_display }}
 {{ webhook_event.event_id }}
-{{ webhook_event.date_created|date:"M. j, Y, h:i a T" }}
+{{ webhook_event.date_created|datetime_in_utc }}
 {% if webhook_event.status_code %}{{ webhook_event.status_code }} {{ webhook_event.get_status_code_display }} {% else %}-{% endif %}
 {{ webhook_event.get_event_status_display }}
 {{ webhook_event.retry_counter }}
-{% if not webhook_event.debug %}{% if webhook_event.next_retry_date %}{{ webhook_event.next_retry_date|date:"M. j, Y, h:i a T" }}{% else %}-{% endif %}{% else %}Test events will not be retried{% endif %}
+{% if not webhook_event.debug %}{% if webhook_event.next_retry_date %}{{ webhook_event.next_retry_date|datetime_in_utc }}{% else %}-{% endif %}{% else %}Test events will not be retried{% endif %}
diff --git a/cl/users/templates/includes/webhooks_htmx/webhook-logs-list.html b/cl/users/templates/includes/webhooks_htmx/webhook-logs-list.html
index 18b19f641a..a9f8596832 100644
--- a/cl/users/templates/includes/webhooks_htmx/webhook-logs-list.html
+++ b/cl/users/templates/includes/webhooks_htmx/webhook-logs-list.html
@@ -1,3 +1,4 @@
+{% load extras %}
 {% load widget_tweaks %}
 {% if results %}
 {% for webhook in results %}
@@ -21,11 +22,11 @@ {% endif %}
-{{ webhook.date_created|date:"M. j, Y, h:i a T" }}
+{{ webhook.date_created|datetime_in_utc }}
 {% if not webhook.debug %}
 {% if webhook.next_retry_date %}
-{{ webhook.next_retry_date|date:"M. j, Y, h:i a T" }}
+{{ webhook.next_retry_date|datetime_in_utc }}
 {% else %}
 -
 {% endif %}

From b2c9ada3443a0ef37e4fac3bb59485dc7cd49c7b Mon Sep 17 00:00:00 2001
From: Kevin Ramirez
Date: Mon, 25 Nov 2024 19:59:15 -0600
Subject: [PATCH 102/143] feat(casenames): update parse_citations function

---
 .../management/commands/update_casenames_wl_dataset.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/cl/corpus_importer/management/commands/update_casenames_wl_dataset.py b/cl/corpus_importer/management/commands/update_casenames_wl_dataset.py
index fe7eeeefe6..8a0e85a815 100644
--- a/cl/corpus_importer/management/commands/update_casenames_wl_dataset.py
+++ b/cl/corpus_importer/management/commands/update_casenames_wl_dataset.py
@@ -150,11 +150,8 @@ def parse_citations(citation_strings: list[str]) -> list[dict]:
         if not volume or not volume.isdigit():
             continue
 
-        if not citation.corrected_reporter():
-            reporter_type = Citation.STATE
-        else:
-            cite_type_str = citation.all_editions[0].reporter.cite_type
-            reporter_type = map_reporter_db_cite_type(cite_type_str)
+        cite_type_str = citation.all_editions[0].reporter.cite_type
+        reporter_type = map_reporter_db_cite_type(cite_type_str)
 
         # Append the validated citation as a dictionary
         validated_citations.append(

From 274112171af31e2d56f7c6b7f9607b8cb770b0d7 Mon Sep 17 00:00:00 2001
From: Alberto Islas
Date: Tue, 26 Nov 2024 11:11:19 -0600
Subject: [PATCH 103/143] fix(elasticsearch): Fixed ES MLT query

Fixes: #4305
---
 cl/lib/elasticsearch_utils.py       | 34 +++++++++++++++++++++--------
 cl/opinion_page/utils.py            |  8 +++----
 cl/search/constants.py              |  8 +++----
 cl/search/tests/tests_es_opinion.py |  1 +
 4 files changed, 34 insertions(+), 17 deletions(-)

diff --git a/cl/lib/elasticsearch_utils.py b/cl/lib/elasticsearch_utils.py
index 2c1f1053c9..96cb01653f 100644
--- a/cl/lib/elasticsearch_utils.py
+++ b/cl/lib/elasticsearch_utils.py
@@ -176,22 +176,38 @@ async def build_more_like_this_query(related_ids: list[str]) -> Query:
     exclusions for specific opinion clusters.
     """
 
-    document_list = [{"_id": f"o_{id}"} for id in related_ids]
+    opinion_cluster_pairs = [
+        opinion_pair
+        for opinion_id in related_ids
+        if (
+            opinion_pair := await Opinion.objects.filter(pk=opinion_id)
+            .values("pk", "cluster_id")
+            .afirst()
+        )
+    ]
+    unique_clusters = {pair["cluster_id"] for pair in opinion_cluster_pairs}
+
+    document_list = [
+        {
+            "_id": f'o_{opinion_pair["pk"]}',
+            "routing": opinion_pair["cluster_id"],
+        }
+        for opinion_pair in opinion_cluster_pairs
+    ]
     more_like_this_fields = SEARCH_MLT_OPINION_QUERY_FIELDS.copy()
     mlt_query = Q(
         "more_like_this",
         fields=more_like_this_fields,
         like=document_list,
-        min_term_freq=1,
-        max_query_terms=12,
+        min_term_freq=settings.RELATED_MLT_MINTF,
+        max_query_terms=settings.RELATED_MLT_MAXQT,
+        min_word_length=settings.RELATED_MLT_MINWL,
+        max_word_length=settings.RELATED_MLT_MAXWL,
+        max_doc_freq=settings.RELATED_MLT_MAXDF,
+        analyzer="search_analyzer_exact",
     )
     # Exclude opinion clusters to which the related IDs to query belong.
- cluster_ids_to_exclude = ( - OpinionCluster.objects.filter(sub_opinions__pk__in=related_ids) - .distinct("pk") - .values_list("pk", flat=True) - ) - cluster_ids_list = [pk async for pk in cluster_ids_to_exclude.aiterator()] + cluster_ids_list = list(unique_clusters) exclude_cluster_ids = [Q("terms", cluster_id=cluster_ids_list)] bool_query = Q("bool", must=[mlt_query], must_not=exclude_cluster_ids) return bool_query diff --git a/cl/opinion_page/utils.py b/cl/opinion_page/utils.py index 160453bb1f..b135d3b020 100644 --- a/cl/opinion_page/utils.py +++ b/cl/opinion_page/utils.py @@ -166,13 +166,11 @@ async def build_cites_clusters_query( async def build_related_clusters_query( cluster_search: Search, sub_opinion_pks: list[str], - search_params: dict[str, str], ) -> Search: """Build the ES related clusters query based on sub-opinion IDs. :param cluster_search: The Elasticsearch DSL Search object :param sub_opinion_pks: A list of IDs representing sub-opinions to be queried. - :param search_params: A dict of parameters used to form the query. :return: The ES DSL Search object representing the query to find the related clusters. """ @@ -267,11 +265,13 @@ async def es_get_citing_and_related_clusters_with_cache( related_index = citing_index = None if cached_related_clusters is None: related_query = await build_related_clusters_query( - cluster_search, sub_opinion_pks, search_params + cluster_search, sub_opinion_pks ) related_query = related_query.extra( - size=settings.RELATED_COUNT, track_total_hits=False + size=settings.RELATED_COUNT, + track_total_hits=False, ) + print("Related query opinion: ", related_query.to_dict()) multi_search = multi_search.add(related_query) related_index = response_index response_index += 1 diff --git a/cl/search/constants.py b/cl/search/constants.py index 333dfbca6c..f7e76cb8fb 100644 --- a/cl/search/constants.py +++ b/cl/search/constants.py @@ -110,10 +110,10 @@ "syllabus", ] SEARCH_MLT_OPINION_QUERY_FIELDS = [ - "procedural_history", - "posture", - "syllabus", - "text", + "procedural_history.exact", + "posture.exact", + "syllabus.exact", + "text.exact", ] # ES fields that are used for highlighting diff --git a/cl/search/tests/tests_es_opinion.py b/cl/search/tests/tests_es_opinion.py index c7d9c2568d..4996f7d985 100644 --- a/cl/search/tests/tests_es_opinion.py +++ b/cl/search/tests/tests_es_opinion.py @@ -2253,6 +2253,7 @@ def test_uses_exact_version_for_case_name_field(self) -> None: cluster_2.delete() +@override_settings(RELATED_MLT_MINTF=1) class RelatedSearchTest( ESIndexTestCase, CourtTestCase, PeopleTestCase, SearchTestCase, TestCase ): From 877cf130e2bc7c49dc2cd446cf8d98d794e20b5e Mon Sep 17 00:00:00 2001 From: Elisa Anguita Date: Tue, 26 Nov 2024 14:27:44 -0300 Subject: [PATCH 104/143] test(webhooks): Adjust test so it's now less sensitive to trailing whitespaces Adding a {% load %} tag introduced an extra newline that was picked up by the test. Instead of counting newline chars we now only focus on whether the response contains meaningful content or not. 
--- cl/users/tests.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cl/users/tests.py b/cl/users/tests.py index 89170a445c..f5e26aac09 100644 --- a/cl/users/tests.py +++ b/cl/users/tests.py @@ -3467,7 +3467,7 @@ async def test_list_webhook_events(self) -> None: response = await self.client.get(webhook_event_path_list) self.assertEqual(response.status_code, HTTPStatus.OK) # There shouldn't be results for user_1 - self.assertEqual(response.content, b"\n\n") + self.assertEqual(response.content.strip(), b"") sa_webhook = await sync_to_async(WebhookFactory)( user=self.user_1, @@ -3485,7 +3485,7 @@ async def test_list_webhook_events(self) -> None: response = await self.client.get(webhook_event_path_list) self.assertEqual(response.status_code, HTTPStatus.OK) # There should be results for user_1 - self.assertNotEqual(response.content, b"\n\n") + self.assertNotEqual(response.content.strip(), b"") async def test_get_available_webhook_versions(self) -> None: """Can we get users available versions for a webhook event type?""" From d8b72b08aea225cb7f1e9cb1ee1f1f114349a1f5 Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Tue, 26 Nov 2024 11:34:20 -0600 Subject: [PATCH 105/143] fix(elasticsearch): Added a fallback to the MLT query in case the IDs are not found in the DB --- cl/lib/elasticsearch_utils.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/cl/lib/elasticsearch_utils.py b/cl/lib/elasticsearch_utils.py index 96cb01653f..f0a88ce0ea 100644 --- a/cl/lib/elasticsearch_utils.py +++ b/cl/lib/elasticsearch_utils.py @@ -189,11 +189,15 @@ async def build_more_like_this_query(related_ids: list[str]) -> Query: document_list = [ { - "_id": f'o_{opinion_pair["pk"]}', - "routing": opinion_pair["cluster_id"], + "_id": f'o_{pair["pk"]}', + "routing": pair["cluster_id"], + # Important to match documents in the production cluster } - for opinion_pair in opinion_cluster_pairs - ] + for pair in opinion_cluster_pairs + ] or [ + {"_id": f"o_{pk}"} for pk in related_ids + ] # Fall back in case IDs are not found in DB. 
+ more_like_this_fields = SEARCH_MLT_OPINION_QUERY_FIELDS.copy() mlt_query = Q( "more_like_this", From dd1c21453f4a05d31d5373e49fb0fb06496b8e0c Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Tue, 26 Nov 2024 11:37:53 -0600 Subject: [PATCH 106/143] feat(casenames): fix code to combine initials in case names --- .../commands/update_casenames_wl_dataset.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/cl/corpus_importer/management/commands/update_casenames_wl_dataset.py b/cl/corpus_importer/management/commands/update_casenames_wl_dataset.py index 8a0e85a815..6c1d4def01 100644 --- a/cl/corpus_importer/management/commands/update_casenames_wl_dataset.py +++ b/cl/corpus_importer/management/commands/update_casenames_wl_dataset.py @@ -231,9 +231,15 @@ def combine_initials(case_name: str) -> str: :return: the cleaned case caption """ - pattern = r"((?:[A-Z]\.?\s?){2,})(\s|$)" - - return re.sub(pattern, lambda m: m.group(0).replace(".", ""), case_name) + initials_pattern = re.compile(r"(\b[A-Z]\.?\s?){2,}(\s|$)") + + matches = initials_pattern.finditer(case_name) + if matches: + for match in matches: + initials = match.group() + compressed_initials = re.sub(r"(?!\s$)[\s\.]", "", initials) + case_name = case_name.replace(initials, compressed_initials) + return case_name def process_csv(filepath: str, delay: float, dry_run: bool) -> None: From 8247d391d00dc873e5b19fa53bc802adec147341 Mon Sep 17 00:00:00 2001 From: William Palin Date: Tue, 26 Nov 2024 12:40:23 -0500 Subject: [PATCH 107/143] feat(opinions): Update css A few minor tweaks to some obvious css issues when looking around. Also - when we reingest the harvard data it is going to modify footnotes and page numbers in some. I want to add some css to make sure we are prepared for them. 
--- cl/assets/static-global/css/opinions.css | 28 +++++++++++++++++++++++- cl/assets/static-global/css/override.css | 1 - cl/assets/static-global/js/opinions.js | 9 ++++++++ 3 files changed, 36 insertions(+), 2 deletions(-) diff --git a/cl/assets/static-global/css/opinions.css b/cl/assets/static-global/css/opinions.css index b120143c0b..ff1b0200d3 100644 --- a/cl/assets/static-global/css/opinions.css +++ b/cl/assets/static-global/css/opinions.css @@ -578,6 +578,32 @@ div.footnote:first-of-type { margin-left: 2px; } + page-label { + font-style: italic; + font-size: 0.8em; + margin-right: 4px; + margin-left: 2px; + } + + page-label { + cursor: pointer; + } + + page-label:hover { + color: darkblue; + text-decoration: underline; /* Example hover styling */ + } + + page-label::after { + display: inline; + position: relative; + content: attr(data-label); + float: right; + font-size: 1em; + color: dimgray; + width: 0; + } + a.page-label { font-style: italic; font-size: 0.8em; @@ -623,6 +649,7 @@ div.footnote:first-of-type { /* Adjust to move the entire blockquote to the right */ blockquote { margin-left: 3em; + display: block; } div.counsel > a.page-label::after { @@ -687,7 +714,6 @@ div.footnote:first-of-type { display: block; text-indent: 1em; } - } html { diff --git a/cl/assets/static-global/css/override.css b/cl/assets/static-global/css/override.css index 021b6e6996..b0b0979f9e 100644 --- a/cl/assets/static-global/css/override.css +++ b/cl/assets/static-global/css/override.css @@ -1031,7 +1031,6 @@ closely the content in the book*/ #headmatter > .footnotes > .footnote > a { color: #000099; - position: absolute; font-size: 1em; } diff --git a/cl/assets/static-global/js/opinions.js b/cl/assets/static-global/js/opinions.js index 65d35e2248..e6665237bf 100644 --- a/cl/assets/static-global/js/opinions.js +++ b/cl/assets/static-global/js/opinions.js @@ -278,4 +278,13 @@ document.addEventListener('scroll', function () { if (activeLink.parentElement) { activeLink.parentElement.classList.add('active'); } +}); + +document.querySelectorAll("page-label").forEach(label => { + label.addEventListener("click", function() { + const href = this.getAttribute("href"); + if (href) { + window.location.href = href; + } + }); }); \ No newline at end of file From 9adcd298c410ef56ae7ced70b4b77893049a34fa Mon Sep 17 00:00:00 2001 From: Elisa Anguita Date: Tue, 26 Nov 2024 16:07:28 -0300 Subject: [PATCH 108/143] feat(api): Enable filtering courts by parent court id --- cl/search/filters.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cl/search/filters.py b/cl/search/filters.py index d7f11e472c..6f1d6f6603 100644 --- a/cl/search/filters.py +++ b/cl/search/filters.py @@ -28,6 +28,10 @@ class CourtFilter(NoEmptyFilterSet): "cl.search.filters.DocketFilter", queryset=Docket.objects.all() ) jurisdiction = filters.MultipleChoiceFilter(choices=Court.JURISDICTIONS) + parent_court = filters.CharFilter( + field_name="parent_court__id", + lookup_expr="exact", + ) class Meta: model = Court From 00885f3e6e84243d99b35830e346cbc866a0a6d4 Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Tue, 26 Nov 2024 13:17:51 -0600 Subject: [PATCH 109/143] fix(elasticsearch): Removed stray print --- cl/opinion_page/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cl/opinion_page/utils.py b/cl/opinion_page/utils.py index b135d3b020..a1c9d0eeeb 100644 --- a/cl/opinion_page/utils.py +++ b/cl/opinion_page/utils.py @@ -271,7 +271,6 @@ async def es_get_citing_and_related_clusters_with_cache( size=settings.RELATED_COUNT, 
             track_total_hits=False,
         )
-        print("Related query opinion: ", related_query.to_dict())
         multi_search = multi_search.add(related_query)
         related_index = response_index
         response_index += 1

From adf676efbb37e85409686a5d20d48bdd Mon Sep 17 00:00:00 2001
From: Alberto Islas
Date: Tue, 26 Nov 2024 16:58:58 -0600
Subject: [PATCH 110/143] fix(elasticsearch): Avoid wrapping numbers in quotes
 in boost queries

- Removed Solr failing tests.

Fixes: #4737
---
 cl/lib/utils.py          |  2 +-
 cl/search/tests/tests.py | 73 ++++++++--------------------------------
 2 files changed, 15 insertions(+), 60 deletions(-)

diff --git a/cl/lib/utils.py b/cl/lib/utils.py
index 223056420f..592f8876d0 100644
--- a/cl/lib/utils.py
+++ b/cl/lib/utils.py
@@ -248,7 +248,7 @@ def cleanup_main_query(query_string: str) -> str:
     """
     inside_a_phrase = False
     cleaned_items = []
-    for item in re.split(r'([^a-zA-Z0-9_\-~":]+)', query_string):
+    for item in re.split(r'([^a-zA-Z0-9_\-^~":]+)', query_string):
         if not item:
             continue
 
diff --git a/cl/search/tests/tests.py b/cl/search/tests/tests.py
index fe59be528d..3dac875f01 100644
--- a/cl/search/tests/tests.py
+++ b/cl/search/tests/tests.py
@@ -1076,6 +1076,20 @@ def test_round_estimated_search_counts(self) -> None:
             with self.subTest(test=test, msg="Test estimated search counts."):
                 self.assertEqual(simplify_estimated_count(test[0]), test[1])
 
+    def test_avoid_wrapping_boosted_numbers_in_quotes(self) -> None:
+        """Confirm that numbers in boost queries are not wrapped in quotes,
+        which would make the query fail.
+        """
+        search_params = {
+            "type": SEARCH_TYPES.ORAL_ARGUMENT,
+            "q": "Jose^3",
+        }
+        r = self.client.get(
+            reverse("show_results"),
+            search_params,
+        )
+        self.assertNotIn("encountered an error", r.content.decode())
+
 
 class SearchAPIV4CommonTest(ESIndexTestCase, TestCase):
     """Common tests for the Search API V4 endpoints."""
@@ -1643,35 +1657,6 @@ def test_search_query_saving(self) -> None:
             "Repeated query not marked as having hit cache",
         )
 
-    # Force Solr use
-    @override_flag("oa-es-active", False)
-    @override_flag("r-es-active", False)
-    @override_flag("p-es-active", False)
-    @override_flag("o-es-active", False)
-    def test_search_query_saving_solr(self) -> None:
-        """Are queries saved when using solr search (do_search)"""
-        for query in self.searches:
-            url = f"{reverse('show_results')}?{query}"
-            self.client.get(url)
-            last_query = SearchQuery.objects.last()
-            expected_query = self.normalize_query(query, replace_space=True)
-            stored_query = self.normalize_query(last_query.get_params)
-            self.assertEqual(
-                expected_query,
-                stored_query,
-                f"Query was not saved properly.
Expected {expected_query}, got {stored_query}", - ) - self.assertEqual( - last_query.engine, - SearchQuery.SOLR, - f"Saved wrong `engine` value, expected {SearchQuery.SOLR}", - ) - self.assertEqual( - last_query.source, - SearchQuery.WEBSITE, - self.source_error_message, - ) - def test_failed_es_search_queries(self) -> None: """Do we flag failed ElasticSearch queries properly?""" query = "type=r&q=contains/sproximity token" @@ -1772,36 +1757,6 @@ def test_failed_es_search_v3_api_queries(self) -> None: f"Saved wrong `engine` value, expected {SearchQuery.ELASTICSEARCH}", ) - @override_flag("oa-es-active", False) - @override_flag("oa-es-activate", False) - @override_flag("r-es-search-api-active", False) - @override_flag("p-es-active", False) - @override_flag("o-es-search-api-active", False) - def test_search_solr_api_v3_query_saving(self) -> None: - """Do we save queries on all V3 Search Solr endpoints""" - for query in self.base_searches: - url = f"{reverse("search-list", kwargs={"version": "v3"})}?{query}" - self.client.get(url) - # Compare parsed query strings; - last_query = SearchQuery.objects.last() - expected_query = self.normalize_query(query, replace_space=True) - stored_query = self.normalize_query(last_query.get_params) - self.assertEqual( - expected_query, - stored_query, - f"Query was not saved properly. Expected {expected_query}, got {stored_query}", - ) - self.assertEqual( - last_query.engine, - SearchQuery.SOLR, - f"Saved wrong `engine` value, expected {SearchQuery.ELASTICSEARCH}", - ) - self.assertEqual( - last_query.source, - SearchQuery.API, - self.source_error_message, - ) - class CaptionTest(TestCase): """Can we make good looking captions?""" From 5d0938681837ca5b1855dd040062f70ef01f2dde Mon Sep 17 00:00:00 2001 From: Elisa Anguita Date: Tue, 26 Nov 2024 20:59:29 -0300 Subject: [PATCH 111/143] test(api): Add tests for court filtering by parent_court --- cl/api/tests.py | 52 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/cl/api/tests.py b/cl/api/tests.py index 238c0d04a7..275695b50f 100644 --- a/cl/api/tests.py +++ b/cl/api/tests.py @@ -86,7 +86,7 @@ TagViewSet, ) from cl.search.factories import CourtFactory, DocketFactory -from cl.search.models import SOURCES, Docket, Opinion +from cl.search.models import SOURCES, Court, Docket, Opinion from cl.stats.models import Event from cl.tests.cases import SimpleTestCase, TestCase, TransactionTestCase from cl.tests.utils import MockResponse, make_client @@ -673,6 +673,56 @@ async def assertCountInResults(self, expected_count): ) +class DRFCourtApiFilterTests(TestCase, FilteringCountTestCase): + @classmethod + def setUpTestData(cls): + Court.objects.all().delete() + + cls.parent_court = CourtFactory(id="parent1", full_name="Parent Court") + + cls.child_court1 = CourtFactory( + id="child1", + parent_court=cls.parent_court, + full_name="Child Court 1", + ) + cls.child_court2 = CourtFactory( + id="child2", + parent_court=cls.parent_court, + full_name="Child Court 2", + ) + + cls.orphan_court = CourtFactory(id="orphan", full_name="Orphan Court") + + @async_to_sync + async def setUp(self): + self.path = reverse("court-list", kwargs={"version": "v4"}) + self.q: Dict[str, Any] = {} + + async def test_parent_court_filter(self): + """Can we filter courts by parent_court id?""" + self.q["parent_court"] = "parent1" + await self.assertCountInResults(2) # Should return child1 and child2 + + # Verify the returned court IDs + response = await self.async_client.get(self.path, self.q) + 
court_ids = [court["id"] for court in response.data["results"]] + self.assertEqual(set(court_ids), {"child1", "child2"}) + + # Filter for courts with parent_court id='orphan' (none should match) + self.q["parent_court"] = "orphan" + await self.assertCountInResults(0) + + async def test_no_parent_court_filter(self): + """Do we get all courts when using no filters?""" + self.q = {} + await self.assertCountInResults(4) # Should return all four courts + + async def test_invalid_parent_court_filter(self): + """Do we handle invalid parent_court values correctly?""" + self.q["parent_court"] = "nonexistent" + await self.assertCountInResults(0) + + class DRFJudgeApiFilterTests( SimpleUserDataMixin, TestCase, FilteringCountTestCase ): From 9093b7f707088bdc794bd8fc56fd34b056f4cff2 Mon Sep 17 00:00:00 2001 From: Elisa Anguita Date: Tue, 26 Nov 2024 22:02:36 -0300 Subject: [PATCH 112/143] test(api): Add more tests for court filtering using other fields --- cl/api/tests.py | 157 +++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 149 insertions(+), 8 deletions(-) diff --git a/cl/api/tests.py b/cl/api/tests.py index 275695b50f..94f068b819 100644 --- a/cl/api/tests.py +++ b/cl/api/tests.py @@ -1,5 +1,5 @@ import json -from datetime import date, timedelta +from datetime import date, datetime, timedelta, timezone from http import HTTPStatus from typing import Any, Dict from unittest import mock @@ -666,11 +666,18 @@ async def assertCountInResults(self, expected_count): f"the JSON: \n{r.json()}", ) got = len(r.data["results"]) + try: + path = r.request.get("path") + query_string = r.request.get("query_string") + url = f"{path}?{query_string}" + except AttributeError: + url = self.path self.assertEqual( got, expected_count, - msg=f"Expected {expected_count}, but got {got}.\n\nr.data was: {r.data}", + msg=f"Expected {expected_count}, but got {got} in {url}\n\nr.data was: {r.data}", ) + return r class DRFCourtApiFilterTests(TestCase, FilteringCountTestCase): @@ -678,21 +685,67 @@ class DRFCourtApiFilterTests(TestCase, FilteringCountTestCase): def setUpTestData(cls): Court.objects.all().delete() - cls.parent_court = CourtFactory(id="parent1", full_name="Parent Court") + cls.parent_court = CourtFactory( + id="parent1", + full_name="Parent Court", + short_name="PC", + citation_string="PC", + in_use=True, + has_opinion_scraper=True, + has_oral_argument_scraper=False, + position=1, + start_date=date(2000, 1, 1), + end_date=None, + jurisdiction=Court.FEDERAL_APPELLATE, + date_modified=datetime(2021, 1, 1, tzinfo=timezone.utc), + ) cls.child_court1 = CourtFactory( id="child1", parent_court=cls.parent_court, full_name="Child Court 1", + short_name="CC1", + citation_string="CC1", + in_use=False, + has_opinion_scraper=False, + has_oral_argument_scraper=True, + position=2, + start_date=date(2010, 6, 15), + end_date=date(2020, 12, 31), + jurisdiction=Court.STATE_SUPREME, + date_modified=datetime(2022, 6, 15, tzinfo=timezone.utc), ) cls.child_court2 = CourtFactory( id="child2", parent_court=cls.parent_court, full_name="Child Court 2", + short_name="CC2", + citation_string="CC2", + in_use=True, + has_opinion_scraper=False, + has_oral_argument_scraper=False, + position=3, + start_date=date(2015, 5, 20), + end_date=None, + jurisdiction=Court.STATE_TRIAL, + date_modified=datetime(2023, 3, 10, tzinfo=timezone.utc), + ) + + cls.orphan_court = CourtFactory( + id="orphan", + full_name="Orphan Court", + short_name="OC", + citation_string="OC", + in_use=True, + has_opinion_scraper=False, + 
has_oral_argument_scraper=False, + position=4, + start_date=date(2012, 8, 25), + end_date=None, + jurisdiction=Court.FEDERAL_DISTRICT, + date_modified=datetime(2023, 5, 5, tzinfo=timezone.utc), ) - cls.orphan_court = CourtFactory(id="orphan", full_name="Orphan Court") - @async_to_sync async def setUp(self): self.path = reverse("court-list", kwargs={"version": "v4"}) @@ -701,15 +754,15 @@ async def setUp(self): async def test_parent_court_filter(self): """Can we filter courts by parent_court id?""" self.q["parent_court"] = "parent1" - await self.assertCountInResults(2) # Should return child1 and child2 + # Should return child1 and child2: + response = await self.assertCountInResults(2) # Verify the returned court IDs - response = await self.async_client.get(self.path, self.q) court_ids = [court["id"] for court in response.data["results"]] self.assertEqual(set(court_ids), {"child1", "child2"}) # Filter for courts with parent_court id='orphan' (none should match) - self.q["parent_court"] = "orphan" + self.q = {"parent_court": "orphan"} await self.assertCountInResults(0) async def test_no_parent_court_filter(self): @@ -722,6 +775,94 @@ async def test_invalid_parent_court_filter(self): self.q["parent_court"] = "nonexistent" await self.assertCountInResults(0) + async def test_id_filter(self): + """Can we filter courts by id?""" + self.q["id"] = "child1" + response = await self.assertCountInResults(1) + self.assertEqual(response.data["results"][0]["id"], "child1") + + async def test_in_use_filter(self): + """Can we filter courts by in_use field?""" + self.q = {"in_use": "true"} + await self.assertCountInResults(3) # parent1, child2, orphan + self.q = {"in_use": "false"} + await self.assertCountInResults(1) # child1 + + async def test_has_opinion_scraper_filter(self): + """Can we filter courts by has_opinion_scraper field?""" + self.q = {"has_opinion_scraper": "true"} + await self.assertCountInResults(1) # parent1 + self.q = {"has_opinion_scraper": "false"} + await self.assertCountInResults(3) # child1, child2, orphan + + async def test_has_oral_argument_scraper_filter(self): + """Can we filter courts by has_oral_argument_scraper field?""" + self.q = {"has_oral_argument_scraper": "true"} + await self.assertCountInResults(1) # child1 + self.q = {"has_oral_argument_scraper": "false"} + await self.assertCountInResults(3) # parent1, child2, orphan + + async def test_position_filter(self): + """Can we filter courts by position with integer lookups?""" + self.q = {"position__gt": "2"} + await self.assertCountInResults(2) # child2 (3), orphan (4) + self.q = {"position__lte": "2"} + await self.assertCountInResults(2) # parent1 (1), child1 (2) + + async def test_start_date_filter(self): + """Can we filter courts by start_date with date lookups?""" + self.q = {"start_date__year": "2015"} + await self.assertCountInResults(1) # child2 (2015-05-20) + self.q = {"start_date__gte": "2010-01-01"} + await self.assertCountInResults(3) # child1, child2, orphan + + async def test_end_date_filter(self): + """Can we filter courts by end_date with date lookups?""" + self.q = {"end_date__day": "31"} + await self.assertCountInResults(1) # parent1, child2, orphan + self.q = {"end_date__year": "2024"} + await self.assertCountInResults(0) + + async def test_short_name_filter(self): + """Can we filter courts by short_name with text lookups?""" + self.q = {"short_name__iexact": "Cc1"} + await self.assertCountInResults(1) # child1 + self.q = {"short_name__icontains": "cc"} + await self.assertCountInResults(2) # child1, child2 + 
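+    # Aside, illustrative only: over HTTP the lookups above correspond to
+    # requests such as GET /api/rest/v4/courts/?short_name__icontains=cc,
+    # which should list child1 and child2 in "results".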
+ async def test_full_name_filter(self): + """Can we filter courts by full_name with text lookups?""" + self.q = {"full_name__istartswith": "Child"} + await self.assertCountInResults(2) # child1, child2 + self.q = {"full_name__iendswith": "Court"} + await self.assertCountInResults(2) # parent1, orphan + + async def test_citation_string_filter(self): + """Can we filter courts by citation_string with text lookups?""" + self.q = {"citation_string": "OC"} + await self.assertCountInResults(1) # orphan + self.q = {"citation_string__icontains": "2"} + await self.assertCountInResults(1) # child2 + + async def test_jurisdiction_filter(self): + """Can we filter courts by jurisdiction?""" + self.q = { + "jurisdiction": [ + Court.FEDERAL_APPELLATE, + Court.FEDERAL_DISTRICT, + ] + } + await self.assertCountInResults(2) # parent1 and orphan + + async def test_combined_filters(self): + """Can we filter courts with multiple filters applied?""" + self.q = { + "in_use": "true", + "has_opinion_scraper": "false", + "position__gt": "2", + } + await self.assertCountInResults(2) # child2 and orphan + class DRFJudgeApiFilterTests( SimpleUserDataMixin, TestCase, FilteringCountTestCase From fc3a2c727d42f702aa7fc3df860475bc6b6bb0e9 Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Wed, 27 Nov 2024 10:40:20 -0600 Subject: [PATCH 113/143] fix(elasticsearch): Enabled child highlighting for the related: query --- cl/lib/elasticsearch_utils.py | 2 +- cl/search/tests/tests_es_opinion.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/cl/lib/elasticsearch_utils.py b/cl/lib/elasticsearch_utils.py index f0a88ce0ea..f7dbb19708 100644 --- a/cl/lib/elasticsearch_utils.py +++ b/cl/lib/elasticsearch_utils.py @@ -1260,7 +1260,7 @@ def build_es_base_query( {"opinion": []}, [], mlt_query, - child_highlighting=False, + child_highlighting=True, api_version=api_version, ) ) diff --git a/cl/search/tests/tests_es_opinion.py b/cl/search/tests/tests_es_opinion.py index 4996f7d985..5266c76ff8 100644 --- a/cl/search/tests/tests_es_opinion.py +++ b/cl/search/tests/tests_es_opinion.py @@ -2358,6 +2358,9 @@ def test_more_like_this_opinion(self) -> None: < r.content.decode().index("/opinion/%i/" % expected_second_pk), msg="'Howard v. Honda' should come AFTER 'case name cluster 3'.", ) + # Confirm that results contain a snippet + self.assertIn("plain", r.content.decode()) + # Confirm "related to" cluster legend is within the results' header. h2_element = html.fromstring(r.content.decode()).xpath( '//h2[@id="result-count"]' From ad031f478155fc4a97e2ff3945f69a1588815c5f Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Wed, 27 Nov 2024 11:24:50 -0600 Subject: [PATCH 114/143] fix(elasticsearch): Fixed undefined error variable in do_es_search - Fixed do_collapse_count_query return value on errors --- cl/lib/elasticsearch_utils.py | 4 ++-- cl/search/tests/tests.py | 14 ++++++++++++++ cl/search/tests/tests_es_opinion.py | 17 +++++++++++++++++ cl/search/views.py | 3 +-- 4 files changed, 34 insertions(+), 4 deletions(-) diff --git a/cl/lib/elasticsearch_utils.py b/cl/lib/elasticsearch_utils.py index 2c1f1053c9..a494d32a2b 100644 --- a/cl/lib/elasticsearch_utils.py +++ b/cl/lib/elasticsearch_utils.py @@ -3084,7 +3084,7 @@ def build_cardinality_count(count_query: Search, unique_field: str) -> Search: def do_collapse_count_query( search_type: str, main_query: Search, query: Query -) -> int | None: +) -> int: """Execute an Elasticsearch count query for queries that uses collapse. 
Uses a query with aggregation to determine the number of unique opinions based on the 'cluster_id' or 'docket_id' according to the search_type. @@ -3109,7 +3109,7 @@ def do_collapse_count_query( f"Error on count query request: {search_query.to_dict()}" ) logger.warning(f"Error was: {e}") - total_results = None + total_results = 0 return total_results diff --git a/cl/search/tests/tests.py b/cl/search/tests/tests.py index 3dac875f01..57aa8b27d2 100644 --- a/cl/search/tests/tests.py +++ b/cl/search/tests/tests.py @@ -2,6 +2,7 @@ import io import os from datetime import date +from http import HTTPStatus from pathlib import Path from unittest import mock from urllib.parse import parse_qs @@ -1090,6 +1091,19 @@ def test_avoid_wrapping_boosted_numbers_in_quotes(self) -> None: ) self.assertNotIn("encountered an error", r.content.decode()) + def test_raise_forbidden_error_on_depth_pagination(self) -> None: + """Confirm that a 403 Forbidden error is raised on depth pagination.""" + search_params = { + "type": SEARCH_TYPES.OPINION, + "q": "Lorem", + "page": 101, + } + r = self.client.get( + reverse("show_results"), + search_params, + ) + self.assertEqual(r.status_code, HTTPStatus.FORBIDDEN) + class SearchAPIV4CommonTest(ESIndexTestCase, TestCase): """Common tests for the Search API V4 endpoints.""" diff --git a/cl/search/tests/tests_es_opinion.py b/cl/search/tests/tests_es_opinion.py index c7d9c2568d..b276cfb508 100644 --- a/cl/search/tests/tests_es_opinion.py +++ b/cl/search/tests/tests_es_opinion.py @@ -547,6 +547,23 @@ def test_o_results_api_pagination(self) -> None: for created_opinion in created_opinions: created_opinion.delete() + async def test_bad_syntax_error(self) -> None: + """Can we properly raise the ElasticServerError exception?""" + + # Bad syntax due to the / char in the query. + params = { + "type": SEARCH_TYPES.OPINION, + "q": "This query contains bad/syntax query", + } + r = await self.async_client.get( + reverse("search-list", kwargs={"version": "v3"}), params + ) + self.assertEqual(r.status_code, HTTPStatus.INTERNAL_SERVER_ERROR) + self.assertEqual( + r.data["detail"], + "Internal Server Error. 
Please try again later or review your query.", + ) + class OpinionV4APISearchTest( OpinionSearchAPICommonTests, diff --git a/cl/search/views.py b/cl/search/views.py index 10f3f4b7f9..e545b4a9c7 100644 --- a/cl/search/views.py +++ b/cl/search/views.py @@ -729,6 +729,7 @@ def do_es_search( query_citation = None facet_fields = [] missing_citations_str = [] + error = True search_form = SearchForm(get_params, is_es_form=True, courts=courts) match get_params.get("type", SEARCH_TYPES.OPINION): @@ -827,8 +828,6 @@ def do_es_search( cd if not error else {"type": cd["type"]}, search_form, ) - else: - error = True courts, court_count_human, court_count = merge_form_with_courts( courts, search_form From 06db105f24de5cc6d8316d3159f86aad7cfd549c Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Wed, 27 Nov 2024 14:28:52 -0600 Subject: [PATCH 115/143] feat(casenames): update tokenize_case_name() rename functions parameters update logger messages update docstrings --- .../commands/update_casenames_wl_dataset.py | 156 +++++++++++------- 1 file changed, 97 insertions(+), 59 deletions(-) diff --git a/cl/corpus_importer/management/commands/update_casenames_wl_dataset.py b/cl/corpus_importer/management/commands/update_casenames_wl_dataset.py index 6c1d4def01..b936bdafaa 100644 --- a/cl/corpus_importer/management/commands/update_casenames_wl_dataset.py +++ b/cl/corpus_importer/management/commands/update_casenames_wl_dataset.py @@ -10,9 +10,9 @@ from eyecite import get_citations from eyecite.models import FullCaseCitation from eyecite.tokenizers import HyperscanTokenizer +from juriscraper.lib.string_utils import harmonize from cl.citations.utils import map_reporter_db_cite_type -from cl.corpus_importer.utils import add_citations_to_cluster from cl.search.models import Citation, OpinionCluster logger = logging.getLogger(__name__) @@ -59,29 +59,29 @@ def tokenize_case_name(case_name: str) -> set[str]: :param case_name: case name to tokenize :return: list of words """ - return ( - set( - [ - word.lower() - for word in WORD_PATTERN.findall(case_name) - if len(word) > 1 - ] - ) - - FALSE_POSITIVES - ) + words = [] + for word in WORD_PATTERN.findall(case_name): + if len(word) > 1: + # Only keep words with more than one character + words.append(word) + # Return only valid words + return set(words) - FALSE_POSITIVES -def check_case_names_match(csv_case_name: str, cl_case_name: str) -> bool: + +def check_case_names_match(west_case_name: str, cl_case_name: str) -> bool: """Compare two case name and decide whether they are the same or not - :param csv_case_name: case name from csv + Tokenize each string, capturing both words and abbreviations with periods and + convert all words to lowercase for case-insensitive matching and check if there is + an overlap between case names + + :param west_case_name: case name from csv :param cl_case_name: case name from cluster :return: True if they match else False """ - # Tokenize each string, capturing both words and abbreviations with periods and - # convert all words to lowercase for case-insensitive matching and check if there - # is an overlap between case names - overlap = tokenize_case_name(csv_case_name) & tokenize_case_name( + + overlap = tokenize_case_name(west_case_name) & tokenize_case_name( cl_case_name ) @@ -90,14 +90,14 @@ def check_case_names_match(csv_case_name: str, cl_case_name: str) -> bool: return False # Check for "v." in title - if "v." not in csv_case_name.lower(): + if "v." not in west_case_name.lower(): # in the matter of Smith # if no V. 
- likely an "in re" case and only match on at least 1 name return True # otherwise check if a match occurs on both sides of the V - v_index = csv_case_name.lower().index("v.") - hit_indices = [csv_case_name.lower().find(hit) for hit in overlap] + v_index = west_case_name.lower().index("v.") + hit_indices = [west_case_name.lower().find(hit) for hit in overlap] return min(hit_indices) < v_index < max(hit_indices) @@ -122,7 +122,7 @@ def parse_date(date_str: str) -> date | None: return datetime.strptime(date_str, fmt).date() except (ValueError, TypeError): continue - logger.warning(f"Invalid date format: {date_str}") + logger.warning("Invalid date format: %s", date_str) return None @@ -136,6 +136,8 @@ def parse_citations(citation_strings: list[str]) -> list[dict]: for cite_str in citation_strings: # Get citations from the string + + # We find all the citations that could match a cluster to update the case name found_cites = get_citations(cite_str, tokenizer=HYPERSCAN_TOKENIZER) if not found_cites: continue @@ -194,12 +196,12 @@ def query_possible_matches( def update_matched_case_name( - matched_cluster: OpinionCluster, csv_case_name: str + matched_cluster: OpinionCluster, west_case_name: str ) -> tuple[bool, bool]: """Update case name of matched cluster and related docket if empty any of them :param matched_cluster: OpinionCluster object - :param csv_case_name: case name from csv row + :param west_case_name: case name from csv row :return: tuple with boolean values if cluster and related docket case name updated """ cluster_case_name_updated = False @@ -207,17 +209,17 @@ def update_matched_case_name( if not matched_cluster.case_name: # Save case name in cluster when we don't have it - matched_cluster.case_name = csv_case_name + matched_cluster.case_name = harmonize(west_case_name) matched_cluster.save() - logger.info(f"Case name updated for cluster id: {matched_cluster.id}") + logger.info("Case name updated for cluster id: %s", matched_cluster.id) cluster_case_name_updated = True if not matched_cluster.docket.case_name: # Save case name in docket when we don't have it - matched_cluster.docket.case_name = csv_case_name + matched_cluster.docket.case_name = harmonize(west_case_name) matched_cluster.docket.save() logger.info( - f"Case name updated for docket id: {matched_cluster.docket.id}" + "Case name updated for docket id: %s", matched_cluster.docket.id ) docket_case_name_updated = True @@ -227,6 +229,9 @@ def update_matched_case_name( def combine_initials(case_name: str) -> str: """Combine initials in case captions + This function identifies initials (e.g., "J. D. E.") in a case name and combines + them into a compressed format without spaces or periods (e.g., "JDE"). 
+ :param case_name: the case caption :return: the cleaned case caption """ @@ -252,8 +257,9 @@ def process_csv(filepath: str, delay: float, dry_run: bool) -> None: total_clusters_updated = 0 total_dockets_updated = 0 + total_citations_added = 0 - logger.info(f"Processing {filepath}") + logger.info("Processing %s", filepath) df = pd.read_csv(filepath).dropna() for row in df.itertuples(): ( @@ -269,20 +275,21 @@ def process_csv(filepath: str, delay: float, dry_run: bool) -> None: clean_docket_num = docket.strip('="').strip('"') if not clean_docket_num: - logger.info(f"Row index: {index} - No docket number found.") + logger.info("Row index: %s - No docket number found.", index) continue date_filed = parse_date(date_str) if not date_filed: logger.info( - f"Row index: {index} - No valid date found: {date_str}" + "Row index: %s - No valid date found: %s", index, date_str ) continue - valid_citations = parse_citations([cite1, cite2]) + west_citations: list[str] = [cite1, cite2] + valid_citations = parse_citations(west_citations) if not valid_citations: - logger.info(f"Row index: {index} - Missing two valid citations.") + logger.info("Row index: %s - Missing valid citations.", index) continue # Query for possible matches using data from row @@ -293,7 +300,7 @@ def process_csv(filepath: str, delay: float, dry_run: bool) -> None: ) if not possible_matches: - logger.info(f"Row index: {index} - No matches found.") + logger.info("Row index: %s - No possible matches found.", index) continue matches = [] @@ -311,16 +318,31 @@ def process_csv(filepath: str, delay: float, dry_run: bool) -> None: west_case_name, cl_case_name ) if case_name_match: - matches.append(match) + matches.append(match.cluster) - if len(matches) != 1: + if len(matches) == 0: + # No match found within possible matches, go to next row + logger.info( + "Row index: %s - No match found within possible matches.", + index, + ) + continue + elif len(matches) > 1: + # More than one match, log and go to next row + matches_found = ", ".join([str(cluster.id) for cluster in matches]) logger.warning( - f"Row index: {index} - Failed, Matches found: {len(matches)} - Matches: {[cluster.id for cluster in matches]}" + "Row index: %s - Multiple matches found: %s", + index, + matches_found, ) continue + # Single match found logger.info( - f"Row index: {index} - Match found: {matches[0].cluster_id} - Csv case name: {west_case_name}" + "Row index: %s - Match found: %s - West case name: %s", + index, + matches[0].id, + west_case_name, ) if dry_run: @@ -328,7 +350,7 @@ def process_csv(filepath: str, delay: float, dry_run: bool) -> None: continue with transaction.atomic(): - matched_cluster = matches[0].cluster + matched_cluster = matches[0] # Update case names cluster_updated, docket_updated = update_matched_case_name( @@ -342,35 +364,51 @@ def process_csv(filepath: str, delay: float, dry_run: bool) -> None: total_dockets_updated = +1 # Add any of the citations if possible + citation_to_add = None + for citation in valid_citations: - if Citation.objects.filter( + + new_cite_str = f"{citation.get('volume')} {citation.get('reporter')} {citation.get('page')}" + + cites = Citation.objects.filter( cluster_id=matched_cluster.id, reporter=citation.get("reporter"), - ).exists(): - # Avoid adding a citation if we already have a citation from the - # citation's reporter. - logger.info( - f"Can't add: {citation.get('volume')} {citation.get('reporter')} {citation.get('page')} to cluster id: {matched_cluster.id}. There is already " - f"a citation from that reporter." 
+ ) + + if cites.exists(): + if cites[0].__str__() == new_cite_str: + # We already have that citation + continue + # Same reporter, different citation, revert changes + logger.warning( + "Row index: %s - Revert changes for cluster id: %s", + index, + matched_cluster.id, ) - continue - citation["cluster_id"] = matched_cluster.id - Citation.objects.get_or_create(**citation) - - add_citations_to_cluster( - [ - f"{cite.get('volume')} {cite.get('reporter')} {cite.get('page')}" - for cite in valid_citations - ], - matches[0].cluster_id, - ) + transaction.set_rollback(True) + citation_to_add = None + break + + # We used one from the row to find the match, we only need to add the other citation + citation_to_add = citation + + if citation_to_add: + # Add the cluster id and create the new citation + citation_to_add["cluster_id"] = matched_cluster.id + new_citation = Citation.objects.create(**citation_to_add) + logger.info( + "New citation added: %s to cluster id: %s", + new_citation, + matched_cluster.id, + ) + total_citations_added += 1 # Wait between each processed row to avoid sending to many indexing tasks time.sleep(delay) - if not dry_run: - logger.info(f"Clusters updated: {total_clusters_updated}") - logger.info(f"Dockets updated: {total_dockets_updated}") + logger.info("Clusters updated: %s", total_clusters_updated) + logger.info("Dockets updated: %s", total_dockets_updated) + logger.info("Citations added: %s", total_citations_added) class Command(BaseCommand): From d8c994e8613a16c4c662c36dbdce7cf1c3299e3d Mon Sep 17 00:00:00 2001 From: grossir <14970769+grossir@users.noreply.github.com> Date: Wed, 27 Nov 2024 20:48:16 +0000 Subject: [PATCH 116/143] Update freelawproject dependencies --- poetry.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/poetry.lock b/poetry.lock index 68f09e1cfb..9e83e42f9c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2320,13 +2320,13 @@ setuptools = "*" [[package]] name = "juriscraper" -version = "2.6.43" +version = "2.6.44" description = "An API to scrape American court websites for metadata." 
optional = false python-versions = "*" files = [ - {file = "juriscraper-2.6.43-py27-none-any.whl", hash = "sha256:c2765e5f0a6563fe4842bf72b13aec2b6feb873dc2350523ff6b5102bdf1f757"}, - {file = "juriscraper-2.6.43.tar.gz", hash = "sha256:99029ab83cbe99673e4598c8e9b30df9e3d21ef98bd78baef9907ab53ad96e10"}, + {file = "juriscraper-2.6.44-py27-none-any.whl", hash = "sha256:29278f6429c25b171d3aebd341d795f7aa611669a8ff26d694943776499cadac"}, + {file = "juriscraper-2.6.44.tar.gz", hash = "sha256:cded9d566ffafb97cf6af8a1d5933aa0db12be2e1c0e0f412b0bd3d4f9896a8b"}, ] [package.dependencies] From a22a3c8699f77639b494d13153dfca9e2f56d8bb Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Wed, 27 Nov 2024 17:22:28 -0600 Subject: [PATCH 117/143] feat(casenames): update tokenize_case_name() update process to add new citation add test for check_case_names_match() update query_possible_matches() --- .../commands/update_casenames_wl_dataset.py | 57 ++++++++----------- cl/corpus_importer/tests.py | 49 ++++++++++++++++ 2 files changed, 74 insertions(+), 32 deletions(-) diff --git a/cl/corpus_importer/management/commands/update_casenames_wl_dataset.py b/cl/corpus_importer/management/commands/update_casenames_wl_dataset.py index b936bdafaa..11d56d24c6 100644 --- a/cl/corpus_importer/management/commands/update_casenames_wl_dataset.py +++ b/cl/corpus_importer/management/commands/update_casenames_wl_dataset.py @@ -63,7 +63,7 @@ def tokenize_case_name(case_name: str) -> set[str]: for word in WORD_PATTERN.findall(case_name): if len(word) > 1: # Only keep words with more than one character - words.append(word) + words.append(word.lower()) # Return only valid words return set(words) - FALSE_POSITIVES @@ -173,6 +173,9 @@ def query_possible_matches( ) -> QuerySet[Citation]: """Find matches for row data + It will remove duplicates, it could happen if we already have both citations, if we + have multiple matches, these must be unique + :param valid_citations: list of FullCaseCitation objects :param docket_number: cleaned docket number from row :param date_filed: formatted filed date from row @@ -188,9 +191,11 @@ def query_possible_matches( cluster__date_filed=date_filed, ) citation_queries |= citation_query - possible_matches = Citation.objects.filter( - citation_queries - ).select_related("cluster") + possible_matches = ( + Citation.objects.filter(citation_queries) + .select_related("cluster") + .distinct("cluster__id") + ) return possible_matches @@ -364,44 +369,32 @@ def process_csv(filepath: str, delay: float, dry_run: bool) -> None: total_dockets_updated = +1 # Add any of the citations if possible - citation_to_add = None - for citation in valid_citations: - new_cite_str = f"{citation.get('volume')} {citation.get('reporter')} {citation.get('page')}" - - cites = Citation.objects.filter( - cluster_id=matched_cluster.id, + citation["cluster_id"] = matched_cluster.id + if Citation.objects.filter(**citation).exists(): + # We already have the citation + continue + elif Citation.objects.filter( + cluster_id=citation["cluster_id"], reporter=citation.get("reporter"), - ) - - if cites.exists(): - if cites[0].__str__() == new_cite_str: - # We already have that citation - continue - # Same reporter, different citation, revert changes + ).exists(): + # # Same reporter, different citation, revert changes logger.warning( "Row index: %s - Revert changes for cluster id: %s", index, matched_cluster.id, ) transaction.set_rollback(True) - citation_to_add = None break - - # We used one from the row to find the match, we only need to add the 
other citation - citation_to_add = citation - - if citation_to_add: - # Add the cluster id and create the new citation - citation_to_add["cluster_id"] = matched_cluster.id - new_citation = Citation.objects.create(**citation_to_add) - logger.info( - "New citation added: %s to cluster id: %s", - new_citation, - matched_cluster.id, - ) - total_citations_added += 1 + else: + new_citation = Citation.objects.create(**citation) + logger.info( + "New citation added: %s to cluster id: %s", + new_citation, + matched_cluster.id, + ) + total_citations_added += 1 # Wait between each processed row to avoid sending to many indexing tasks time.sleep(delay) diff --git a/cl/corpus_importer/tests.py b/cl/corpus_importer/tests.py index 7a76435ded..1e7d9de3da 100644 --- a/cl/corpus_importer/tests.py +++ b/cl/corpus_importer/tests.py @@ -62,6 +62,9 @@ log_added_items_to_redis, merge_rss_data, ) +from cl.corpus_importer.management.commands.update_casenames_wl_dataset import ( + check_case_names_match, +) from cl.corpus_importer.signals import ( handle_update_latest_case_id_and_schedule_iquery_sweep, update_latest_case_id_and_schedule_iquery_sweep, @@ -4078,3 +4081,49 @@ def test_probe_iquery_pages_daemon_court_got_stuck( f"iquery:court_empty_probe_attempts:{self.court_cacd.pk}" ) self.assertEqual(int(court_empty_attempts), 0) + + +class CaseNamesTest(SimpleTestCase): + def test_check_case_names_match(self) -> None: + """Can we check if the case names match?""" + case_names_tests = ( + ( + "US v. Guerrero-Martinez", # 736793 + "United States v. Hector Guerrero-Martinez, AKA Hector Guerrero AKA Hector Martinez-Guerrero", + False, + ), + ( + "In re CP", # 2140442 + "In Re CP", + True, + ), + ( + "Dennis v. City of Easton", # 730246 + "Richard Dennis, Penelope Dennis, Loretta M. Dennis v. City of Easton, Edward J. Ferraro, Robet S. Stein, Doris Asteak, Paul Schleuter, Howard B. White, Easton Board of Health", + True, + ), + ( + "Parmelee v. Bruggeman", # 736598 + "Allan Parmelee v. Milford Bruggeman Janine Bruggeman Friend of the Court for the State of Michigan Nancy Rose, Employee of the State of Michigan for the Friend of the Court Glenda Friday, Employee of the State of Michigan for the Friend of the Court Karen Dunn, Employee of the State of Michigan for the Friend of the Court Thomas Kreckman, Employee of the State of Michigan for the Friend of the Court State of Michigan", + True, + ), + ( + "Automobile Assur. Financial Corp. v. Syrett Corp.", # 735935 + "Automobile Assurance Financial Corporation, a Utah Corporation Venuti and Associates, Inc., a Utah Corporation Venuti Partners, Ltd., a Utah Limited Partnership Frank P. Venuti, an Individual, Parker M. Nielson v. Syrett Corporation, a Delaware Corporation, Formerly a Utah Corporation, John R. Riley, an Individual, Third-Party-Defendant", + True, + ), + ( + "Christopher Ambroze, M.D., PC v. Aetna Health Plans of New York, Inc.", # 735476 + "Christopher Ambroze, M.D., P.C., Rockville Anesthesia Group, Llp, Harvey Finkelstein, Plainview Anesthesiologists, P.C., Joseph A. Singer, Atlantic Anesthesia Associates, P.C. v. Aetna Health Plans of New York, Inc., Aetna Health Management, Inc., Aetna Life and Casualty Company, C. Frederick Berger, and Gregg Stolzberg", + True, + ), + ( + "O'Neal v. Merkel", # 730350 + "Terence Kenneth O'Neal v. T.E. Merkel Nurse Cashwell Nurse Allen Nurse Davis Mr. Conn, and Franklin E. Freeman, Jr. Gary Dixon Doctor Lowy Doctor Shaw Doctor Castalloe Harry Allsbrook Mr. 
Cherry", + True, + ), + ) + for wl_casename, cl_casename, overlap in case_names_tests: + self.assertEqual( + check_case_names_match(wl_casename, cl_casename), overlap + ) From 110b85ce93582905cf2b164680bb985f32727c23 Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Thu, 28 Nov 2024 06:59:01 -0600 Subject: [PATCH 118/143] feat(casenames): update tokenize_case_name() update test for tokenize_case_name() --- .../commands/update_casenames_wl_dataset.py | 5 +++-- cl/corpus_importer/tests.py | 11 +++++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/cl/corpus_importer/management/commands/update_casenames_wl_dataset.py b/cl/corpus_importer/management/commands/update_casenames_wl_dataset.py index 11d56d24c6..acce5f19e2 100644 --- a/cl/corpus_importer/management/commands/update_casenames_wl_dataset.py +++ b/cl/corpus_importer/management/commands/update_casenames_wl_dataset.py @@ -54,13 +54,14 @@ def tokenize_case_name(case_name: str) -> set[str]: """Tokenizes case name and removes single-character words except for letters with periods. - Also removes false positive words + It uses harmonize() from juriscraper to make case names cleaner + Also removes false positive words, e.g. (U.S -> United States) :param case_name: case name to tokenize :return: list of words """ words = [] - for word in WORD_PATTERN.findall(case_name): + for word in WORD_PATTERN.findall(harmonize(case_name)): if len(word) > 1: # Only keep words with more than one character words.append(word.lower()) diff --git a/cl/corpus_importer/tests.py b/cl/corpus_importer/tests.py index 1e7d9de3da..00d9535a87 100644 --- a/cl/corpus_importer/tests.py +++ b/cl/corpus_importer/tests.py @@ -4087,10 +4087,15 @@ class CaseNamesTest(SimpleTestCase): def test_check_case_names_match(self) -> None: """Can we check if the case names match?""" case_names_tests = ( + ( + "U.S. v. Smith", + "United States v. Smith", + True, + ), ( "US v. Guerrero-Martinez", # 736793 "United States v. Hector Guerrero-Martinez, AKA Hector Guerrero AKA Hector Martinez-Guerrero", - False, + True, ), ( "In re CP", # 2140442 @@ -4125,5 +4130,7 @@ def test_check_case_names_match(self) -> None: ) for wl_casename, cl_casename, overlap in case_names_tests: self.assertEqual( - check_case_names_match(wl_casename, cl_casename), overlap + check_case_names_match(wl_casename, cl_casename), + overlap, + msg="Case names don't match", ) From 62bdf183658933aba4c70b8c3e8c4e5fe5a8d2e7 Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Thu, 28 Nov 2024 11:39:58 -0600 Subject: [PATCH 119/143] fix(elasticsearch): Improved comment in build_more_like_this_query --- cl/lib/elasticsearch_utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cl/lib/elasticsearch_utils.py b/cl/lib/elasticsearch_utils.py index f7dbb19708..3d0b2c7fa2 100644 --- a/cl/lib/elasticsearch_utils.py +++ b/cl/lib/elasticsearch_utils.py @@ -196,7 +196,10 @@ async def build_more_like_this_query(related_ids: list[str]) -> Query: for pair in opinion_cluster_pairs ] or [ {"_id": f"o_{pk}"} for pk in related_ids - ] # Fall back in case IDs are not found in DB. + ] # Fallback in case IDs are not found in the database. + # The user might have provided non-existent Opinion IDs. + # This ensures that the query does not raise an error and instead returns + # no results. 
more_like_this_fields = SEARCH_MLT_OPINION_QUERY_FIELDS.copy() mlt_query = Q( From dd85888d3d8969e009dd44e38a86aa54884d815c Mon Sep 17 00:00:00 2001 From: William Palin Date: Fri, 29 Nov 2024 09:59:10 -0500 Subject: [PATCH 120/143] feat(update_casenames): Tweak name comparison feature --- .../commands/update_casenames_wl_dataset.py | 34 +++++++------------ 1 file changed, 12 insertions(+), 22 deletions(-) diff --git a/cl/corpus_importer/management/commands/update_casenames_wl_dataset.py b/cl/corpus_importer/management/commands/update_casenames_wl_dataset.py index acce5f19e2..3e1776a335 100644 --- a/cl/corpus_importer/management/commands/update_casenames_wl_dataset.py +++ b/cl/corpus_importer/management/commands/update_casenames_wl_dataset.py @@ -54,14 +54,11 @@ def tokenize_case_name(case_name: str) -> set[str]: """Tokenizes case name and removes single-character words except for letters with periods. - It uses harmonize() from juriscraper to make case names cleaner - Also removes false positive words, e.g. (U.S -> United States) - :param case_name: case name to tokenize :return: list of words """ words = [] - for word in WORD_PATTERN.findall(harmonize(case_name)): + for word in WORD_PATTERN.findall(case_name): if len(word) > 1: # Only keep words with more than one character words.append(word.lower()) @@ -82,21 +79,23 @@ def check_case_names_match(west_case_name: str, cl_case_name: str) -> bool: :return: True if they match else False """ - overlap = tokenize_case_name(west_case_name) & tokenize_case_name( - cl_case_name - ) + west_set = tokenize_case_name(west_case_name) + cl_set = tokenize_case_name(cl_case_name) + overlap = west_set & cl_set if not overlap: # if no hits no match on name - move along return False # Check for "v." in title - if "v." not in west_case_name.lower(): + if "v." not in west_case_name.lower() or ( + len(cl_set) == 1 or len(west_set) == 1 + ): # in the matter of Smith # if no V. 
- likely an "in re" case and only match on at least 1 name return True - # otherwise check if a match occurs on both sides of the V + # otherwise check if a match occurs on both sides of the `v.` v_index = west_case_name.lower().index("v.") hit_indices = [west_case_name.lower().find(hit) for hit in overlap] @@ -215,14 +214,14 @@ def update_matched_case_name( if not matched_cluster.case_name: # Save case name in cluster when we don't have it - matched_cluster.case_name = harmonize(west_case_name) + matched_cluster.case_name = west_case_name matched_cluster.save() logger.info("Case name updated for cluster id: %s", matched_cluster.id) cluster_case_name_updated = True if not matched_cluster.docket.case_name: # Save case name in docket when we don't have it - matched_cluster.docket.case_name = harmonize(west_case_name) + matched_cluster.docket.case_name = west_case_name matched_cluster.docket.save() logger.info( "Case name updated for docket id: %s", matched_cluster.docket.id @@ -268,17 +267,8 @@ def process_csv(filepath: str, delay: float, dry_run: bool) -> None: logger.info("Processing %s", filepath) df = pd.read_csv(filepath).dropna() for row in df.itertuples(): - ( - index, - west_case_name, - court, - date_str, - cite1, - cite2, - docket, - volume, - ) = row - + index, case_name, court, date_str, cite1, cite2, docket, _ = row + west_case_name = harmonize(case_name) clean_docket_num = docket.strip('="').strip('"') if not clean_docket_num: logger.info("Row index: %s - No docket number found.", index) From a0920a202fbc2884b6f30cdfe2ac2376f8111635 Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Fri, 29 Nov 2024 10:11:05 -0600 Subject: [PATCH 121/143] feat(casenames): update tokenize_case_name() remove combine initials --- .../commands/update_casenames_wl_dataset.py | 29 ++----------------- cl/corpus_importer/tests.py | 4 +-- 2 files changed, 4 insertions(+), 29 deletions(-) diff --git a/cl/corpus_importer/management/commands/update_casenames_wl_dataset.py b/cl/corpus_importer/management/commands/update_casenames_wl_dataset.py index 3e1776a335..c18f815043 100644 --- a/cl/corpus_importer/management/commands/update_casenames_wl_dataset.py +++ b/cl/corpus_importer/management/commands/update_casenames_wl_dataset.py @@ -79,8 +79,8 @@ def check_case_names_match(west_case_name: str, cl_case_name: str) -> bool: :return: True if they match else False """ - west_set = tokenize_case_name(west_case_name) - cl_set = tokenize_case_name(cl_case_name) + west_set = tokenize_case_name(west_case_name.lower()) + cl_set = tokenize_case_name(cl_case_name.lower()) overlap = west_set & cl_set if not overlap: @@ -98,7 +98,6 @@ def check_case_names_match(west_case_name: str, cl_case_name: str) -> bool: # otherwise check if a match occurs on both sides of the `v.` v_index = west_case_name.lower().index("v.") hit_indices = [west_case_name.lower().find(hit) for hit in overlap] - return min(hit_indices) < v_index < max(hit_indices) @@ -231,27 +230,6 @@ def update_matched_case_name( return cluster_case_name_updated, docket_case_name_updated -def combine_initials(case_name: str) -> str: - """Combine initials in case captions - - This function identifies initials (e.g., "J. D. E.") in a case name and combines - them into a compressed format without spaces or periods (e.g., "JDE"). 
- - :param case_name: the case caption - :return: the cleaned case caption - """ - - initials_pattern = re.compile(r"(\b[A-Z]\.?\s?){2,}(\s|$)") - - matches = initials_pattern.finditer(case_name) - if matches: - for match in matches: - initials = match.group() - compressed_initials = re.sub(r"(?!\s$)[\s\.]", "", initials) - case_name = case_name.replace(initials, compressed_initials) - return case_name - - def process_csv(filepath: str, delay: float, dry_run: bool) -> None: """Process rows from csv file @@ -307,9 +285,6 @@ def process_csv(filepath: str, delay: float, dry_run: bool) -> None: else match.cluster.case_name ) - west_case_name = combine_initials(west_case_name) - cl_case_name = combine_initials(cl_case_name) - case_name_match = check_case_names_match( west_case_name, cl_case_name ) diff --git a/cl/corpus_importer/tests.py b/cl/corpus_importer/tests.py index 00d9535a87..37cc03d24d 100644 --- a/cl/corpus_importer/tests.py +++ b/cl/corpus_importer/tests.py @@ -4093,7 +4093,7 @@ def test_check_case_names_match(self) -> None: True, ), ( - "US v. Guerrero-Martinez", # 736793 + "United States v. Guerrero-Martinez", # 736793 "United States v. Hector Guerrero-Martinez, AKA Hector Guerrero AKA Hector Martinez-Guerrero", True, ), @@ -4132,5 +4132,5 @@ def test_check_case_names_match(self) -> None: self.assertEqual( check_case_names_match(wl_casename, cl_casename), overlap, - msg="Case names don't match", + msg=f"Case names don't match: {wl_casename} - {cl_casename}", ) From 01c1fa4e98b9f6dd1042e25b74153e421bdcfe33 Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Fri, 29 Nov 2024 11:30:45 -0600 Subject: [PATCH 122/143] feat(update_casenames): add start_row and limit param to command --- .../commands/update_casenames_wl_dataset.py | 41 +++++++++++++++++-- 1 file changed, 38 insertions(+), 3 deletions(-) diff --git a/cl/corpus_importer/management/commands/update_casenames_wl_dataset.py b/cl/corpus_importer/management/commands/update_casenames_wl_dataset.py index c18f815043..c98d619b93 100644 --- a/cl/corpus_importer/management/commands/update_casenames_wl_dataset.py +++ b/cl/corpus_importer/management/commands/update_casenames_wl_dataset.py @@ -230,12 +230,20 @@ def update_matched_case_name( return cluster_case_name_updated, docket_case_name_updated -def process_csv(filepath: str, delay: float, dry_run: bool) -> None: +def process_csv( + filepath: str, + delay: float, + dry_run: bool, + limit: int | None, + start_row: int, +) -> None: """Process rows from csv file :param filepath: path to csv file :param delay: delay between saves in seconds :param dry_run: flag to simulate update process + :param limit: limit number of rows to process + :param start_row: start row """ total_clusters_updated = 0 @@ -243,7 +251,19 @@ def process_csv(filepath: str, delay: float, dry_run: bool) -> None: total_citations_added = 0 logger.info("Processing %s", filepath) - df = pd.read_csv(filepath).dropna() + + # Generate rows to skip, excluding the header row + skip_rows = list(range(1, start_row)) if start_row else None + + df = pd.read_csv(filepath, skiprows=skip_rows, nrows=limit).dropna() + + # Reset the index to start from 0 (needed if we pass skip_rows param) + df.reset_index(drop=True, inplace=True) + + if start_row: + # Update rows index to reflect the original csv row numbers + df.index = range(start_row, start_row + len(df)) + for row in df.itertuples(): index, case_name, court, date_str, cite1, cite2, docket, _ = row west_case_name = harmonize(case_name) @@ -391,15 +411,30 @@ def add_arguments(self, 
parser): action="store_true", help="Simulate the update process without making changes", ) + parser.add_argument( + "--start-row", + default=0, + type=int, + help="Start row (inclusive).", + ) + parser.add_argument( + "--limit", + default=None, + type=int, + help="Limit number of rows to process.", + required=False, + ) def handle(self, *args, **options): filepath = options["filepath"] delay = options["delay"] dry_run = options["dry_run"] + limit = options["limit"] + start_row = options["start_row"] if not filepath: raise CommandError( "Filepath is required. Use --filepath to specify the CSV file location." ) - process_csv(filepath, delay, dry_run) + process_csv(filepath, delay, dry_run, limit, start_row) From c15f20d8131a933a66cd2b94b313d1094e70e325 Mon Sep 17 00:00:00 2001 From: mlissner Date: Fri, 29 Nov 2024 10:39:56 -0800 Subject: [PATCH 123/143] feat(ci): Add iquery rollout task --- .github/workflows/docker-build.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index ff00cf891a..37fbcf30d2 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -131,6 +131,11 @@ jobs: - name: Watch cl-es-sweep-indexer rollout status run: kubectl rollout status -n ${{ env.EKS_NAMESPACE }} deployment/cl-es-sweep-indexer + - name: Rollout cl-iquery-probe + run: kubectl set image -n ${{ env.EKS_NAMESPACE }} deployment/cl-iquery-probe cl-iquery-probe=freelawproject/courtlistener:${{ steps.vars.outputs.sha_short }}-prod + - name: Watch cl-iquery-probe rollout status + run: kubectl rollout status -n ${{ env.EKS_NAMESPACE }} deployment/cl-iquery-probe + # Watch "cronjobs" in k9s - name: Update cronjobs From 84cdae99fd3ef375547e859a87a00f3fe8dd0ea8 Mon Sep 17 00:00:00 2001 From: mlissner Date: Fri, 29 Nov 2024 10:41:21 -0800 Subject: [PATCH 124/143] feat(settings): Throttle down users --- cl/settings/third_party/rest_framework.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cl/settings/third_party/rest_framework.py b/cl/settings/third_party/rest_framework.py index 9e4365b4ce..2c0f04163e 100644 --- a/cl/settings/third_party/rest_framework.py +++ b/cl/settings/third_party/rest_framework.py @@ -29,8 +29,12 @@ "OVERRIDE_THROTTLE_RATES": { # Throttling down. # Unresponsive + "projecttesting": "1/hour", "SAGW": "1/hour", # Bounced + "riwiko8259": "1/hour", + "xicaro7027": "1/hour", + "nayibij851": "1/hour", "testname2024": "1/hour", "cadebe2258": "1/hour", # Disposable email From 8c2cd99ad2dc9d816c9f5348a6ef2006af9d400a Mon Sep 17 00:00:00 2001 From: mlissner Date: Fri, 29 Nov 2024 10:42:03 -0800 Subject: [PATCH 125/143] feat(html): Typos --- cl/api/templates/migration-guide.html | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/cl/api/templates/migration-guide.html b/cl/api/templates/migration-guide.html index 8b9d421529..4d1dbf7822 100644 --- a/cl/api/templates/migration-guide.html +++ b/cl/api/templates/migration-guide.html @@ -31,13 +31,13 @@
V4 API Migration Guide

- After several years of planning and development we have released v4 of our APIs. + After several years of planning and development, we have released v4 of our APIs.

This upgrade responds to feedback we have received over the years and should be much better for our users — faster, more featureful, more scalable, and more accurate.

- Unfortunately, we couldn't make these new APIs completely backwards compatible so this guide explains what's new. + Unfortunately, we couldn't make these new APIs completely backwards compatible, so this guide explains what's new.

Support

@@ -71,10 +71,10 @@

Timeline for Changes

What If I Do Nothing?

- You might be fine. Most of the database and search APIs are only changing slightly and v3 will be supported for some period of time. + You might be fine. Most of the database and search APIs are only changing slightly, and v3 will be supported for some period of time. But you should read this guide to see if any changes are needed to your application.

-

The remainder of this guide is in three section:

+

The remainder of this guide is in three sections:

  • New features you can expect
  • How to migrate database APIs
  • @@ -90,7 +90,7 @@

    Cursor-based pagination

    ElasticSearch

    v4 of the Search API is powered by ElasticSearch instead of Solr. This is a huge upgrade to our API and search engine.

    -

    Some of the improvements include:

    +

    Some improvements include:

    • In v4, all PACER cases are now searchable. In v3 you only got results if a case had a docket entry.
    • @@ -107,12 +107,12 @@

      ElasticSearch

    • Camelcase words like "McDonalds" are more searchable.
    • Highlighting is more consistent and can be disabled for better performance.
    -
  • Emojis and unicode characters are now searchable.
  • +
  • Emojis and Unicode characters are now searchable.
  • Docket number and other fielded searches are more robust.
  • Timezone handling is more consistent.
  • We've added a number of new searchable fields.
-

For more details please see our blog.

+

For more details, please see our blog.

Breaking Changes to v3 of the Search API

We cannot continue running Solr forever, but we can do our best to support v3 of the API. To do this, on November 25, 2024, v3 of the Search API will be upgraded to use ElasticSearch. We expect this to support most uses, but it will cause some breaking changes, as outlined in this section. From 16bd4a76dcd8211ac3d874aa6d2ab057b3b4c9ad Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Fri, 29 Nov 2024 15:48:04 -0600 Subject: [PATCH 126/143] fix(sweep_indexer): Added poll interval setting and wait between chunks --- cl/search/management/commands/sweep_indexer.py | 13 +++++++++++-- cl/settings/third_party/elasticsearch.py | 3 +++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/cl/search/management/commands/sweep_indexer.py b/cl/search/management/commands/sweep_indexer.py index 4cc7b0bc4f..c3607ca24c 100644 --- a/cl/search/management/commands/sweep_indexer.py +++ b/cl/search/management/commands/sweep_indexer.py @@ -359,7 +359,7 @@ def process_queryset( processed_count = 0 accumulated_chunk = 0 throttle = CeleryThrottle( - poll_interval=10, + poll_interval=settings.ELASTICSEARCH_SWEEP_INDEXER_POLL_INTERVAL, min_items=self.chunk_size, queue_name=self.queue, ) @@ -405,8 +405,17 @@ def process_queryset( ).set(queue=self.queue).apply_async() accumulated_chunk += len(chunk) + if not testing_mode: + # Wait for 1/ELASTICSEARCH_SWEEP_INDEXER_POLL_INTERVAL + # before processing the next chunk. + # e.g: With a poll interval of 10 and a chunk size of 10, + # it will wait for 0.1 seconds for every 10 documents processed, + # maintaining an index rate of 100 documents per second. + time.sleep( + 1 / settings.ELASTICSEARCH_SWEEP_INDEXER_POLL_INTERVAL + ) self.stdout.write( - "\rProcessed {}/{}, ({:.0%}), last {} PK indexed: {},".format( + "\rProcessed {}/{}, ({:.0%}), last {} ID indexed: {},".format( processed_count, count, processed_count * 1.0 / count, diff --git a/cl/settings/third_party/elasticsearch.py b/cl/settings/third_party/elasticsearch.py index c62e575d2e..69b0b72087 100644 --- a/cl/settings/third_party/elasticsearch.py +++ b/cl/settings/third_party/elasticsearch.py @@ -264,6 +264,9 @@ ELASTICSEARCH_SWEEP_INDEXER_HEADS_RATE = env( "ELASTICSEARCH_SWEEP_INDEXER_HEADS_RATE", default=60 ) +ELASTICSEARCH_SWEEP_INDEXER_POLL_INTERVAL = env( + "ELASTICSEARCH_SWEEP_INDEXER_POLL_INTERVAL", default=10 +) ELASTICSEARCH_SWEEP_INDEXER_MODELS = env( "ELASTICSEARCH_SWEEP_INDEXER_MODELS", default=[ From 9f331aff55a5ce7d7804de586246355241638bd1 Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Fri, 29 Nov 2024 15:54:40 -0600 Subject: [PATCH 127/143] fix(sweep_indexer): Ignore types for setting --- cl/search/management/commands/sweep_indexer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cl/search/management/commands/sweep_indexer.py b/cl/search/management/commands/sweep_indexer.py index c3607ca24c..9c2c77d47e 100644 --- a/cl/search/management/commands/sweep_indexer.py +++ b/cl/search/management/commands/sweep_indexer.py @@ -359,7 +359,7 @@ def process_queryset( processed_count = 0 accumulated_chunk = 0 throttle = CeleryThrottle( - poll_interval=settings.ELASTICSEARCH_SWEEP_INDEXER_POLL_INTERVAL, + poll_interval=settings.ELASTICSEARCH_SWEEP_INDEXER_POLL_INTERVAL, # type: ignore min_items=self.chunk_size, queue_name=self.queue, ) @@ -412,7 +412,7 @@ def process_queryset( # it will wait for 0.1 seconds for every 10 documents processed, # maintaining an index rate of 100 documents per second. 
time.sleep( - 1 / settings.ELASTICSEARCH_SWEEP_INDEXER_POLL_INTERVAL + 1 / settings.ELASTICSEARCH_SWEEP_INDEXER_POLL_INTERVAL # type: ignore ) self.stdout.write( "\rProcessed {}/{}, ({:.0%}), last {} ID indexed: {},".format( From 34f1ac37e5623173c8de672509719bbb7644a7b7 Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Fri, 29 Nov 2024 16:11:22 -0600 Subject: [PATCH 128/143] fix(sweep): Indexer reduced default poll interval --- cl/settings/third_party/elasticsearch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cl/settings/third_party/elasticsearch.py b/cl/settings/third_party/elasticsearch.py index 69b0b72087..879baaa58f 100644 --- a/cl/settings/third_party/elasticsearch.py +++ b/cl/settings/third_party/elasticsearch.py @@ -265,7 +265,7 @@ "ELASTICSEARCH_SWEEP_INDEXER_HEADS_RATE", default=60 ) ELASTICSEARCH_SWEEP_INDEXER_POLL_INTERVAL = env( - "ELASTICSEARCH_SWEEP_INDEXER_POLL_INTERVAL", default=10 + "ELASTICSEARCH_SWEEP_INDEXER_POLL_INTERVAL", default=5 ) ELASTICSEARCH_SWEEP_INDEXER_MODELS = env( "ELASTICSEARCH_SWEEP_INDEXER_MODELS", From ceb049e350f82e0086c68755bffb3bd3c1de4c0e Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Fri, 29 Nov 2024 17:09:29 -0600 Subject: [PATCH 129/143] fix(sweep_indexer): Added a separate setting for waiting between chunks - Added a logger for scheduling make_docket_by_iquery_sweep --- cl/corpus_importer/signals.py | 4 ++++ cl/search/management/commands/sweep_indexer.py | 4 ++-- cl/settings/third_party/elasticsearch.py | 5 ++++- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/cl/corpus_importer/signals.py b/cl/corpus_importer/signals.py index d2443b62f3..08254d7d85 100644 --- a/cl/corpus_importer/signals.py +++ b/cl/corpus_importer/signals.py @@ -76,6 +76,10 @@ def update_latest_case_id_and_schedule_iquery_sweep(docket: Docket) -> None: countdown=task_scheduled_countdown, queue=settings.CELERY_IQUERY_QUEUE, ) + logger.info( + f"Enqueued iquery docket case ID: {iquery_pacer_case_id_current} " + f"for court {court_id} with countdown {task_scheduled_countdown}" + ) # Update the iquery_pacer_case_id_current in Redis r.hset( diff --git a/cl/search/management/commands/sweep_indexer.py b/cl/search/management/commands/sweep_indexer.py index 9c2c77d47e..fe2bb96e79 100644 --- a/cl/search/management/commands/sweep_indexer.py +++ b/cl/search/management/commands/sweep_indexer.py @@ -406,13 +406,13 @@ def process_queryset( accumulated_chunk += len(chunk) if not testing_mode: - # Wait for 1/ELASTICSEARCH_SWEEP_INDEXER_POLL_INTERVAL + # Wait for 1/ELASTICSEARCH_SWEEP_INDEXER_WAIT_BETWEEN_CHUNKS # before processing the next chunk. # e.g: With a poll interval of 10 and a chunk size of 10, # it will wait for 0.1 seconds for every 10 documents processed, # maintaining an index rate of 100 documents per second. 
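                # (Illustrative arithmetic for the renamed setting below: its
                # default of 3 sleeps ~0.33s per chunk, so 10-document chunks
                # index at roughly 30 documents per second.)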
time.sleep( - 1 / settings.ELASTICSEARCH_SWEEP_INDEXER_POLL_INTERVAL # type: ignore + 1 / settings.ELASTICSEARCH_SWEEP_INDEXER_WAIT_BETWEEN_CHUNKS # type: ignore ) self.stdout.write( "\rProcessed {}/{}, ({:.0%}), last {} ID indexed: {},".format( diff --git a/cl/settings/third_party/elasticsearch.py b/cl/settings/third_party/elasticsearch.py index 879baaa58f..7a1ec6b779 100644 --- a/cl/settings/third_party/elasticsearch.py +++ b/cl/settings/third_party/elasticsearch.py @@ -265,7 +265,10 @@ "ELASTICSEARCH_SWEEP_INDEXER_HEADS_RATE", default=60 ) ELASTICSEARCH_SWEEP_INDEXER_POLL_INTERVAL = env( - "ELASTICSEARCH_SWEEP_INDEXER_POLL_INTERVAL", default=5 + "ELASTICSEARCH_SWEEP_INDEXER_POLL_INTERVAL", default=10 +) +ELASTICSEARCH_SWEEP_INDEXER_WAIT_BETWEEN_CHUNKS = env( + "ELASTICSEARCH_SWEEP_INDEXER_WAIT_BETWEEN_CHUNKS", default=3 ) ELASTICSEARCH_SWEEP_INDEXER_MODELS = env( "ELASTICSEARCH_SWEEP_INDEXER_MODELS", From 3a62c294c39383f6f1054a4819c2b92a6ff0c0c3 Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Fri, 29 Nov 2024 19:37:04 -0400 Subject: [PATCH 130/143] refactor(assets): Format opinions.js for consistency --- cl/assets/static-global/js/opinions.js | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cl/assets/static-global/js/opinions.js b/cl/assets/static-global/js/opinions.js index e6665237bf..3a3f5c2b11 100644 --- a/cl/assets/static-global/js/opinions.js +++ b/cl/assets/static-global/js/opinions.js @@ -9,9 +9,9 @@ $('.star-pagination').each(function (index, element) { if ($(this).attr('pagescheme')) { // For ANON 2020 this has two sets of numbers but only one can be // verified with other databses so only showing one - var number = $(this).attr('number') - if (number.indexOf("P") > -1) { - $(this).attr('label', ""); + var number = $(this).attr('number'); + if (number.indexOf('P') > -1) { + $(this).attr('label', ''); } else { $(this).attr('label', number); } @@ -287,4 +287,4 @@ document.querySelectorAll("page-label").forEach(label => { window.location.href = href; } }); -}); \ No newline at end of file +}); From 5d60c2e4664745294c04952698138c44cee41007 Mon Sep 17 00:00:00 2001 From: Eduardo Rosendo Date: Fri, 29 Nov 2024 20:45:12 -0400 Subject: [PATCH 131/143] refactor(opinions): Remove unused search_params argument This commit removes the search_params argument from the build_related_clusters_query call in the es_get_related_clusters_with_cache method. 
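A minimal before/after sketch of the call, matching the diff below:

    # before
    related_query = await build_related_clusters_query(
        cluster_search, sub_opinion_pks, search_params
    )
    # after
    related_query = await build_related_clusters_query(
        cluster_search, sub_opinion_pks
    )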
--- cl/opinion_page/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cl/opinion_page/utils.py b/cl/opinion_page/utils.py index 18d5fa4c30..33e0682211 100644 --- a/cl/opinion_page/utils.py +++ b/cl/opinion_page/utils.py @@ -298,7 +298,7 @@ async def es_get_related_clusters_with_cache( return related_cluster_result related_query = await build_related_clusters_query( - cluster_search, sub_opinion_pks, search_params + cluster_search, sub_opinion_pks ) related_query = related_query.params( From 948b7d33120669d7d7de5cd0d74687849ed1db44 Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Fri, 29 Nov 2024 20:36:04 -0600 Subject: [PATCH 132/143] fix(corpus_importer): Changed setting to disable probe_iquery_pages_daemon --- .../management/commands/probe_iquery_pages_daemon.py | 2 +- cl/corpus_importer/tests.py | 2 +- cl/settings/project/corpus_importer.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cl/corpus_importer/management/commands/probe_iquery_pages_daemon.py b/cl/corpus_importer/management/commands/probe_iquery_pages_daemon.py index 759700673e..8a99322eb2 100644 --- a/cl/corpus_importer/management/commands/probe_iquery_pages_daemon.py +++ b/cl/corpus_importer/management/commands/probe_iquery_pages_daemon.py @@ -81,7 +81,7 @@ def handle(self, *args, **options): iterations_completed = 0 r = get_redis_interface("CACHE") testing = True if testing_iterations else False - while True and settings.IQUERY_PROBE_DAEMON_ENABLED: + while True and settings.IQUERY_CASE_PROBE_DAEMON_ENABLED: for court_id in court_ids: if r.exists(f"iquery:court_wait:{court_id}"): continue diff --git a/cl/corpus_importer/tests.py b/cl/corpus_importer/tests.py index 37cc03d24d..5b3d858897 100644 --- a/cl/corpus_importer/tests.py +++ b/cl/corpus_importer/tests.py @@ -3346,7 +3346,7 @@ def test_merger(self): @patch("cl.corpus_importer.tasks.get_or_cache_pacer_cookies") @override_settings( - IQUERY_PROBE_DAEMON_ENABLED=True, + IQUERY_CASE_PROBE_DAEMON_ENABLED=True, IQUERY_SWEEP_UPLOADS_SIGNAL_ENABLED=True, EGRESS_PROXY_HOSTS=["http://proxy_1:9090", "http://proxy_2:9090"], ) diff --git a/cl/settings/project/corpus_importer.py b/cl/settings/project/corpus_importer.py index dc81d21978..f2f375845f 100644 --- a/cl/settings/project/corpus_importer.py +++ b/cl/settings/project/corpus_importer.py @@ -1,8 +1,8 @@ import environ env = environ.FileAwareEnv() -IQUERY_PROBE_DAEMON_ENABLED = env.int( - "IQUERY_PROBE_DAEMON_ENABLED", default=False +IQUERY_CASE_PROBE_DAEMON_ENABLED = env.bool( + "IQUERY_CASE_PROBE_DAEMON_ENABLED", default=False ) IQUERY_PROBE_ITERATIONS = env.int("IQUERY_PROBE_ITERATIONS", default=9) IQUERY_PROBE_WAIT = env.int("IQUERY_PROBE_WAIT", default=300) From d5b40e73ec89b1e24ff23c50cf96060ca5b167f0 Mon Sep 17 00:00:00 2001 From: William Palin Date: Sat, 30 Nov 2024 09:25:39 -0500 Subject: [PATCH 133/143] choice(add_download): Remove commented code --- cl/opinion_page/templates/includes/add_download_button.html | 1 - 1 file changed, 1 deletion(-) diff --git a/cl/opinion_page/templates/includes/add_download_button.html b/cl/opinion_page/templates/includes/add_download_button.html index b6d05c41be..bcd7a508ea 100644 --- a/cl/opinion_page/templates/includes/add_download_button.html +++ b/cl/opinion_page/templates/includes/add_download_button.html @@ -1,4 +1,3 @@ -{#

-#}
   {{ cluster.docket.court }}
   • Citations: {{ cluster.citation_string|default:"None known" }}
-  {% if cluster.case_name_full != cluster.case_name and cluster.case_name_full != "" %}
+  {% if cluster.case_name_full != cluster.case_name and cluster.case_name_full != "" and cluster.case_name != "" %}
   • Full Case Name: {{ cluster.case_name_full }}
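The extra `cluster.case_name != ""` guard keeps the "Full Case Name" row
from rendering when a cluster has a full name but no display name,
presumably because the page heading already falls back to the full name
in that case. The condition the template now implements, restated as a
Python sketch (illustrative only):

    # Show the "Full Case Name" row only when all three checks hold:
    show_full_name = (
        cluster.case_name_full != cluster.case_name
        and cluster.case_name_full != ""
        and cluster.case_name != ""
    )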
From 1af9aab1405f8cf287282e270a3f7b0d318963ca Mon Sep 17 00:00:00 2001
From: William Palin
Date: Tue, 3 Dec 2024 09:40:40 -0500
Subject: [PATCH 140/143] fix(opinions.html): Use best case name

---
 cl/opinion_page/templates/opinions.html | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cl/opinion_page/templates/opinions.html b/cl/opinion_page/templates/opinions.html
index cc84454184..d627dbcd74 100644
--- a/cl/opinion_page/templates/opinions.html
+++ b/cl/opinion_page/templates/opinions.html
@@ -216,13 +216,13 @@

   {{ cluster.docket.court }}
   • Citations: {{ cluster.citation_string|default:"None known" }}
-  {% if cluster.case_name_full != cluster.case_name and cluster.case_name_full != "" and cluster.case_name != "" %}
+  {% if cluster.case_name_full != cluster|best_case_name %}
   • Full Case Name: {{ cluster.case_name_full }}
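The best_case_name filter used in the new condition is one of
CourtListener's template filters. A minimal sketch of the fallback
behavior the comparison relies on (a hypothetical simplification; the
real filter may differ in its fallback order and escaping):

    from django import template

    register = template.Library()

    @register.filter
    def best_case_name(obj):
        # Prefer the display name and fall back to the full name when
        # the display name is empty. Under that fallback, an empty
        # case_name makes the template compare case_name_full to itself,
        # which hides the redundant "Full Case Name" row.
        return obj.case_name or obj.case_name_full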
From 71fc4b8da32e3ed5c40ab2a35e31606fe0527f7c Mon Sep 17 00:00:00 2001
From: William Palin
Date: Tue, 3 Dec 2024 09:55:45 -0500
Subject: [PATCH 141/143] fix(opinions.html): Enable Harvard SCAN tab

---
 cl/opinion_page/templates/opinions.html | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cl/opinion_page/templates/opinions.html b/cl/opinion_page/templates/opinions.html
index 320dbb40d9..37343d034a 100644
--- a/cl/opinion_page/templates/opinions.html
+++ b/cl/opinion_page/templates/opinions.html
@@ -333,10 +333,10 @@

     {{ cluster.docket.court }}
   {% endif %}
 {% endif %}
-{% if has_downloads and "pdf" in pdf_path %}
+{% if has_downloads and "pdf" in pdf_path or cluster.filepath_pdf_harvard %}
-{% if cluster.headmatter %}
-{% else %}
-  {% if cluster.correction %}
-  {% endif %}
-  {% if cluster.attorneys %}
-  {% endif %}
-  {% if cluster.headnotes %}
-  {% endif %}
-  {% if cluster.syllabus %}
-  {% endif %}
-  {% if cluster.summary %}
-  {% endif %}
-  {% if cluster.history %}
-  {% endif %}
-  {% if cluster.disposition %}
-  {% endif %}
-{% endif %}
-{% for sub_opinion in cluster.ordered_opinions %}
-{% endfor %}
-{% if cluster.sub_opinions.all.first.extracted_by_ocr or "U" in cluster.source and tab == "opinions" %}
-  The text of this document was obtained by analyzing a scanned document
-  and may have typos.
-{% endif %}
-{% if tab == "authorities" %}
-  This page displays all the citations that have been extracted and
-  linked in our system. Please note, it does not serve as a
-  comprehensive list of all citations within the document.
-{% endif %}
+{% if cluster.sub_opinions.all.first.extracted_by_ocr or "U" in cluster.source and tab == "opinions" %}
+  The text of this document was obtained by analyzing a scanned document
+  and may have typos.
+{% endif %}
-{% if tab == "summaries" %}
-  Summaries or parenthetical groupings are used to provide concise
-  explanations or clarifications about a case’s procedural posture,
-  legal principles, or facts that are immediately relevant to the
-  citation, typically enclosed in parentheses following a case citation.
-{% endif %}
+{% if tab == "authorities" %}
+  This page displays all the citations that have been extracted and
+  linked in our system. Please note, it does not serve as a
+  comprehensive list of all citations within the document.
+{% endif %}
+{% if tab == "related-cases" %}
+  The Related Cases query is used to find legal cases related to a given
+  case by analyzing textual similarities. It identifies and retrieves
+  cases with similar content, allowing for the generation of a summary
+  of related cases, including their names, links, and filing dates, to
+  help users explore precedents or comparable rulings.
+{% endif %}
+{% if tab == "summaries" %}
+  Summaries or parenthetical groupings are used to provide concise
+  explanations or clarifications about a case’s procedural posture,
+  legal principles, or facts that are immediately relevant to the
+  citation, typically enclosed in parentheses following a case citation.
+{% endif %}
-{# Sponsored by #}
-{% if sponsored %}
-{% else %}
-  {% include "includes/donate_sidebar.html" with referrer="o-donate-now" %}
-{% endif %}
-{% endwith %}
+{# Sponsored by #}
+{% if sponsored %}
+{% else %}
+  {% include "includes/donate_sidebar.html" with referrer="o-donate-now" %}
+{% endif %}
 {% endblock %}
 {% block body-classes %}opinion-body{% endblock %}
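One subtlety in the Harvard SCAN condition above: in Django template
{% if %} expressions, as in Python, "and" binds tighter than "or", so
the test reads as (has_downloads and "pdf" in pdf_path) or
cluster.filepath_pdf_harvard, and the tab is enabled whenever a Harvard
PDF path exists, regardless of the local download flags. Django's
{% if %} tag does not accept parentheses, so an equivalent, more
explicit form uses branching (sketch only):

    {% if cluster.filepath_pdf_harvard %}
      {# A Harvard-scanned PDF exists: always enable the tab. #}
    {% elif has_downloads and "pdf" in pdf_path %}
      {# Otherwise require a locally downloaded PDF. #}
    {% endif %}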