diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index ff00cf891a..37fbcf30d2 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -131,6 +131,11 @@ jobs: - name: Watch cl-es-sweep-indexer rollout status run: kubectl rollout status -n ${{ env.EKS_NAMESPACE }} deployment/cl-es-sweep-indexer + - name: Rollout cl-iquery-probe + run: kubectl set image -n ${{ env.EKS_NAMESPACE }} deployment/cl-iquery-probe cl-iquery-probe=freelawproject/courtlistener:${{ steps.vars.outputs.sha_short }}-prod + - name: Watch cl-iquery-probe rollout status + run: kubectl rollout status -n ${{ env.EKS_NAMESPACE }} deployment/cl-iquery-probe + # Watch "cronjobs" in k9s - name: Update cronjobs diff --git a/cl/alerts/management/commands/cl_send_alerts.py b/cl/alerts/management/commands/cl_send_alerts.py index 897b077fea..4792e78fd2 100644 --- a/cl/alerts/management/commands/cl_send_alerts.py +++ b/cl/alerts/management/commands/cl_send_alerts.py @@ -13,17 +13,25 @@ from django.template import loader from django.urls import reverse from django.utils.timezone import now +from elasticsearch_dsl import MultiSearch from elasticsearch_dsl import Q as ES_Q +from elasticsearch_dsl.response import Response from cl.alerts.models import Alert, RealTimeQueue from cl.alerts.utils import InvalidDateError -from cl.api.models import WebhookEventType +from cl.api.models import WebhookEventType, WebhookVersions from cl.api.webhooks import send_search_alert_webhook from cl.lib import search_utils from cl.lib.command_utils import VerboseCommand, logger -from cl.lib.elasticsearch_utils import do_es_api_query +from cl.lib.elasticsearch_utils import ( + do_es_api_query, + limit_inner_hits, + set_child_docs_and_score, + set_results_highlights, +) from cl.lib.scorched_utils import ExtraSolrInterface from cl.lib.search_utils import regroup_snippets +from cl.lib.types import CleanData from cl.search.constants import ALERTS_HL_TAG, SEARCH_ALERTS_OPINION_HL_FIELDS from cl.search.documents import OpinionDocument from cl.search.forms import SearchForm @@ -106,6 +114,59 @@ def send_alert(user_profile, hits): msg.send(fail_silently=False) +def query_alerts_es( + cd: CleanData, v1_webhook: bool = False +) -> tuple[Response, Response | None]: + """Query ES for opinion alerts, optionally handling a V1 webhook query. + + :param cd: A CleanData object containing the query parameters. + :param v1_webhook: A boolean indicating whether to include a V1 webhook query. + :return: A tuple containing the main search response and an optional V1 + query response. 
+ """ + + v1_results = None + search_query = OpinionDocument.search() + cd["highlight"] = True + main_query, _ = do_es_api_query( + search_query, + cd, + SEARCH_ALERTS_OPINION_HL_FIELDS, + ALERTS_HL_TAG, + "v4", + ) + main_query = main_query.extra( + from_=0, + size=settings.SCHEDULED_ALERT_HITS_LIMIT, + ) + multi_search = MultiSearch() + multi_search = multi_search.add(main_query) + + if v1_webhook: + search_query = OpinionDocument.search() + v1_query, _ = do_es_api_query( + search_query, + cd, + SEARCH_ALERTS_OPINION_HL_FIELDS, + ALERTS_HL_TAG, + "v3", + ) + v1_query = v1_query.extra( + from_=0, + size=settings.SCHEDULED_ALERT_HITS_LIMIT, + ) + multi_search = multi_search.add(v1_query) + + responses = multi_search.execute() + results = responses[0] + limit_inner_hits({}, results, cd["type"]) + set_results_highlights(results, cd["type"]) + set_child_docs_and_score(results) + if v1_webhook: + v1_results = responses[1] + return results, v1_results + + class Command(VerboseCommand): help = ( "Sends the alert emails on a real time, daily, weekly or monthly " @@ -152,10 +213,9 @@ def handle(self, *args, **options): if options["rate"] == Alert.REAL_TIME: self.clean_rt_queue() - def run_query(self, alert, rate): + def run_query(self, alert, rate, v1_webhook=False): results = [] - cd = {} - main_params = {} + v1_results = None logger.info(f"Now running the query: {alert.query}\n") # Make a dict from the query string. @@ -175,7 +235,7 @@ def run_query(self, alert, rate): if waffle.switch_is_active("oa-es-alerts-active"): # Return empty results for OA alerts. They are now handled # by Elasticsearch. - return query_type, results + return query_type, results, v1_results logger.info(f"Data sent to SearchForm is: {qd}\n") search_form = SearchForm(qd, is_es_form=self.o_es_alerts) @@ -187,7 +247,7 @@ def run_query(self, alert, rate): and len(self.valid_ids[query_type]) == 0 ): # Bail out. No results will be found if no valid_ids. - return query_type, results + return query_type, results, v1_results main_params = search_utils.build_main_query( cd, @@ -220,19 +280,7 @@ def run_query(self, alert, rate): ) if self.o_es_alerts: - search_query = OpinionDocument.search() - s, _ = do_es_api_query( - search_query, - cd, - SEARCH_ALERTS_OPINION_HL_FIELDS, - ALERTS_HL_TAG, - "v3", - ) - s = s.extra( - from_=0, - size=settings.SCHEDULED_ALERT_HITS_LIMIT, - ) - results = s.execute() + results, v1_results = query_alerts_es(cd, v1_webhook) else: # Ignore warnings from this bit of code. Otherwise, it complains # about the query URL being too long and having to POST it instead @@ -248,7 +296,7 @@ def run_query(self, alert, rate): regroup_snippets(results) logger.info(f"There were {len(results)} results.") - return qd, results + return qd, results, v1_results def send_emails_and_webhooks(self, rate): """Send out an email and webhook events to every user whose alert has a @@ -261,6 +309,13 @@ def send_emails_and_webhooks(self, rate): alerts = user.alerts.filter(rate=rate) logger.info(f"Running alerts for user '{user}': {alerts}") + # Query user's webhooks. 
+ user_webhooks = user.webhooks.filter( + event_type=WebhookEventType.SEARCH_ALERT, enabled=True + ) + v1_webhook = WebhookVersions.v1 in { + webhook.version for webhook in user_webhooks + } if rate == Alert.REAL_TIME: if not user.profile.is_member: continue @@ -268,7 +323,9 @@ def send_emails_and_webhooks(self, rate): hits = [] for alert in alerts: try: - qd, results = self.run_query(alert, rate) + qd, results, v1_results = self.run_query( + alert, rate, v1_webhook + ) except: traceback.print_exc() logger.info( @@ -293,10 +350,13 @@ def send_emails_and_webhooks(self, rate): # Send webhook event if the user has a SEARCH_ALERT # endpoint enabled. - user_webhooks = user.webhooks.filter( - event_type=WebhookEventType.SEARCH_ALERT, enabled=True - ) for user_webhook in user_webhooks: + results = ( + v1_results + if alert.alert_type == SEARCH_TYPES.OPINION + and user_webhook.version == WebhookVersions.v1 + else results + ) send_search_alert_webhook( self.sis[search_type], results, user_webhook, alert ) diff --git a/cl/alerts/templates/alert_email_es.html b/cl/alerts/templates/alert_email_es.html index dc2f797268..7ef71cfcfe 100644 --- a/cl/alerts/templates/alert_email_es.html +++ b/cl/alerts/templates/alert_email_es.html @@ -75,17 +75,56 @@

- - View original: - - {% if result.download_url %} - - From the court - -   |   - {% endif %} + {% endif %} + {% if type == 'o' %} + + {% endif %} + {% if type == 'oa' %} +

+ + View original: + + {% if result.download_url %} + + From the court + +   |   + {% endif %} {% if result.local_path %} {# Provide link to S3. #} @@ -93,8 +132,6 @@

Date Argued: {% if result.dateArgued %} @@ -116,9 +153,7 @@

- {% endif %} - {% if type == 'o' or type == 'oa' %} -

+

{% if result|get_highlight:"text" %} …{{ result|get_highlight:"text"|safe|underscore_to_space }}… {% endif %} diff --git a/cl/alerts/templates/alert_email_es.txt b/cl/alerts/templates/alert_email_es.txt index 2b7ec3b569..8c0324e8f1 100644 --- a/cl/alerts/templates/alert_email_es.txt +++ b/cl/alerts/templates/alert_email_es.txt @@ -16,8 +16,14 @@ View Full Results / Edit this Alert: https://www.courtlistener.com/?{{ alert.que Disable this Alert (one click): https://www.courtlistener.com{% url "disable_alert" alert.secret_key %}{% endif %} {{forloop.counter}}. {{ result.caseName|render_string_or_list|safe|striptags }} ({% if result.court_id != 'scotus' %}{{ result.court_citation_string|render_string_or_list|striptags }} {% endif %}{% if type == 'o' or type == 'r' %}{{ result.dateFiled|date:"Y" }}{% elif type == 'oa' %}{{ result.dateArgued|date:"Y" }}{% endif %}) -{% if type == 'oa' %}{% if result.dateArgued %}Date Argued: {{ result.dateArgued|date:"F jS, Y" }}{% else %}Date Argued: Unknown Date {% endif %}{% if result.docketNumber %} | Docket Number: {{ result.docketNumber|render_string_or_list|safe|striptags }}{% endif %} | Duration: {{ result.duration|naturalduration }}{% if result.judge %} | Judge: {{ result.judge|render_string_or_list|safe|striptags|underscore_to_space }}{% endif %}{% endif %} -{% if type == 'o' or type == 'oa' %}{% if result|get_highlight:"text" %}...{{ result|get_highlight:"text"|safe|striptags|underscore_to_space|compress_whitespace }}...{% endif %}{% endif %} +{% if type == 'oa' %}{% if result.dateArgued %}Date Argued: {{ result.dateArgued|date:"F jS, Y" }}{% else %}Date Argued: Unknown Date {% endif %}{% if result.docketNumber %} | Docket Number: {{ result.docketNumber|render_string_or_list|safe|striptags }}{% endif %} | Duration: {{ result.duration|naturalduration }}{% if result.judge %} | Judge: {{ result.judge|render_string_or_list|safe|striptags|underscore_to_space }}{% endif %} +{% if result|get_highlight:"text" %}...{{ result|get_highlight:"text"|safe|striptags|underscore_to_space|compress_whitespace }}...{% endif %} +{% endif %} +{% if type == 'o' %}{% for doc in result.child_docs %}{% with doc=doc|get_es_doc_content:True %}{% if result.child_docs|length > 1 or doc.type != 'combined-opinion' %}{% if doc.text %}{{ doc.type_text }}{% endif %}{% endif %} + {% if doc.text %}...{{ doc.text|render_string_or_list|safe|striptags|underscore_to_space|compress_whitespace }}...{% endif %} + {% if doc.download_url %} - Download original from the court: {{doc.download_url}}{% endif %} + {% if doc.local_path %} - Download the original from our backup: https://storage.courtlistener.com/{{ doc.local_path }}{% endif %} +{% endwith %}{% endfor %}{% endif %} {% if type == 'r' %}{% if result.dateFiled %}Date Filed: {{ result.dateFiled|date:"F jS, Y" }}{% else %}Date Filed: Unknown Date {% endif %}{% if result.docketNumber %} | Docket Number: {{ result.docketNumber|render_string_or_list|safe|striptags }}{% endif %} {% for doc in result.child_docs %}{% with doc=doc|get_es_doc_content:scheduled_alert %} - {% if doc.short_description %}{{ doc.short_description|render_string_or_list|safe|striptags }} - {% endif %}Document #{% if doc.document_number %}{{ doc.document_number }}{% endif %}{% if doc.attachment_number %}, Attachment #{{ doc.attachment_number }}{% endif %} {% if doc.description %}Description: {{ doc.description|render_string_or_list|safe|striptags }}{% endif %} @@ -27,9 +33,8 @@ Disable this Alert (one click): https://www.courtlistener.com{% url "disable_ale {% if 
result.child_docs and result.child_remaining %}{% extract_q_value alert.query_run as q_value %}View Additional Results for this Case: https://www.courtlistener.com/?type={{ type|urlencode }}&q={% if q_value %}({{ q_value|urlencode }})%20AND%20{% endif %}docket_id%3A{{ result.docket_id|urlencode }}{% endif %} {% endif %}~~~~~ - View this item on our site: https://www.courtlistener.com{% if type == 'r' %}{{result.docket_absolute_url}}{% else %}{{result.absolute_url}}{% endif %} -{% if result.download_url %} - Download original from the court: {{result.download_url}} -{% endif %}{% if result.local_path %} - Download the original from our backup: https://storage.courtlistener.com/{{ result.local_path }}{% endif %}{% endfor %} - +{% if type == 'oa' %}{% if result.download_url %} - Download original from the court: {{result.download_url}} +{% endif %}{% if result.local_path %} - Download the original from our backup: https://storage.courtlistener.com/{{ result.local_path }}{% endif %}{% endif %}{% endfor %} {% endfor %} ************************ This alert brought to you by the 501(c)(3) non-profit Free Law Project diff --git a/cl/alerts/tests/tests.py b/cl/alerts/tests/tests.py index 8af31c06ae..990658c6a3 100644 --- a/cl/alerts/tests/tests.py +++ b/cl/alerts/tests/tests.py @@ -48,7 +48,9 @@ Webhook, WebhookEvent, WebhookEventType, + WebhookVersions, ) +from cl.api.utils import get_webhook_deprecation_date from cl.audio.factories import AudioWithParentsFactory from cl.audio.models import Audio from cl.donate.models import NeonMembership @@ -73,10 +75,14 @@ Opinion, RECAPDocument, ) -from cl.search.tasks import add_items_to_solr from cl.stats.models import Stat from cl.tests.base import SELENIUM_TIMEOUT, BaseSeleniumTest -from cl.tests.cases import APITestCase, ESIndexTestCase, TestCase +from cl.tests.cases import ( + APITestCase, + ESIndexTestCase, + SearchAlertsAssertions, + TestCase, +) from cl.tests.utils import MockResponse, make_client from cl.users.factories import UserFactory, UserProfileWithParentsFactory from cl.users.models import EmailSent @@ -564,7 +570,9 @@ async def test_alert_update(self) -> None: @override_switch("o-es-alerts-active", active=True) @mock.patch("cl.search.tasks.percolator_alerts_models_supported", new=[Audio]) -class SearchAlertsWebhooksTest(ESIndexTestCase, TestCase): +class SearchAlertsWebhooksTest( + ESIndexTestCase, TestCase, SearchAlertsAssertions +): """Test Search Alerts Webhooks""" @classmethod @@ -581,6 +589,7 @@ def setUpTestData(cls): event_type=WebhookEventType.SEARCH_ALERT, url="https://example.com/", enabled=True, + version=2, ) cls.webhook_enabled_1 = WebhookFactory( user=cls.user_profile_1.user, @@ -647,6 +656,7 @@ def setUpTestData(cls): event_type=WebhookEventType.SEARCH_ALERT, url="https://example.com/", enabled=True, + version=1, ) cls.search_alert_3 = AlertFactory( user=cls.user_profile_3.user, @@ -781,7 +791,7 @@ def test_send_search_alert_webhooks(self): len(mail.outbox), 4, msg="Outgoing emails don't match." 
) - # Opinion email alert assertions + # First Opinion email alert assertions search_alert self.assertEqual(mail.outbox[0].to[0], self.user_profile.user.email) # Plain text assertions opinion_alert_content = mail.outbox[0].body @@ -793,18 +803,36 @@ def test_send_search_alert_webhooks(self): opinion_alert_content, ) self.assertIn("California vs Lorem", opinion_alert_content) - self.assertIn("california sit amet", opinion_alert_content) + self.assertIn( + "california sit amet", + opinion_alert_content, + msg="Alert content didn't match", + ) self.assertIn(self.dly_opinion_2.download_url, opinion_alert_content) self.assertIn( str(self.dly_opinion_2.local_path), opinion_alert_content ) - html_content = None - for content, content_type in mail.outbox[0].alternatives: - if content_type == "text/html": - html_content = content - break + html_content = self.get_html_content_from_email(mail.outbox[0]) # HTML assertions + self._count_alert_hits_and_child_hits( + html_content, + self.search_alert.name, + 1, + self.dly_opinion.cluster.case_name, + 2, + ) + + self._assert_child_hits_content( + html_content, + self.search_alert.name, + self.dly_opinion.cluster.case_name, + [ + self.dly_opinion.get_type_display(), + self.dly_opinion_2.get_type_display(), + ], + ) + self.assertIn("had 1 hit", html_content) self.assertIn( self.dly_opinion_2.cluster.docket.court.citation_string.replace( @@ -830,7 +858,27 @@ def test_send_search_alert_webhooks(self): mail.outbox[0].extra_headers["List-Unsubscribe"], ) - # Second Opinion alert + # Second Opinion alert search_alert_2 + html_content = self.get_html_content_from_email(mail.outbox[1]) + # HTML assertions + self._count_alert_hits_and_child_hits( + html_content, + self.search_alert.name, + 1, + self.dly_opinion.cluster.case_name, + 2, + ) + + self._assert_child_hits_content( + html_content, + self.search_alert.name, + self.dly_opinion.cluster.case_name, + [ + self.dly_opinion.get_type_display(), + self.dly_opinion_2.get_type_display(), + ], + ) + self.assertEqual(mail.outbox[1].to[0], self.user_profile_2.user.email) self.assertIn("daily opinion alert", mail.outbox[1].body) self.assertEqual( @@ -845,6 +893,24 @@ def test_send_search_alert_webhooks(self): mail.outbox[1].extra_headers["List-Unsubscribe"], ) + # Third Opinion alert search_alert_3 + html_content = self.get_html_content_from_email(mail.outbox[2]) + # HTML assertions + self._count_alert_hits_and_child_hits( + html_content, + self.search_alert_3.name, + 1, + self.dly_opinion.cluster.case_name, + 1, + ) + + self._assert_child_hits_content( + html_content, + self.search_alert_3.name, + self.dly_opinion.cluster.case_name, + [self.dly_opinion.get_type_display()], + ) + # Oral Argument Alert self.assertEqual(mail.outbox[3].to[0], self.user_profile.user.email) self.assertIn("daily oral argument alert ", mail.outbox[3].body) @@ -860,8 +926,11 @@ def test_send_search_alert_webhooks(self): mail.outbox[3].extra_headers["List-Unsubscribe"], ) - # Two webhook events should be sent, both of them to user_profile user - webhook_events = WebhookEvent.objects.all() + # 3 webhook events should be sent, 2 user_profile user and 1 user_profile_3 + webhook_events = WebhookEvent.objects.filter().values_list( + "content", flat=True + ) + self.assertEqual( len(webhook_events), 3, msg="Webhook events don't match." 
) @@ -884,7 +953,46 @@ def test_send_search_alert_webhooks(self): }, } - for webhook_sent in webhook_events: + # Assert V2 Opinion Search Alerts Webhook + self._count_webhook_hits_and_child_hits( + list(webhook_events), + self.search_alert.name, + 1, + self.dly_opinion.cluster.case_name, + 2, + "opinions", + ) + + # Assert HL content in V2 webhooks. + self._assert_webhook_hit_hl( + webhook_events, + self.search_alert.name, + "caseName", + "California vs Lorem", + child_field=False, + nested_field="opinions", + ) + self._assert_webhook_hit_hl( + webhook_events, + self.search_alert.name, + "snippet", + "Lorem dolor california sit amet, consectetur adipiscing elit.", + child_field=True, + nested_field="opinions", + ) + + # Assert V1 Opinion Search Alerts Webhook + self._count_webhook_hits_and_child_hits( + list(webhook_events), + self.search_alert_3.name, + 1, + self.dly_opinion.cluster.case_name, + 0, + None, + ) + + webhook_events_instances = WebhookEvent.objects.all() + for webhook_sent in webhook_events_instances: with self.subTest(webhook_sent=webhook_sent): self.assertEqual( webhook_sent.event_status, @@ -892,6 +1000,7 @@ def test_send_search_alert_webhooks(self): msg="The event status doesn't match.", ) content = webhook_sent.content + alert_data_compare = alert_data[ content["payload"]["alert"]["id"] ] @@ -925,14 +1034,13 @@ def test_send_search_alert_webhooks(self): if ( content["payload"]["alert"]["alert_type"] == SEARCH_TYPES.OPINION - ): + ) and webhook_sent.webhook.version == WebhookVersions.v1: # Assert the number of keys in the Opinions Search Webhook # payload keys_count = len(content["payload"]["results"][0]) self.assertEqual( keys_count, len(opinion_v3_search_api_keys) ) - # Iterate through all the opinion fields and compare them. for ( field, @@ -950,7 +1058,10 @@ def test_send_search_alert_webhooks(self): expected_value, f"Field '{field}' does not match.", ) - else: + elif ( + content["payload"]["alert"]["alert_type"] + == SEARCH_TYPES.ORAL_ARGUMENT + ): # Assertions for OA webhook payload. self.assertEqual( content["payload"]["results"][0]["caseName"], @@ -1544,6 +1655,14 @@ def setUpTestData(cls): event_type=WebhookEventType.OLD_DOCKET_ALERTS_REPORT, url="https://example.com/", enabled=True, + version=1, + ) + cls.webhook_v2_enabled = WebhookFactory( + user=cls.user_profile.user, + event_type=WebhookEventType.OLD_DOCKET_ALERTS_REPORT, + url="https://example.com/", + enabled=True, + version=2, ) cls.disabled_docket_alert = DocketAlertWithParentsFactory( docket__source=Docket.RECAP, @@ -1605,9 +1724,16 @@ def test_send_old_docket_alerts_webhook(self): self.assertEqual(active_docket_alerts.count(), 2) webhook_events = WebhookEvent.objects.all() - # Only one webhook event should be triggered for user_profile since + # Two webhook events (v1, v2) should be triggered for user_profile since # user_profile_2 webhook endpoint is disabled. - self.assertEqual(len(webhook_events), 1) + self.assertEqual(len(webhook_events), 2) + + # Confirm webhooks for V1 and V2 are properly triggered. + webhook_versions = { + webhook.content["webhook"]["version"] for webhook in webhook_events + } + self.assertEqual(webhook_versions, {1, 2}) + self.assertEqual( webhook_events[0].event_status, WEBHOOK_EVENT_STATUS.SUCCESSFUL, @@ -1653,6 +1779,21 @@ def test_send_old_docket_alerts_webhook(self): self.very_old_docket_alert.docket.pk, ) + # Confirm deprecation date webhooks according the version. 
+ v1_webhook_event = WebhookEvent.objects.filter( + webhook=self.webhook_enabled + ).first() + v2_webhook_event = WebhookEvent.objects.filter( + webhook=self.webhook_v2_enabled + ).first() + self.assertEqual( + v1_webhook_event.content["webhook"]["deprecation_date"], + get_webhook_deprecation_date(settings.WEBHOOK_V1_DEPRECATION_DATE), + ) + self.assertEqual( + v2_webhook_event.content["webhook"]["deprecation_date"], None + ) + # Run command again with mock.patch( "cl.api.webhooks.requests.post", @@ -1701,10 +1842,16 @@ def test_send_old_docket_alerts_webhook_only_warn(self): # user_profile_2 self.assertEqual(len(mail.outbox), 2) - # Only one webhook event should be triggered for user_profile since + # Two webhook events (v1, v2) should be triggered for user_profile since # user_profile_2 webhook endpoint is disabled. webhook_events = WebhookEvent.objects.all() - self.assertEqual(len(webhook_events), 1) + self.assertEqual(len(webhook_events), 2) + + # Confirm webhooks for V1 and V2 are properly triggered. + webhook_versions = { + webhook.content["webhook"]["version"] for webhook in webhook_events + } + self.assertEqual(webhook_versions, {1, 2}) self.assertEqual( webhook_events[0].webhook.user, diff --git a/cl/alerts/tests/tests_recap_alerts.py b/cl/alerts/tests/tests_recap_alerts.py index d7940209da..e4610cffd9 100644 --- a/cl/alerts/tests/tests_recap_alerts.py +++ b/cl/alerts/tests/tests_recap_alerts.py @@ -2,6 +2,7 @@ from unittest import mock import time_machine +from django.conf import settings from django.core import mail from django.core.management import call_command from django.test.utils import override_settings @@ -26,6 +27,7 @@ ) from cl.api.factories import WebhookFactory from cl.api.models import WebhookEvent, WebhookEventType +from cl.api.utils import get_webhook_deprecation_date from cl.donate.models import NeonMembership from cl.lib.redis_utils import get_redis_interface from cl.lib.test_helpers import RECAPSearchTestCase @@ -49,7 +51,7 @@ from cl.search.models import Docket from cl.search.tasks import index_docket_parties_in_es from cl.stats.models import Stat -from cl.tests.cases import ESIndexTestCase, RECAPAlertsAssertions, TestCase +from cl.tests.cases import ESIndexTestCase, SearchAlertsAssertions, TestCase from cl.tests.utils import MockResponse from cl.users.factories import UserProfileWithParentsFactory @@ -59,7 +61,7 @@ return_value="alert_hits_sweep", ) class RECAPAlertsSweepIndexTest( - RECAPSearchTestCase, ESIndexTestCase, TestCase, RECAPAlertsAssertions + RECAPSearchTestCase, ESIndexTestCase, TestCase, SearchAlertsAssertions ): """ RECAP Alerts Sweep Index Tests @@ -625,6 +627,18 @@ def test_filter_out_alerts_to_send_by_query_and_hits( alert_de.docket.case_name, [rd_2.description], ) + webhook_events = WebhookEvent.objects.all().values_list( + "content", flat=True + ) + # Assert webhook event child hits. 
+ self._count_webhook_hits_and_child_hits( + list(webhook_events), + cross_object_alert.name, + 1, + alert_de.docket.case_name, + 1, + ) + # Assert email text version: txt_email = mail.outbox[4].body self.assertIn(cross_object_alert.name, txt_email) @@ -1517,6 +1531,15 @@ def test_percolator_plus_sweep_alerts_integration( query=f'q=pacer_doc_id:0190645981 AND "SUBPOENAS SERVED CASE UPDATED"&type=r', ) + with mock.patch("cl.users.signals.notify_new_or_updated_webhook"): + webhook_2_1 = WebhookFactory( + user=self.user_profile.user, + event_type=WebhookEventType.SEARCH_ALERT, + url="https://example.com/", + enabled=True, + version=2, + ) + with mock.patch( "cl.api.webhooks.requests.post", side_effect=lambda *args, **kwargs: MockResponse( @@ -1560,10 +1583,25 @@ def test_percolator_plus_sweep_alerts_integration( webhook_events = WebhookEvent.objects.all().values_list( "content", flat=True ) - # 2 webhooks should be triggered one for each document ingested that - # matched each alert. + # One webhook should be triggered for each webhook version (V1, V2) and + # for each document ingested that matched each alert. 4 Webhook events total. + self.assertEqual( + len(webhook_events), 4, msg="Webhook events didn't match." + ) + + # Confirm webhooks for V1 and V2 are properly triggered. + webhook_versions = [ + webhook["webhook"]["version"] for webhook in webhook_events + ] self.assertEqual( - len(webhook_events), 2, msg="Webhook events didn't match." + webhook_versions.count(2), + 2, + msg="Wrong number of V2 webhook events.", + ) + self.assertEqual( + webhook_versions.count(1), + 2, + msg="Wrong number of V1 webhook events.", ) html_content = self.get_html_content_from_email(mail.outbox[0]) @@ -1623,10 +1661,40 @@ def test_percolator_plus_sweep_alerts_integration( webhook_events = WebhookEvent.objects.all().values_list( "content", flat=True ) - # 3 webhooks should be triggered one for each document ingested that - # matched each alert. + # One webhook should be triggered for each webhook version (V1, V2) and + # for each document ingested that matched each alert. 6 Webhook events total. + self.assertEqual( + len(webhook_events), 6, msg="Webhook events didn't match." + ) + + # Confirm webhooks for V1 and V2 are properly triggered. + webhook_versions = [ + webhook["webhook"]["version"] for webhook in webhook_events + ] + self.assertEqual( + webhook_versions.count(2), + 3, + msg="Wrong number of V2 webhook events.", + ) + self.assertEqual( + webhook_versions.count(1), + 3, + msg="Wrong number of V1 webhook events.", + ) + + # Confirm deprecation date webhooks according the version. + v1_webhook_event = WebhookEvent.objects.filter( + webhook=self.webhook_enabled + ).first() + v2_webhook_event = WebhookEvent.objects.filter( + webhook=webhook_2_1 + ).first() + self.assertEqual( + v1_webhook_event.content["webhook"]["deprecation_date"], + get_webhook_deprecation_date(settings.WEBHOOK_V1_DEPRECATION_DATE), + ) self.assertEqual( - len(webhook_events), 3, msg="Webhook events didn't match." 
+ v2_webhook_event.content["webhook"]["deprecation_date"], None ) html_content = self.get_html_content_from_email(mail.outbox[1]) @@ -1656,7 +1724,7 @@ def test_percolator_plus_sweep_alerts_integration( return_value="alert_hits_percolator", ) class RECAPAlertsPercolatorTest( - RECAPSearchTestCase, ESIndexTestCase, TestCase, RECAPAlertsAssertions + RECAPSearchTestCase, ESIndexTestCase, TestCase, SearchAlertsAssertions ): """ RECAP Alerts Percolator Tests diff --git a/cl/api/api_permissions.py b/cl/api/api_permissions.py index 6562da789f..c5af0d0696 100644 --- a/cl/api/api_permissions.py +++ b/cl/api/api_permissions.py @@ -2,9 +2,9 @@ from django.conf import settings from django.contrib.auth.models import AnonymousUser, User -from django.http import HttpRequest from rest_framework import permissions from rest_framework.exceptions import PermissionDenied +from rest_framework.request import Request from rest_framework.views import APIView from cl.lib.redis_utils import get_redis_interface @@ -52,7 +52,7 @@ def is_user_v3_blocked(self, user: User) -> bool: return is_blocked_user @staticmethod - def is_v3_api_request(request: HttpRequest) -> bool: + def is_v3_api_request(request: Request) -> bool: return getattr(request, "version", None) == "v3" @staticmethod @@ -62,7 +62,7 @@ def check_request() -> bool: return True return False - def has_permission(self, request: HttpRequest, view: APIView) -> bool: + def has_permission(self, request: Request, view: APIView) -> bool: """Check if the user has permission to access the V3 API. :param request: The HTTPRequest object. diff --git a/cl/api/migrations/0013_add_webhook_version_choices_noop.py b/cl/api/migrations/0013_add_webhook_version_choices_noop.py new file mode 100644 index 0000000000..b19e00dbcb --- /dev/null +++ b/cl/api/migrations/0013_add_webhook_version_choices_noop.py @@ -0,0 +1,31 @@ +# Generated by Django 5.1.2 on 2024-11-14 23:36 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("api", "0012_alter_webhookevent_status_code"), + ] + + operations = [ + migrations.AlterField( + model_name="webhook", + name="version", + field=models.IntegerField( + choices=[(1, "v1"), (2, "v2")], + default=1, + help_text="The specific version of the webhook provisioned.", + ), + ), + migrations.AlterField( + model_name="webhookhistoryevent", + name="version", + field=models.IntegerField( + choices=[(1, "v1"), (2, "v2")], + default=1, + help_text="The specific version of the webhook provisioned.", + ), + ), + ] diff --git a/cl/api/migrations/0013_add_webhook_version_choices_noop.sql b/cl/api/migrations/0013_add_webhook_version_choices_noop.sql new file mode 100644 index 0000000000..074abd1666 --- /dev/null +++ b/cl/api/migrations/0013_add_webhook_version_choices_noop.sql @@ -0,0 +1,10 @@ +BEGIN; +-- +-- Alter field version on webhook +-- +-- (no-op) +-- +-- Alter field version on webhookhistoryevent +-- +-- (no-op) +COMMIT; diff --git a/cl/api/models.py b/cl/api/models.py index ca4cc8ed54..4ab2bac21d 100644 --- a/cl/api/models.py +++ b/cl/api/models.py @@ -16,6 +16,11 @@ class WebhookEventType(models.IntegerChoices): OLD_DOCKET_ALERTS_REPORT = 4, "Old Docket Alerts Report" +class WebhookVersions(models.IntegerChoices): + v1 = 1, "v1" + v2 = 2, "v2" + + HttpStatusCodes = models.IntegerChoices( # type: ignore "HttpStatusCodes", [(s.name, s.value) for s in HTTPStatus] # type: ignore[arg-type] ) @@ -48,7 +53,9 @@ class Webhook(AbstractDateTimeModel): help_text="An on/off switch for the webhook.", 
default=False ) version: models.IntegerField = models.IntegerField( - help_text="The specific version of the webhook provisioned.", default=1 + help_text="The specific version of the webhook provisioned.", + choices=WebhookVersions.choices, + default=WebhookVersions.v1, ) failure_count: models.IntegerField = models.IntegerField( help_text="The number of failures (400+ status) responses the webhook " @@ -57,7 +64,7 @@ class Webhook(AbstractDateTimeModel): ) def __str__(self) -> str: - return f"" + return f"" class WEBHOOK_EVENT_STATUS: diff --git a/cl/api/pagination.py b/cl/api/pagination.py index 9e79185d66..27ee3928d2 100644 --- a/cl/api/pagination.py +++ b/cl/api/pagination.py @@ -44,6 +44,8 @@ class VersionBasedPagination(PageNumberPagination): } ordering = "" cursor_ordering_fields = [] + is_count_request = False + count = 0 def __init__(self): super().__init__() @@ -88,6 +90,14 @@ def paginate_queryset(self, queryset, request, view=None): self.version = request.version self.request = request + self.is_count_request = ( + request.query_params.get("count") == "on" and self.version == "v4" + ) + + if self.is_count_request: + self.count = queryset.count() + return [] + do_cursor_pagination, requested_ordering = ( self.do_v4_cursor_pagination() ) @@ -103,10 +113,18 @@ def paginate_queryset(self, queryset, request, view=None): ) def get_paginated_response(self, data): + if self.is_count_request: + return Response({"count": self.count}) + do_cursor_pagination, _ = self.do_v4_cursor_pagination() if do_cursor_pagination: - # Get paginated response for CursorPagination - return self.cursor_paginator.get_paginated_response(data) + response = self.cursor_paginator.get_paginated_response(data) + # Build and include the count URL: + count_url = self.request.build_absolute_uri() + count_url = replace_query_param(count_url, "count", "on") + response.data["count"] = count_url + response.data.move_to_end("count", last=False) + return response # Get paginated response for PageNumberPagination return super().get_paginated_response(data) diff --git a/cl/api/tasks.py b/cl/api/tasks.py index ec1c5971ac..39c5fe7533 100644 --- a/cl/api/tasks.py +++ b/cl/api/tasks.py @@ -1,5 +1,4 @@ import json -from collections import defaultdict from typing import Any from elasticsearch_dsl.response import Hit @@ -12,13 +11,14 @@ from cl.api.webhooks import send_webhook_event from cl.celery_init import app from cl.corpus_importer.api_serializers import DocketEntrySerializer -from cl.lib.elasticsearch_utils import merge_highlights_into_result +from cl.lib.elasticsearch_utils import set_child_docs_and_score from cl.search.api_serializers import ( RECAPESWebhookResultSerializer, V3OAESResultSerializer, ) from cl.search.api_utils import ResultObject from cl.search.models import SEARCH_TYPES, DocketEntry +from cl.search.types import ESDictDocument @app.task() @@ -127,7 +127,7 @@ def send_es_search_alert_webhook( @app.task() def send_search_alert_webhook_es( - results: list[dict[str, Any]] | list[Hit], + results: list[ESDictDocument] | list[Hit], webhook_pk: int, alert_pk: int, ) -> None: @@ -152,34 +152,7 @@ def send_search_alert_webhook_es( es_results, many=True ).data case SEARCH_TYPES.RECAP: - for result in results: - child_result_objects = [] - child_docs = None - if isinstance(result, dict): - child_docs = result.get("child_docs") - elif hasattr(result, "child_docs"): - child_docs = result.child_docs - - if child_docs: - for child_doc in child_docs: - if isinstance(result, dict): - child_result_objects.append(child_doc) - 
else: - child_result_objects.append( - defaultdict( - lambda: None, - child_doc["_source"].to_dict(), - ) - ) - - result["child_docs"] = child_result_objects - # Merge HL into the parent document from percolator response. - if isinstance(result, dict): - meta_hl = result.get("meta", {}).get("highlight", {}) - merge_highlights_into_result( - meta_hl, - result, - ) + set_child_docs_and_score(results, merge_highlights=True) serialized_results = RECAPESWebhookResultSerializer( results, many=True ).data diff --git a/cl/api/templates/includes/toc_sidebar.html b/cl/api/templates/includes/toc_sidebar.html index 60a99cd69f..e609bfd201 100644 --- a/cl/api/templates/includes/toc_sidebar.html +++ b/cl/api/templates/includes/toc_sidebar.html @@ -33,6 +33,7 @@

Table of Contents

  • Parsing Uploaded Content
  • Filtering
  • Ordering
  • + Counting
  • Field Selection
  • Pagination
  • Rate Limits
  • diff --git a/cl/api/templates/migration-guide.html b/cl/api/templates/migration-guide.html index 8b9d421529..4d1dbf7822 100644 --- a/cl/api/templates/migration-guide.html +++ b/cl/api/templates/migration-guide.html @@ -31,13 +31,13 @@

    V4 API Migration Guide

    - After several years of planning and development we have released v4 of our APIs. + After several years of planning and development, we have released v4 of our APIs.

    This upgrade responds to feedback we have received over the years and should be much better for our users — faster, more featureful, more scalable, and more accurate.

    - Unfortunately, we couldn't make these new APIs completely backwards compatible so this guide explains what's new. + Unfortunately, we couldn't make these new APIs completely backwards compatible, so this guide explains what's new.

    Support

    @@ -71,10 +71,10 @@

    Timeline for Changes

    What If I Do Nothing?

    - You might be fine. Most of the database and search APIs are only changing slightly and v3 will be supported for some period of time. + You might be fine. Most of the database and search APIs are only changing slightly, and v3 will be supported for some period of time. But you should read this guide to see if any changes are needed to your application.

    -

    The remainder of this guide is in three section:

    +

    The remainder of this guide is in three sections:

    -

    For more details please see our blog.

    +

    For more details, please see our blog.

    Breaking Changes to v3 of the Search API

    We cannot continue running Solr forever, but we can do our best to support v3 of the API. To do this, on November 25, 2024, v3 of the Search API will be upgraded to use ElasticSearch. We expect this to support most uses, but it will cause some breaking changes, as outlined in this section. diff --git a/cl/api/templates/rest-docs-vlatest.html b/cl/api/templates/rest-docs-vlatest.html index 90bc8a7159..91e7205bcd 100644 --- a/cl/api/templates/rest-docs-vlatest.html +++ b/cl/api/templates/rest-docs-vlatest.html @@ -324,6 +324,30 @@

    Ordering

    Ordering by fields with duplicate values is non-deterministic. If you wish to order by such a field, you should provide a second field as a tie-breaker to consistently order results. For example, ordering by date_filed will not return consistent ordering for items that have the same date, but this can be fixed by ordering by date_filed,id. In that case, if two items have the same date_filed value, the tie will be broken by the id field.

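    For illustration, here is a minimal Python sketch of the tie-breaker pattern described above. It assumes an order_by query parameter and uses a placeholder API token and endpoint; adjust these for your own use.

    import requests

    # Placeholder token; substitute your own API token.
    headers = {"Authorization": "Token <your-token>"}

    # Ordering by date_filed alone is non-deterministic for items that share a
    # date. Appending "id" as a secondary key breaks ties consistently.
    params = {"order_by": "date_filed,id"}
    response = requests.get(
        "https://www.courtlistener.com/api/rest/v4/dockets/",
        params=params,
        headers=headers,
        timeout=30,
    )
    response.raise_for_status()
    for docket in response.json()["results"]:
        print(docket["id"], docket["date_filed"])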
    +

    Counting

    +

    To retrieve the total number of items matching your query without fetching all the data, you can use the count=on parameter. This is useful for verifying filters and understanding the scope of your query results without incurring the overhead of retrieving full datasets. +

    +
    curl "{% get_full_host %}{% url "opinion-list" version="v4" %}?cited_opinion=32239&count=on"
    +
    +{"count": 3302}
    +

    When count=on is specified:

    + +

    In standard paginated responses, a count key is included whose value is the URL for obtaining the total count for your query:

    +
    curl "{% get_full_host %}{% url "opinion-list" version="v4" %}?cited_opinion=32239"
    +
    +{
    +  "count": "https://www.courtlistener.com/api/rest/v4/opinions/?cited_opinion=32239&count=on",
    +  "next": "https://www.courtlistener.com/api/rest/v4/opinions/?cited_opinion=32239&cursor=2",
    +  "previous": null,
    +  "results": [
    +    // paginated results
    +  ]
    +}
    +

    You can follow this URL to get the total count of items matching your query.
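    As a rough illustration, here is a minimal Python sketch of that flow, reusing the cited_opinion filter from the example above (the token is a placeholder):

    import requests

    headers = {"Authorization": "Token <your-token>"}  # placeholder token
    base = "https://www.courtlistener.com/api/rest/v4/opinions/"

    # A standard paginated request returns a "count" key holding a URL.
    page = requests.get(
        base, params={"cited_opinion": "32239"}, headers=headers, timeout=30
    ).json()
    count_url = page["count"]

    # Following that URL (equivalent to adding count=on) returns only the total.
    total = requests.get(count_url, headers=headers, timeout=30).json()["count"]
    print(f"{total} opinions cite opinion 32239")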

    Field Selection

    To save bandwidth and increase serialization performance, fields can be limited by using the fields parameter with a comma-separated list of fields. diff --git a/cl/api/tests.py b/cl/api/tests.py index 63a8e14aa5..e523d46a80 100644 --- a/cl/api/tests.py +++ b/cl/api/tests.py @@ -1,5 +1,5 @@ import json -from datetime import date, timedelta +from datetime import date, datetime, timedelta, timezone from http import HTTPStatus from typing import Any, Dict from unittest import mock @@ -86,7 +86,7 @@ TagViewSet, ) from cl.search.factories import CourtFactory, DocketFactory -from cl.search.models import SOURCES, Docket, Opinion +from cl.search.models import SOURCES, Court, Docket, Opinion from cl.stats.models import Event from cl.tests.cases import SimpleTestCase, TestCase, TransactionTestCase from cl.tests.utils import MockResponse, make_client @@ -320,6 +320,56 @@ def test_recap_api_required_filter(self, mock_logging_prefix) -> None: r = self.client.get(path, {"pacer_doc_id__in": "17711118263,asdf"}) self.assertEqual(r.status_code, HTTPStatus.OK) + def test_count_on_query_counts(self, mock_logging_prefix) -> None: + """ + Check that a v4 API request with param `count=on` only performs + 2 queries to the database: one to check the authenticated user, + and another to select the count. + """ + with CaptureQueriesContext(connection) as ctx: + path = reverse("docket-list", kwargs={"version": "v4"}) + params = {"count": "on"} + self.client.get(path, params) + + self.assertEqual( + len(ctx.captured_queries), + 2, + msg=f"{len(ctx.captured_queries)} queries executed, 2 expected", + ) + + executed_queries = [query["sql"] for query in ctx.captured_queries] + expected_queries = [ + 'FROM "auth_user" WHERE "auth_user"."id" =', + 'SELECT COUNT(*) AS "__count"', + ] + for executed_query, expected_fragment in zip( + executed_queries, expected_queries + ): + self.assertIn( + expected_fragment, + executed_query, + msg=f"Expected query fragment not found: {expected_fragment}", + ) + + def test_standard_request_no_count_query( + self, mock_logging_prefix + ) -> None: + """ + Check that a v4 API request without param `count=on` doesn't perform + a count query. 
+ """ + with CaptureQueriesContext(connection) as ctx: + path = reverse("docket-list", kwargs={"version": "v4"}) + self.client.get(path) + + executed_queries = [query["sql"] for query in ctx.captured_queries] + for sql in executed_queries: + self.assertNotIn( + 'SELECT COUNT(*) AS "__count"', + sql, + msg="Unexpected COUNT query found in standard request.", + ) + class ApiEventCreationTestCase(TestCase): """Check that events are created properly.""" @@ -484,6 +534,8 @@ def setUpTestData(cls) -> None: cls.audio_path_v3 = reverse("audio-list", kwargs={"version": "v3"}) cls.audio_path_v4 = reverse("audio-list", kwargs={"version": "v4"}) + cls.debt_path_v4 = reverse("debt-list", kwargs={"version": "v4"}) + cls.debt_path_v3 = reverse("debt-list", kwargs={"version": "v3"}) def setUp(self) -> None: self.r = get_redis_interface("STATS") @@ -595,6 +647,27 @@ async def test_allow_v4_for_anonymous_users(self, mock_api_prefix) -> None: response = await self.async_client.get(self.audio_path_v4) self.assertEqual(response.status_code, HTTPStatus.OK) + async def test_confirm_v4_post_requests_are_not_allowed( + self, mock_api_prefix + ) -> None: + """Confirm V4 users are not allowed to POST requests.""" + response = await self.client_2.post(self.debt_path_v4, {}) + self.assertEqual(response.status_code, HTTPStatus.FORBIDDEN) + + async def test_confirm_v3_post_requests_are_not_allowed( + self, mock_api_prefix + ) -> None: + """Confirm V3 users are not allowed to POST requests.""" + response = await self.client_2.post(self.debt_path_v3, {}) + self.assertEqual(response.status_code, HTTPStatus.FORBIDDEN) + + async def test_confirm_anonymous_post_requests_are_not_allowed( + self, mock_api_prefix + ) -> None: + """Confirm anonymous users are not allowed to POST requests.""" + response = await self.async_client.post(self.debt_path_v4, {}) + self.assertEqual(response.status_code, HTTPStatus.UNAUTHORIZED) + class DRFOrderingTests(TestCase): """Does ordering work generally and specifically?""" @@ -643,12 +716,203 @@ async def assertCountInResults(self, expected_count): f"the JSON: \n{r.json()}", ) got = len(r.data["results"]) + try: + path = r.request.get("path") + query_string = r.request.get("query_string") + url = f"{path}?{query_string}" + except AttributeError: + url = self.path self.assertEqual( got, expected_count, - msg=f"Expected {expected_count}, but got {got}.\n\nr.data was: {r.data}", + msg=f"Expected {expected_count}, but got {got} in {url}\n\nr.data was: {r.data}", + ) + return r + + +class DRFCourtApiFilterTests(TestCase, FilteringCountTestCase): + @classmethod + def setUpTestData(cls): + Court.objects.all().delete() + + cls.parent_court = CourtFactory( + id="parent1", + full_name="Parent Court", + short_name="PC", + citation_string="PC", + in_use=True, + has_opinion_scraper=True, + has_oral_argument_scraper=False, + position=1, + start_date=date(2000, 1, 1), + end_date=None, + jurisdiction=Court.FEDERAL_APPELLATE, + date_modified=datetime(2021, 1, 1, tzinfo=timezone.utc), + ) + + cls.child_court1 = CourtFactory( + id="child1", + parent_court=cls.parent_court, + full_name="Child Court 1", + short_name="CC1", + citation_string="CC1", + in_use=False, + has_opinion_scraper=False, + has_oral_argument_scraper=True, + position=2, + start_date=date(2010, 6, 15), + end_date=date(2020, 12, 31), + jurisdiction=Court.STATE_SUPREME, + date_modified=datetime(2022, 6, 15, tzinfo=timezone.utc), + ) + cls.child_court2 = CourtFactory( + id="child2", + parent_court=cls.parent_court, + full_name="Child Court 2", + 
short_name="CC2", + citation_string="CC2", + in_use=True, + has_opinion_scraper=False, + has_oral_argument_scraper=False, + position=3, + start_date=date(2015, 5, 20), + end_date=None, + jurisdiction=Court.STATE_TRIAL, + date_modified=datetime(2023, 3, 10, tzinfo=timezone.utc), + ) + + cls.orphan_court = CourtFactory( + id="orphan", + full_name="Orphan Court", + short_name="OC", + citation_string="OC", + in_use=True, + has_opinion_scraper=False, + has_oral_argument_scraper=False, + position=4, + start_date=date(2012, 8, 25), + end_date=None, + jurisdiction=Court.FEDERAL_DISTRICT, + date_modified=datetime(2023, 5, 5, tzinfo=timezone.utc), ) + @async_to_sync + async def setUp(self): + self.path = reverse("court-list", kwargs={"version": "v4"}) + self.q: Dict[str, Any] = {} + + async def test_parent_court_filter(self): + """Can we filter courts by parent_court id?""" + self.q["parent_court"] = "parent1" + # Should return child1 and child2: + response = await self.assertCountInResults(2) + + # Verify the returned court IDs + court_ids = [court["id"] for court in response.data["results"]] + self.assertEqual(set(court_ids), {"child1", "child2"}) + + # Filter for courts with parent_court id='orphan' (none should match) + self.q = {"parent_court": "orphan"} + await self.assertCountInResults(0) + + async def test_no_parent_court_filter(self): + """Do we get all courts when using no filters?""" + self.q = {} + await self.assertCountInResults(4) # Should return all four courts + + async def test_invalid_parent_court_filter(self): + """Do we handle invalid parent_court values correctly?""" + self.q["parent_court"] = "nonexistent" + await self.assertCountInResults(0) + + async def test_id_filter(self): + """Can we filter courts by id?""" + self.q["id"] = "child1" + response = await self.assertCountInResults(1) + self.assertEqual(response.data["results"][0]["id"], "child1") + + async def test_in_use_filter(self): + """Can we filter courts by in_use field?""" + self.q = {"in_use": "true"} + await self.assertCountInResults(3) # parent1, child2, orphan + self.q = {"in_use": "false"} + await self.assertCountInResults(1) # child1 + + async def test_has_opinion_scraper_filter(self): + """Can we filter courts by has_opinion_scraper field?""" + self.q = {"has_opinion_scraper": "true"} + await self.assertCountInResults(1) # parent1 + self.q = {"has_opinion_scraper": "false"} + await self.assertCountInResults(3) # child1, child2, orphan + + async def test_has_oral_argument_scraper_filter(self): + """Can we filter courts by has_oral_argument_scraper field?""" + self.q = {"has_oral_argument_scraper": "true"} + await self.assertCountInResults(1) # child1 + self.q = {"has_oral_argument_scraper": "false"} + await self.assertCountInResults(3) # parent1, child2, orphan + + async def test_position_filter(self): + """Can we filter courts by position with integer lookups?""" + self.q = {"position__gt": "2"} + await self.assertCountInResults(2) # child2 (3), orphan (4) + self.q = {"position__lte": "2"} + await self.assertCountInResults(2) # parent1 (1), child1 (2) + + async def test_start_date_filter(self): + """Can we filter courts by start_date with date lookups?""" + self.q = {"start_date__year": "2015"} + await self.assertCountInResults(1) # child2 (2015-05-20) + self.q = {"start_date__gte": "2010-01-01"} + await self.assertCountInResults(3) # child1, child2, orphan + + async def test_end_date_filter(self): + """Can we filter courts by end_date with date lookups?""" + self.q = {"end_date__day": "31"} + await 
self.assertCountInResults(1) # parent1, child2, orphan + self.q = {"end_date__year": "2024"} + await self.assertCountInResults(0) + + async def test_short_name_filter(self): + """Can we filter courts by short_name with text lookups?""" + self.q = {"short_name__iexact": "Cc1"} + await self.assertCountInResults(1) # child1 + self.q = {"short_name__icontains": "cc"} + await self.assertCountInResults(2) # child1, child2 + + async def test_full_name_filter(self): + """Can we filter courts by full_name with text lookups?""" + self.q = {"full_name__istartswith": "Child"} + await self.assertCountInResults(2) # child1, child2 + self.q = {"full_name__iendswith": "Court"} + await self.assertCountInResults(2) # parent1, orphan + + async def test_citation_string_filter(self): + """Can we filter courts by citation_string with text lookups?""" + self.q = {"citation_string": "OC"} + await self.assertCountInResults(1) # orphan + self.q = {"citation_string__icontains": "2"} + await self.assertCountInResults(1) # child2 + + async def test_jurisdiction_filter(self): + """Can we filter courts by jurisdiction?""" + self.q = { + "jurisdiction": [ + Court.FEDERAL_APPELLATE, + Court.FEDERAL_DISTRICT, + ] + } + await self.assertCountInResults(2) # parent1 and orphan + + async def test_combined_filters(self): + """Can we filter courts with multiple filters applied?""" + self.q = { + "in_use": "true", + "has_opinion_scraper": "false", + "position__gt": "2", + } + await self.assertCountInResults(2) # child2 and orphan + class DRFJudgeApiFilterTests( SimpleUserDataMixin, TestCase, FilteringCountTestCase @@ -2561,3 +2825,100 @@ async def test_avoid_logging_not_successful_webhook_events( self.assertEqual(await webhook_events.acount(), 2) # Confirm no milestone event should be created. self.assertEqual(await milestone_events.acount(), 0) + + +class CountParameterTests(TestCase): + @classmethod + def setUpTestData(cls) -> None: + cls.user_1 = UserProfileWithParentsFactory.create( + user__username="recap-user", + user__password=make_password("password"), + ) + permissions = Permission.objects.filter( + codename__in=["has_recap_api_access", "has_recap_upload_access"] + ) + cls.user_1.user.user_permissions.add(*permissions) + + cls.court_canb = CourtFactory(id="canb") + cls.court_cand = CourtFactory(id="cand") + + cls.url = reverse("docket-list", kwargs={"version": "v4"}) + + for i in range(7): + DocketFactory( + court=cls.court_canb, + source=Docket.RECAP, + pacer_case_id=str(100 + i), + ) + for i in range(5): + DocketFactory( + court=cls.court_cand, + source=Docket.HARVARD, + pacer_case_id=str(200 + i), + ) + + def setUp(self): + self.client = make_client(self.user_1.user.pk) + + async def test_count_on_returns_only_count(self): + """ + Test that when 'count=on' is specified, the API returns only the count. + """ + params = {"count": "on"} + response = await self.client.get(self.url, params) + + self.assertEqual(response.status_code, 200) + # The response should only contain the 'count' key + self.assertEqual(list(response.data.keys()), ["count"]) + self.assertIsInstance(response.data["count"], int) + # The count should match the total number of dockets + expected_count = await Docket.objects.acount() + self.assertEqual(response.data["count"], expected_count) + + async def test_standard_response_includes_count_url(self): + """ + Test that the standard response includes a 'count' key with the count URL. 
+ """ + response = await self.client.get(self.url) + + self.assertEqual(response.status_code, 200) + self.assertIn("count", response.data) + count_url = response.data["count"] + self.assertIsInstance(count_url, str) + self.assertIn("count=on", count_url) + + async def test_invalid_count_parameter(self): + """ + Test that invalid 'count' parameter values are handled appropriately. + """ + params = {"count": "invalid"} + response = await self.client.get(self.url, params) + + self.assertEqual(response.status_code, 200) + # The response should be the standard paginated response + self.assertIn("results", response.data) + self.assertIsInstance(response.data["results"], list) + + async def test_count_with_filters(self): + """ + Test that the count returned matches the filters applied. + """ + params = {"court": "canb", "source": Docket.RECAP, "count": "on"} + response = await self.client.get(self.url, params) + + self.assertEqual(response.status_code, 200) + expected_count = await Docket.objects.filter( + court__id="canb", + source=Docket.RECAP, + ).acount() + self.assertEqual(response.data["count"], expected_count) + + async def test_count_with_no_results(self): + """ + Test that 'count=on' returns zero when no results match the filters. + """ + params = {"court": "cand", "source": Docket.RECAP, "count": "on"} + response = await self.client.get(self.url, params) + + self.assertEqual(response.status_code, 200) + self.assertEqual(response.data["count"], 0) diff --git a/cl/api/utils.py b/cl/api/utils.py index 3068cd4369..45188fbbfe 100644 --- a/cl/api/utils.py +++ b/cl/api/utils.py @@ -28,7 +28,12 @@ from rest_framework_filters import FilterSet, RelatedFilter from rest_framework_filters.backends import RestFrameworkFilterBackend -from cl.api.models import WEBHOOK_EVENT_STATUS, Webhook, WebhookEvent +from cl.api.models import ( + WEBHOOK_EVENT_STATUS, + Webhook, + WebhookEvent, + WebhookVersions, +) from cl.citations.utils import filter_out_non_case_law_and_non_valid_citations from cl.lib.redis_utils import get_redis_interface from cl.stats.models import Event @@ -878,12 +883,47 @@ class WebhookKeyType(TypedDict): deprecation_date: str | None +def get_webhook_deprecation_date(webhook_deprecation_date: str) -> str: + """Convert a webhook deprecation date string to ISO-8601 format with + UTC timezone. + + :param webhook_deprecation_date: The deprecation date as a string in + "YYYY-MM-DD" format. + :return: The ISO-8601 formatted date string with UTC timezone. + """ + + deprecation_date = ( + datetime.strptime(webhook_deprecation_date, "%Y-%m-%d") + .replace( + hour=0, minute=0, second=0, microsecond=0, tzinfo=timezone.utc + ) + .isoformat() + ) + return deprecation_date + + def generate_webhook_key_content(webhook: Webhook) -> WebhookKeyType: + """Generate a dictionary representing the content for the webhook key. + + :param webhook: The Webhook instance. + :return: A dictionary containing webhook details, event type, version, + creation date in ISO format, and deprecation date according webhook version. 
+ """ + + deprecation_date: str | None = None + match webhook.version: + case WebhookVersions.v1: + deprecation_date = get_webhook_deprecation_date( + settings.WEBHOOK_V1_DEPRECATION_DATE # type: ignore + ) + case WebhookVersions.v2: + deprecation_date = None + return { "event_type": webhook.event_type, "version": webhook.version, "date_created": webhook.date_created.isoformat(), - "deprecation_date": None, + "deprecation_date": deprecation_date, } diff --git a/cl/api/webhooks.py b/cl/api/webhooks.py index f6ca97d9e3..15f1d3cabf 100644 --- a/cl/api/webhooks.py +++ b/cl/api/webhooks.py @@ -13,7 +13,12 @@ ) from cl.alerts.models import Alert from cl.alerts.utils import OldAlertReport -from cl.api.models import Webhook, WebhookEvent, WebhookEventType +from cl.api.models import ( + Webhook, + WebhookEvent, + WebhookEventType, + WebhookVersions, +) from cl.api.utils import ( generate_webhook_key_content, update_webhook_event_after_request, @@ -23,6 +28,7 @@ from cl.recap.api_serializers import PacerFetchQueueSerializer from cl.recap.models import PROCESSING_STATUS, PacerFetchQueue from cl.search.api_serializers import ( + OpinionClusterWebhookResultSerializer, SearchResultSerializer, V3OpinionESResultSerializer, ) @@ -192,10 +198,17 @@ def send_search_alert_webhook( ).data else: # ES results serialization - serialized_results = V3OpinionESResultSerializer( - results, - many=True, - ).data + match webhook.version: + case WebhookVersions.v1: + serialized_results = V3OpinionESResultSerializer( + results, + many=True, + ).data + case WebhookVersions.v2: + serialized_results = OpinionClusterWebhookResultSerializer( + results, + many=True, + ).data post_content = { "webhook": generate_webhook_key_content(webhook), diff --git a/cl/assets/static-global/css/opinions.css b/cl/assets/static-global/css/opinions.css new file mode 100644 index 0000000000..ff1b0200d3 --- /dev/null +++ b/cl/assets/static-global/css/opinions.css @@ -0,0 +1,721 @@ + + +/*Wrap all our changes around an opinion-body class we load up + in the opinion template*/ + +.opinion-body { + + .harvard > * { + font-family: Merriweather, "Times New Roman", Times, serif; + font-size: 15px; + letter-spacing: 0.2px; + text-align: justify; + padding:0px; + margin: 0px; + background-color: white; + border: none; + line-height: 2.3em; + } + + #headmatter > parties { + text-align: center; + font-style: initial; + font-size: 2em; + display: block; + } + #headmatter > div.footnotes > .footnote > p { + line-height: 1em; + } + + #headmatter > * { + text-indent: 2em; + } + + #headmatter docketnumber, + #headmatter court, + #headmatter parties, + #headmatter attorneys, + #headmatter syllabus, + #headmatter decisiondate { + display: block; + } + + #headmatter > div.footnotes { + border-top: None; + padding-top: 1em; + } + + .jump-links > a{ + position: relative; + margin: -8px 20px 0 0; + width: 140px; + line-height: 18px; + font-size: 14px; + cursor: pointer; + white-space: nowrap; + text-overflow: ellipsis; + opacity: 1; + } + + .hr-opinion { + border-top: 2px solid black; + } + + /*Clean up the Case Caption section to look large and clean*/ + .case-caption { + font-size: 3em; + font-weight: 500; + text-align: left; + line-height: 1.1em; + margin-top: 50px; + } + + + .case-court { + font-size: 25px; + text-align: left; + } + +/*Update sidebar jump links to look nice*/ +.jump-links { + font-size: 12px; + padding-top: 5px; +} + + li.jump-links.active { + color: #B53C2C; + font-weight: bold; + } + + li.jump-links { + list-style-type: none; + padding-left: 
0; + } + + li.jump-links::before { + content: ""; + border-left: 3px solid lightgrey; + height: 1em; + padding-right: 8px; + display: inline-block; + margin-right: 5px; + } + + li.jump-links.active::before { + content: ""; + border-left: 2px solid #B53C2C; + padding-right: 8px; + display: inline-block; + margin-right: 5px; + } + + + .jump-links { + font-size: 12px; + padding-top: 5px; +} + +li.jump-links { + height:2.5em; + list-style-type: none; + padding-left: 0; + position: relative; +} + +li.jump-links::before { + content: ""; + border-left: 2px solid lightgrey; + height: 100%; + position: absolute; + left: 0; + top: 0; + padding-right: 8px; + display: inline-block; +} + +/* Active link styles */ +li.jump-links > a.active { + font-weight: 500; + color: black; +} + +li.jump-links > a { + padding-left:10px; + color: black; +} + + +div.footnote:first-of-type { + border-top: 1px solid black; + width: 100%; + display: block; + } + + /*Columbia specific Fix*/ + /*Columbia/HTML Law box special footnotes data almost awlays starts with fn1*/ + footnote_body sup#fn1 { + padding-top: 10px; + border-top: 1px solid black; + width: 100%; + display: block; + } + + /*HTML law box page numbers*/ + strong[data-ref] { + font-size: 0.8em; + fon: italic; + } + + strong[data-ref]::before { + content: attr(data-ref); + display: inline; + position: relative; + float: right; + left: -.5em; + font-size: 0.8em; + color: dimgray; + width: 0; + } + + + div.footnote { + padding-top: 10px; + display: block; + line-height: 1em; + } + + div.footnote > p { + display: inline; + } + + div.footnote::before { + content: attr(label) " "; + font-weight: bold; + color: #000; + margin-right: 5px; + padding-top: 2em; + } + + div.footnote { + padding-top: 10px; + font-size: 12px; + } + + div.footnote > * { + padding-top: 10px; + font-size: 12px; + } + + + /*To help separate footnotes from opinion document*/ + footnote:first-of-type { + border-top: 1px solid black; + width: 100%; + display: block; + } + + footnote { + padding-top: 10px; + display: block; + line-height: 1.5em; + /*margin-left: 1em;*/ + padding-left: 40px; + } + + footnote > p { + display: inline; + } + + footnote::before { + content: attr(label); + font-weight: bold; + color: #000; + margin-right: 26px; + padding-top: 2em; + margin-left: -35px; + } + + /*Handle CSS in Columbia opinions*/ + footnotemark { + font-weight: bold; + font-size: 0.8em; + vertical-align: super; + line-height: 0; + } + + + #cited-by { + z-index: 1; + } + + footnotemark { + cursor: pointer; + color: blue; + text-decoration: underline; + } + + footnote { + padding-top: 10px; + font-size: 12px; + } + + + .jumpback { + color: blue; + cursor: pointer; + font-weight: bold; + margin-left: 5px; + } + + /*Jump backs are empty in resource.org documents for now*/ + #resource-org-text .jumpback { + display: none; + } + + + footnote > * { + font-size: 12px; + } + + author > page-number { + display: block; + font-size: 15px; + } + + author { + display: inline; + margin: 0; /* Remove any default margin */ + text-indent: 2em; /* Indents the first line by 2em */ + } + + /*Important for indenting harvard opinions correctly*/ + opinion > p[id^="b"] { + text-indent: 2em; + } + + + opinion > [id^="p-"] { + padding-left: 2em; + text-indent: 2em; + } +} + +[id^="A"] { + text-indent: 2em; + display: inline; + +} + +.opinion-body { + /*I think i did this but i dont know why so im leaving it for now*/ + /*.tab-pane {*/ + /* display: none; */ + /*}*/ + + .tab-pane.active { + display: block; + } + + @media (min-width: 
767px) { + + #sidebar { + display: flex; + flex-direction: column; + height: 100vh; + justify-content: space-between; /* Push content apart */ + padding: 20px; + padding-top: 3px; + overflow-y: auto; + position: -webkit-sticky; /* For Safari */ + position: sticky; + top: 0; /* Stick to the top of the viewport */ + + } + } + + @media (min-width: 100px) { + #sidebar { + height: auto; + } + } + + .sidebar-bottom { + margin-top: auto; + } + + .support-flp, .sponsored-by { + margin-bottom: 20px; + text-align: center; + } + + #opinion > article > * > p { + text-indent: 2em; + } + + .active > a { + border-bottom-color: #B53C2C; + } + + #opinion p { + text-indent: 2em; + } + + + .nav-pills > li > a { + padding: 1px 15px; + } + + blockquote > * { + text-indent: 0em; + } + + sup { + font-size: .9em; + } + + .main-document { + padding-bottom: 5em; + } + + /*Case Caption CSS*/ + #caption-square { + background-color: #F6F2EE; + margin-left: -15px; + margin-right: -15px; + margin-top: -20px; + } + + #caption-square > ul > li { + background-color: #fcfaf9; + border-top-right-radius: 5px 5px; /* Rounds the corners */ + border-top-left-radius: 5px 5px; /* Rounds the corners */ + margin-left: 4px; + } + + #caption-square > ul > li.active { + background-color: #ffffff; + border-bottom: 1px solid lightgrey; + } + + #caption-square > ul > li.active { + background-color: #ffffff; + border-bottom: 1px solid white; + } + + #caption-square > ul > li.active > a { + border: 1px solid white; + } + + /*Opinion Date File*/ + + .case-date-new { + border: 1px solid #B53C2C; + padding: 0px 10px; + border-radius: 20px; /* Rounds the corners */ + color: #B53C2C; + } + + + + /*Buttons on Top of Page*/ + .add-a-note { + margin-left: 5px; + border: 1px solid black; + border-radius: 10px; + padding-left: 8px; + padding-right: 8px; + } + + .add-citation-alert { + border: 1px solid black; + border-radius: 10px; + padding-left: 8px; + padding-right: 8px; + } + + cross_reference { + font-style: italic; + } + + #opinion-caption { + margin-top: 20px; + font-family: Merriweather, "Times New Roman", Times, serif; + font-size: 15px; + letter-spacing: 0.2px; + line-height: 2.3em; + margin-bottom: 20px; + padding-left: 20px; + padding-top: 10px; + padding-right: 10px; + } + + .case-details { + font-size: 16px; + } + + .case-details li { + line-height: 1.5em; + } + + span.citation.no-link { + font-style: italic; + } + + .opinion-button-row { + padding-top: 40px; + } + + #download-original { + color: black; + border-color: black; + background-color: white; + } + + + #btn-group-download-original { + float:right; + margin-top: 0px; + margin-left:10px; + padding-right: 10px; + } + #add-note-button { + color: black; + border-color: black; + background-color: white; + } + + .top-row { + height: 32px; + line-height:28px + } + + .action-buttons{ + display: flex; + column-gap: 5px; + } + + #get-citation-btn-group { + float:right; + } + + #get-citation-btn-group > a { + + color: black; + border-color: black; + background-color: white; + vertical-align: top; + } + + + p > span.star-pagination::after { + display: inline; + position: relative; + content: attr(label);; + float: left; + left: -4.5em; + font-size: 1em; + color: dimgray; + width: 0; + } + + div > span.star-pagination::after { + display: inline; + position: relative; + content: attr(label);; + float: left; + left: -2.5em; + font-size: 1em; + color: dimgray; + width: 0; + } + + div.subopinion-content > .harvard { + font-family: Merriweather, "Times New Roman", Times, serif; + font-size: 
15px; + letter-spacing: 0.2px; + line-height: 2.3em; + text-align: justify; + } + + #columbia-text { + font-family: Merriweather, "Times New Roman", Times, serif; + font-size: 15px; + letter-spacing: 0.2px; + line-height: 2.3em; + text-align: justify; + } + + #columbia-text > div.subopinion-content > div > p > span.star-pagination { + color: #555555; + } + + #columbia-text > div.subopinion-content > div > p > span.star-pagination::after { + display: inline; + position: relative; + content: attr(label);; + float: left; + left: -4.5em; + font-size: 1em; + color: dimgray; + width: 0; + } + + + page-number::after { + display: inline; + position: relative; + content: attr(label); + float: right; + font-size: 1em; + color: dimgray; + width: 0; + } + + page-number { + font-style: italic; + font-size: 0.8em; + margin-right: 4px; + margin-left: 2px; + } + + page-label { + font-style: italic; + font-size: 0.8em; + margin-right: 4px; + margin-left: 2px; + } + + page-label { + cursor: pointer; + } + + page-label:hover { + color: darkblue; + text-decoration: underline; /* Example hover styling */ + } + + page-label::after { + display: inline; + position: relative; + content: attr(data-label); + float: right; + font-size: 1em; + color: dimgray; + width: 0; + } + + a.page-label { + font-style: italic; + font-size: 0.8em; + margin-right: 4px; + margin-left: 2px; + color: #555555; + } + + + a.page-label::after { + display: inline; + position: relative; + content: attr(data-label); + float: right; + font-size: 1em; + color: dimgray; + width: 0; + } + + footnote > blockquote > a.page-label::after { + right: -1.0em; + } + + blockquote[id^="A"] > a.page-label::after { + right: -1.0em; + } + + blockquote[id^="b"] > a.page-label::after { + right: -1.0em; + } + + opinion > a.page-label::after { + right: -1.0em; + text-indent: 0; + } + + .harvard a.page-label::after { + right: -1.0em; + text-indent: 0; + position: absolute; + } + + /* Adjust to move the entire blockquote to the right */ + blockquote { + margin-left: 3em; + display: block; + } + + div.counsel > a.page-label::after { + right: -1.0em; + } + + footnote > p > a.page-label::after { + display: none; + } + + footnote > blockquote > a.page-label::after { + display: none; + } + + /*Remove the header on the opinion page so its flush*/ + header { + margin-bottom: 0px; + } + + .harvard > opinion > author { + line-height: inherit; + font-size: inherit; + display: inline-block; + } + + .container > .content { + margin-bottom: 0em; + } + + .meta-data-header { + font-size:15px; + } + + .case-details { + font-family: Merriweather, "Times New Roman", Times, serif; + letter-spacing: 0.2px; + line-height:2.3em; + } + + .opinion-section-title { + margin-top: 50px; + font-family: Merriweather, "Times New Roman", Times, serif; + } + + /*Add style to align roman numerals */ + .center-header { + text-align: center; + font-size: 2em; + } + + /*If XS screen - remove the side page labels*/ + @media (max-width: 768px) { + a.page-label::after { + display: none; + } + a.page-number::after { + display: none; + } + } + + .scraped-html p { + display: block; + text-indent: 1em; + } +} + +html { + scroll-behavior: smooth; +} \ No newline at end of file diff --git a/cl/assets/static-global/css/override.css b/cl/assets/static-global/css/override.css index 02f4be2062..ce071745d2 100644 --- a/cl/assets/static-global/css/override.css +++ b/cl/assets/static-global/css/override.css @@ -155,7 +155,30 @@ header { /* Standard target color. 
*/ *:target { - background-color: lightyellow; + -webkit-animation: target-fade 3s; + -moz-animation: target-fade 3s; + -o-animation: target-fade 3s; + animation: target-fade 3s; +} + +@-webkit-keyframes target-fade { + from { background-color: lightyellow; } + to { background-color: transparent; } +} + +@-moz-keyframes target-fade { + from { background-color: lightyellow; } + to { background-color: transparent; } +} + +@-o-keyframes target-fade { + from { background-color: lightyellow; } + to { background-color: transparent; } +} + +@keyframes target-fade { + from { background-color: lightyellow; } + to { background-color: transparent; } } .alt { @@ -1008,17 +1031,9 @@ closely the content in the book*/ #headmatter > .footnotes > .footnote > a { color: #000099; - position: absolute; font-size: 1em; } -#headmatter { - border: 1px rgba(210, 210, 210, 0.55) solid; - padding: 10px; - background: rgba(232, 232, 232, 0.37); - margin: 10px; -} - #headmatter > attorneys, docketnumbers, judges, footnotes, court, decisiondate { line-height: 2em; font-size: 14px; @@ -1607,7 +1622,7 @@ textarea { /* Prevent images inside opinion from overflowing */ -#opinion-content img { +div.subopinion-content img { max-width: 100%; height: auto; } diff --git a/cl/assets/static-global/js/base.js b/cl/assets/static-global/js/base.js index 99355aa207..9d69f158bb 100644 --- a/cl/assets/static-global/js/base.js +++ b/cl/assets/static-global/js/base.js @@ -307,11 +307,8 @@ $(document).ready(function () { if (modal_exist) { $('#open-modal-on-load').modal(); } - }); - - // Debounce - rate limit a function // https://davidwalsh.name/javascript-debounce-function function debounce(func, wait, immediate) { @@ -369,3 +366,4 @@ if (form && button) { button.disabled = true; }); } + diff --git a/cl/assets/static-global/js/opinions.js b/cl/assets/static-global/js/opinions.js new file mode 100644 index 0000000000..984999c239 --- /dev/null +++ b/cl/assets/static-global/js/opinions.js @@ -0,0 +1,291 @@ + +//////////////// +// Pagination // +//////////////// + +// Star pagination weirdness for ANON 2020 dataset - + +$('.star-pagination').each(function (index, element) { + if ($(this).attr('pagescheme')) { + // For ANON 2020 this has two sets of numbers but only one can be + // verified with other databses so only showing one + var number = $(this).attr('number'); + if (number.indexOf('P') > -1) { + $(this).attr('label', ''); + } else { + $(this).attr('label', number); + } + } else { + $(this).attr('label', this.textContent.trim().replace('*Page ', '')); + } +}); + +// Systematize page numbers +$('page-number').each(function (index, element) { + // Get the label and citation index from the current element + const label = $(this).attr('label'); + const citationIndex = $(this).attr('citation-index'); + + // Clean up the label (remove '*') and use it for the new href and id + const cleanLabel = label.replace('*', '').trim(); + + // Create the new element + const $newAnchor = $('') + .addClass('page-label') + .attr('data-citation-index', citationIndex) + .attr('data-label', cleanLabel) + .attr('href', '#' + cleanLabel) + .attr('id', cleanLabel) + .text('*' + cleanLabel); + + // Replace the element with the new element + $(this).replaceWith($newAnchor); +}); + +// Systematize page numbers +$('span.star-pagination').each(function (index, element) { + // Get the label and citation index from the current element + const label = $(this).attr('label'); + const citationIndex = $(this).attr('citation-index'); + + // Clean up the label (remove '*') 
and use it for the new href and id + const cleanLabel = label.replace('*', '').trim(); + + // Create the new element + const $newAnchor = $('') + .addClass('page-label') + .attr('data-citation-index', citationIndex) + .attr('data-label', cleanLabel) + .attr('href', '#' + cleanLabel) + .attr('id', cleanLabel) + .text('*' + cleanLabel); + + // Replace the element with the new element + $(this).replaceWith($newAnchor); +}); +// Fix weird data-ref bug +document.querySelectorAll('strong').forEach((el) => { + if (/\[\d+\]/.test(el.textContent)) { + // Check if the text matches the pattern [XXX] + const match = el.textContent.match(/\[\d+\]/)[0]; // Get the matched pattern + el.setAttribute('data-ref', match); // Set a data-ref attribute + } +}); + +/////////////// +// Footnotes // +/////////////// + + + + +// We formatted the harvard footnotes oddly when they appeared inside the pre-opinion content. +// this removes the excess a tags and allows us to standardize footnotes across our contents +// footnote cleanup in harvard +// Update and modify footnotes to enable linking + + // This is needed for variations in resource.org footnotes +// This is needed for variations in resource.org footnotes +$('.footnotes > .footnote').each(function () { + var $this = $(this); + var newElement = $(''); // Create a new element + + // Copy attributes and content from the original element + $.each(this.attributes, function (_, attr) { + newElement.attr(attr.name, attr.value); + }); + newElement.html($this.html()); // Copy the inner content + $this.replaceWith(newElement); // Replace the original

    with +}); + + +$('div.footnote > a').remove(); +const headfootnotemarks = $('a.footnote'); +const divfootnotes = $('div.footnote'); + +if (headfootnotemarks.length === divfootnotes.length) { + headfootnotemarks.each(function (index) { + const footnoteMark = $(this); + const footnote = divfootnotes.eq(index); + + const $newElement = $(''); + $.each(footnoteMark.attributes, function () { + if (footnoteMark.specified) { + $newElement.attr(footnoteMark.name, footnoteMark.value); + } + }); + $newElement.html(footnoteMark.html()); + footnoteMark.replaceWith($newElement); + + const $newFootnote = $(''); + $.each(footnote.attributes, function () { + if (footnote.specified) { + $newFootnote.attr(footnote.name, footnote.value); + } + }); + $newFootnote.attr('label', footnote.attr('label')); + $newFootnote.html(footnote.html()); + footnote.replaceWith($newFootnote); + }); +} + +// This fixes many of the harvard footnotes so that they can +// easily link back and forth - we have a second set +// of harvard footnotes inside headnotes that need to be parsed out now +// okay. + +const footnoteMarks = $('footnotemark'); +const footnotes = $('footnote').not('[orphan="true"]'); + +if (footnoteMarks.length === footnotes.length) { + // we can make this work + footnoteMarks.each(function (index) { + const footnoteMark = $(this); + const $newElement = $(''); + // Copy attributes from the old element + $.each(footnoteMark.attributes, function () { + if (footnoteMark.specified) { + $newElement.attr(footnoteMark.name, footnoteMark.value); + } + }); + $newElement.html(footnoteMark.html()); + const $supElement = $('').append($newElement); + footnoteMark.replaceWith($supElement); + const footnote = footnotes.eq(index); + $newElement.attr('href', `#fn${index}`); + $newElement.attr('id', `fnref${index}`); + footnote.attr('id', `fn${index}`); + + const $jumpback = $(''); + $jumpback.attr('href', `#fnref${index}`); + + footnote.append($jumpback); + }); +} else { + // If the number of footnotes and footnotemarks are inconsistent use the method to scroll to the nearest one + // we dont use this by default because many older opinions will reuse * ^ and other icons repeatedly on every page + // and so label is no usable to identify the correct footnote. 
+ + footnotes.each(function (index) { + const $jumpback = $(''); + $jumpback.attr('label', $(this).attr('label')); + $(this).append($jumpback); + }); + + // There is no silver bullet for footnotes + $('footnotemark').on('click', function () { + const markText = $(this).text().trim(); // Get the text of the clicked footnotemark + const currentScrollPosition = $(window).scrollTop(); // Get the current scroll position + + // Find the first matching footnote below the current scroll position + const targetFootnote = $('footnote') + .filter(function () { + return $(this).attr('label') === markText && $(this).offset().top > currentScrollPosition; + }) + .first(); + + // If a matching footnote is found, scroll to it + if (targetFootnote.length > 0) { + $('html, body').animate( + { + scrollTop: targetFootnote.offset().top, + }, + 500 + ); // Adjust the animation duration as needed + } else { + // console.warn('No matching footnote found below the current position for:', markText); + } + }); + + + ////////////// + // Sidebar // + ///////////// + + $('.jumpback').on('click', function () { + const footnoteLabel = $(this).attr('label').trim(); // Get the label attribute of the clicked footnote + const currentScrollPosition = $(window).scrollTop(); // Get the current scroll position + + // Find the first matching footnotemark above the current scroll position + const targetFootnotemark = $('footnotemark') + .filter(function () { + return $(this).text().trim() === footnoteLabel && $(this).offset().top < currentScrollPosition; + }) + .last(); + + // If a matching footnotemark is found, scroll to it + if (targetFootnotemark.length > 0) { + $('html, body').animate( + { + scrollTop: targetFootnotemark.offset().top, + }, + 500 + ); // Adjust the animation duration as needed + } else { + // console.warn('No matching footnotemark found above the current position for label:', footnoteLabel); + } + }); +} + +$(document).ready(function () { + function adjustSidebarHeight() { + if ($(window).width() > 767) { + // Only apply the height adjustment for screens wider than 767px + var scrollTop = $(window).scrollTop(); + if (scrollTop <= 175) { + $('.opinion-sidebar').css('height', 'calc(100vh - ' + (175 - scrollTop) + 'px)'); + // $('.main-document').css('height', 'calc(100vh + ' + (scrollTop) + 'px)'); + } else { + $('.opinion-sidebar').css('height', '100vh'); + } + } else { + $('.opinion-sidebar').css('height', 'auto'); // Reset height for mobile view + } + } + + // Adjust height on document ready and when window is scrolled or resized + adjustSidebarHeight(); + $(window).on('scroll resize', adjustSidebarHeight); +}); + +// Update sidebar to show where we are on the page +document.addEventListener('scroll', function () { + let sections = document.querySelectorAll('.jump-link'); + let currentSection = ''; + + // Determine which section is currently in view + sections.forEach((section) => { + let sectionTop = section.offsetTop; + let sectionHeight = section.offsetHeight; + if (window.scrollY >= sectionTop - sectionHeight / 3) { + currentSection = section.getAttribute('id'); + } + }); + if (!currentSection) currentSection = 'top'; + // Remove the active class from links and their parent elements + let links = document.querySelectorAll('.jump-links > a.active'); + links.forEach((link) => { + link.classList.remove('active'); + if (link.parentElement) { + link.parentElement.classList.remove('active'); + } + }); + + // Add the active class to the link and its parent that corresponds to the current section + let activeLink 
= document.getElementById(`nav_${currentSection}`); + if (!activeLink) return; + + activeLink.classList.add('active'); + if (activeLink.parentElement) { + activeLink.parentElement.classList.add('active'); + } +}); + +document.querySelectorAll("page-label").forEach(label => { + label.addEventListener("click", function() { + const href = this.getAttribute("href"); + if (href) { + window.location.href = href; + } + }); +}); diff --git a/cl/assets/static-global/js/webhooks-page.js b/cl/assets/static-global/js/webhooks-page.js index bad5cfdd82..22fe9184e4 100644 --- a/cl/assets/static-global/js/webhooks-page.js +++ b/cl/assets/static-global/js/webhooks-page.js @@ -20,10 +20,12 @@ htmx.on('htmx:afterSwap', (e) => { let webhook_form = document.getElementById('webhooks-body'); if (e.detail.target.id === 'webhooks-body') { // If the user already have a webhook configured for each type of event, show a message. - let event_type_options = document.getElementById('id_event_type').options.length; + let event_type_options = Array.from(document.getElementById('id_event_type').options) + .filter(option => option.value !== "") //Filter out default option + .length; if (event_type_options === 0) { webhook_form.innerHTML = - "You already have a webhook configured for each type of event. Please delete one before making another."; + "You already have a webhook configured for each type of event and version available. Please delete one before making another."; } //Toggle form modal $('#webhook-modal').modal('toggle'); diff --git a/cl/assets/templates/base.html b/cl/assets/templates/base.html index be91672e34..0cc58c47f3 100644 --- a/cl/assets/templates/base.html +++ b/cl/assets/templates/base.html @@ -84,7 +84,7 @@

    You did not supply the "private" variable to your template.
    {% if FUNDRAISING_MODE %} - {% include 'includes/dismissible_nav_banner.html' with link="https://free.law/2024/01/18/new-recap-archive-search-is-live" text="A year in the making, today we are launching a huge new search engine for the RECAP Archive" emoji="🎁" cookie_name="no_banner"%} + {% include 'includes/dismissible_nav_banner.html' with link="https://donate.free.law/forms/givingtuesday" text="Today is GivingTuesday. Your support of Free Law Project helps make the justice system more transparent and accessible to all." cookie_name="giving_tuesday" button_text="Donate Today!"%} {% endif %} diff --git a/cl/assets/templates/includes/dismissible_nav_banner.html b/cl/assets/templates/includes/dismissible_nav_banner.html index 501e33c4a5..c1f3480830 100644 --- a/cl/assets/templates/includes/dismissible_nav_banner.html +++ b/cl/assets/templates/includes/dismissible_nav_banner.html @@ -3,12 +3,16 @@ available and takes up to four keyword arguments described below: Parameters: - link: The URL for the "Learn More" button. - text: Text of the banner. - cookie_name: Name of the cookie used to remember if the user has already dismissed - the banner. This prevents them from seeing the same message repeatedly. - emoji: Insert an emoji next to your banner message using its decimal HTML entity - code (like 👍). + - text: Text of the banner. + - link: The URL for the button. + - cookie_name: Name of the cookie used to remember if the user has already + dismissed the banner. This prevents them from seeing the same message + repeatedly. + - button_text (optional): Text for the button. Defaults to "Learn More". + - button_emoji (optional): An Idiomatic Text element () to display + inside the button. + - emoji (optional): An HTML entity code (e.g., 👍) to insert an + emoji next to the banner message. It's advisable to wrap this template within an if tag and use the parent element to add extra conditions to handle the visibility of the banner. The current template only checks @@ -36,14 +40,14 @@

    - diff --git a/cl/audio/api_views.py b/cl/audio/api_views.py index a444db4a98..fa6d518ec9 100644 --- a/cl/audio/api_views.py +++ b/cl/audio/api_views.py @@ -1,4 +1,5 @@ from rest_framework import viewsets +from rest_framework.permissions import DjangoModelPermissionsOrAnonReadOnly from cl.api.api_permissions import V3APIPermission from cl.api.utils import LoggingMixin @@ -10,7 +11,10 @@ class AudioViewSet(LoggingMixin, viewsets.ModelViewSet): serializer_class = AudioSerializer filterset_class = AudioFilter - permission_classes = [V3APIPermission] + permission_classes = [ + DjangoModelPermissionsOrAnonReadOnly, + V3APIPermission, + ] ordering_fields = ( "id", "date_created", diff --git a/cl/corpus_importer/management/commands/probe_iquery_pages_daemon.py b/cl/corpus_importer/management/commands/probe_iquery_pages_daemon.py index 759700673e..8a99322eb2 100644 --- a/cl/corpus_importer/management/commands/probe_iquery_pages_daemon.py +++ b/cl/corpus_importer/management/commands/probe_iquery_pages_daemon.py @@ -81,7 +81,7 @@ def handle(self, *args, **options): iterations_completed = 0 r = get_redis_interface("CACHE") testing = True if testing_iterations else False - while True and settings.IQUERY_PROBE_DAEMON_ENABLED: + while True and settings.IQUERY_CASE_PROBE_DAEMON_ENABLED: for court_id in court_ids: if r.exists(f"iquery:court_wait:{court_id}"): continue diff --git a/cl/corpus_importer/management/commands/scrape_pacer_free_opinions.py b/cl/corpus_importer/management/commands/scrape_pacer_free_opinions.py index 43611d240f..08b2de837d 100644 --- a/cl/corpus_importer/management/commands/scrape_pacer_free_opinions.py +++ b/cl/corpus_importer/management/commands/scrape_pacer_free_opinions.py @@ -331,10 +331,10 @@ def get_pdfs( throttle.update_min_items(min_items) logger.info( - f"Court cycle completed for: {row.court_id}. Current iteration: {cycle_checker.current_iteration}. Sleep 2 seconds " + f"Court cycle completed for: {row.court_id}. Current iteration: {cycle_checker.current_iteration}. Sleep 1 second " f"before starting the next cycle." ) - time.sleep(2) + time.sleep(1) logger.info(f"Processing row id: {row.id} from {row.court_id}") c = chain( process_free_opinion_result.si( diff --git a/cl/corpus_importer/management/commands/update_casenames_wl_dataset.py b/cl/corpus_importer/management/commands/update_casenames_wl_dataset.py new file mode 100644 index 0000000000..c98d619b93 --- /dev/null +++ b/cl/corpus_importer/management/commands/update_casenames_wl_dataset.py @@ -0,0 +1,440 @@ +import logging +import re +import time +from datetime import date, datetime + +import pandas as pd +from django.core.management.base import BaseCommand, CommandError +from django.db import transaction +from django.db.models import Q, QuerySet +from eyecite import get_citations +from eyecite.models import FullCaseCitation +from eyecite.tokenizers import HyperscanTokenizer +from juriscraper.lib.string_utils import harmonize + +from cl.citations.utils import map_reporter_db_cite_type +from cl.search.models import Citation, OpinionCluster + +logger = logging.getLogger(__name__) +HYPERSCAN_TOKENIZER = HyperscanTokenizer(cache_dir=".hyperscan") + +# Compile regex pattern once for efficiency +WORD_PATTERN = re.compile(r"\b\w+\b|\b\w+\.\b") + +FALSE_POSITIVES = { + "and", + "personal", + "restraint", + "matter", + "county", + "city", + "of", + "the", + "estate", + "in", + "inc", + "re", + "st", + "ex", + "rel", + "vs", + "for", +} + +DATE_FORMATS = ( + "%B %d, %Y", + "%d-%b-%y", + "%m/%d/%Y", + "%m/%d/%y", + "%b. 
%d, %Y", + "%Y-%m-%d", +) + + +def tokenize_case_name(case_name: str) -> set[str]: + """Tokenizes case name and removes single-character words except for letters with periods. + + :param case_name: case name to tokenize + :return: list of words + """ + words = [] + for word in WORD_PATTERN.findall(case_name): + if len(word) > 1: + # Only keep words with more than one character + words.append(word.lower()) + + # Return only valid words + return set(words) - FALSE_POSITIVES + + +def check_case_names_match(west_case_name: str, cl_case_name: str) -> bool: + """Compare two case name and decide whether they are the same or not + + Tokenize each string, capturing both words and abbreviations with periods and + convert all words to lowercase for case-insensitive matching and check if there is + an overlap between case names + + :param west_case_name: case name from csv + :param cl_case_name: case name from cluster + :return: True if they match else False + """ + + west_set = tokenize_case_name(west_case_name.lower()) + cl_set = tokenize_case_name(cl_case_name.lower()) + + overlap = west_set & cl_set + if not overlap: + # if no hits no match on name - move along + return False + + # Check for "v." in title + if "v." not in west_case_name.lower() or ( + len(cl_set) == 1 or len(west_set) == 1 + ): + # in the matter of Smith + # if no V. - likely an "in re" case and only match on at least 1 name + return True + + # otherwise check if a match occurs on both sides of the `v.` + v_index = west_case_name.lower().index("v.") + hit_indices = [west_case_name.lower().find(hit) for hit in overlap] + return min(hit_indices) < v_index < max(hit_indices) + + +def parse_date(date_str: str) -> date | None: + """Attempts to parse the filed date into a datetime object. + + January 10, 1999 + 24-Jul-97 + 21-Jan-94 + 1/17/1961 + 12/1/1960 + 26-Sep-00 + Feb. 28, 2001 + 2007-01-24 + + :param date_str: date string + :return: date object or none + """ + for fmt in DATE_FORMATS: + try: + return datetime.strptime(date_str, fmt).date() + except (ValueError, TypeError): + continue + logger.warning("Invalid date format: %s", date_str) + return None + + +def parse_citations(citation_strings: list[str]) -> list[dict]: + """Validate citations with Eyecite. + + :param citation_strings: List of citation strings to validate. + :return: List of validated citation dictionaries with volume, reporter, and page. 
+ """ + validated_citations = [] + + for cite_str in citation_strings: + # Get citations from the string + + # We find all the citations that could match a cluster to update the case name + found_cites = get_citations(cite_str, tokenizer=HYPERSCAN_TOKENIZER) + if not found_cites: + continue + + citation = found_cites[0] + + # Ensure we have valid citations to process + if isinstance(citation, FullCaseCitation): + volume = citation.groups.get("volume") + + # Validate the volume + if not volume or not volume.isdigit(): + continue + + cite_type_str = citation.all_editions[0].reporter.cite_type + reporter_type = map_reporter_db_cite_type(cite_type_str) + + # Append the validated citation as a dictionary + validated_citations.append( + { + "volume": citation.groups["volume"], + "reporter": citation.corrected_reporter(), + "page": citation.groups["page"], + "type": reporter_type, + } + ) + + return validated_citations + + +def query_possible_matches( + valid_citations: list[dict], docket_number: str, date_filed: date +) -> QuerySet[Citation]: + """Find matches for row data + + It will remove duplicates, it could happen if we already have both citations, if we + have multiple matches, these must be unique + + :param valid_citations: list of FullCaseCitation objects + :param docket_number: cleaned docket number from row + :param date_filed: formatted filed date from row + + :return: list of matched OpinionCluster + """ + + citation_queries = Q() + + for citation in valid_citations: + citation_query = Q(**citation) & Q( + cluster__docket__docket_number__contains=docket_number, + cluster__date_filed=date_filed, + ) + citation_queries |= citation_query + possible_matches = ( + Citation.objects.filter(citation_queries) + .select_related("cluster") + .distinct("cluster__id") + ) + + return possible_matches + + +def update_matched_case_name( + matched_cluster: OpinionCluster, west_case_name: str +) -> tuple[bool, bool]: + """Update case name of matched cluster and related docket if empty any of them + + :param matched_cluster: OpinionCluster object + :param west_case_name: case name from csv row + :return: tuple with boolean values if cluster and related docket case name updated + """ + cluster_case_name_updated = False + docket_case_name_updated = False + + if not matched_cluster.case_name: + # Save case name in cluster when we don't have it + matched_cluster.case_name = west_case_name + matched_cluster.save() + logger.info("Case name updated for cluster id: %s", matched_cluster.id) + cluster_case_name_updated = True + + if not matched_cluster.docket.case_name: + # Save case name in docket when we don't have it + matched_cluster.docket.case_name = west_case_name + matched_cluster.docket.save() + logger.info( + "Case name updated for docket id: %s", matched_cluster.docket.id + ) + docket_case_name_updated = True + + return cluster_case_name_updated, docket_case_name_updated + + +def process_csv( + filepath: str, + delay: float, + dry_run: bool, + limit: int | None, + start_row: int, +) -> None: + """Process rows from csv file + + :param filepath: path to csv file + :param delay: delay between saves in seconds + :param dry_run: flag to simulate update process + :param limit: limit number of rows to process + :param start_row: start row + """ + + total_clusters_updated = 0 + total_dockets_updated = 0 + total_citations_added = 0 + + logger.info("Processing %s", filepath) + + # Generate rows to skip, excluding the header row + skip_rows = list(range(1, start_row)) if start_row else None + + df = 
pd.read_csv(filepath, skiprows=skip_rows, nrows=limit).dropna() + + # Reset the index to start from 0 (needed if we pass skip_rows param) + df.reset_index(drop=True, inplace=True) + + if start_row: + # Update rows index to reflect the original csv row numbers + df.index = range(start_row, start_row + len(df)) + + for row in df.itertuples(): + index, case_name, court, date_str, cite1, cite2, docket, _ = row + west_case_name = harmonize(case_name) + clean_docket_num = docket.strip('="').strip('"') + if not clean_docket_num: + logger.info("Row index: %s - No docket number found.", index) + continue + + date_filed = parse_date(date_str) + if not date_filed: + logger.info( + "Row index: %s - No valid date found: %s", index, date_str + ) + continue + + west_citations: list[str] = [cite1, cite2] + valid_citations = parse_citations(west_citations) + + if not valid_citations: + logger.info("Row index: %s - Missing valid citations.", index) + continue + + # Query for possible matches using data from row + possible_matches = query_possible_matches( + valid_citations=valid_citations, + docket_number=clean_docket_num, + date_filed=date_filed, + ) + + if not possible_matches: + logger.info("Row index: %s - No possible matches found.", index) + continue + + matches = [] + for match in possible_matches: + cl_case_name = ( + match.cluster.case_name_full + if match.cluster.case_name_full + else match.cluster.case_name + ) + + case_name_match = check_case_names_match( + west_case_name, cl_case_name + ) + if case_name_match: + matches.append(match.cluster) + + if len(matches) == 0: + # No match found within possible matches, go to next row + logger.info( + "Row index: %s - No match found within possible matches.", + index, + ) + continue + elif len(matches) > 1: + # More than one match, log and go to next row + matches_found = ", ".join([str(cluster.id) for cluster in matches]) + logger.warning( + "Row index: %s - Multiple matches found: %s", + index, + matches_found, + ) + continue + + # Single match found + logger.info( + "Row index: %s - Match found: %s - West case name: %s", + index, + matches[0].id, + west_case_name, + ) + + if dry_run: + # Dry run, don't save anything + continue + + with transaction.atomic(): + matched_cluster = matches[0] + + # Update case names + cluster_updated, docket_updated = update_matched_case_name( + matched_cluster, west_case_name + ) + + if cluster_updated: + total_clusters_updated = +1 + + if docket_updated: + total_dockets_updated = +1 + + # Add any of the citations if possible + for citation in valid_citations: + + citation["cluster_id"] = matched_cluster.id + if Citation.objects.filter(**citation).exists(): + # We already have the citation + continue + elif Citation.objects.filter( + cluster_id=citation["cluster_id"], + reporter=citation.get("reporter"), + ).exists(): + # # Same reporter, different citation, revert changes + logger.warning( + "Row index: %s - Revert changes for cluster id: %s", + index, + matched_cluster.id, + ) + transaction.set_rollback(True) + break + else: + new_citation = Citation.objects.create(**citation) + logger.info( + "New citation added: %s to cluster id: %s", + new_citation, + matched_cluster.id, + ) + total_citations_added += 1 + + # Wait between each processed row to avoid sending to many indexing tasks + time.sleep(delay) + + logger.info("Clusters updated: %s", total_clusters_updated) + logger.info("Dockets updated: %s", total_dockets_updated) + logger.info("Citations added: %s", total_citations_added) + + +class Command(BaseCommand): + 
help = "Match and compare case details from a CSV file with existing records in the database." + + def add_arguments(self, parser): + parser.add_argument( + "--filepath", + type=str, + required=True, + help="Path to the CSV file to process.", + ) + parser.add_argument( + "--delay", + type=float, + default=0.1, + help="How long to wait to update each opinion cluster (in seconds, allows floating numbers).", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Simulate the update process without making changes", + ) + parser.add_argument( + "--start-row", + default=0, + type=int, + help="Start row (inclusive).", + ) + parser.add_argument( + "--limit", + default=None, + type=int, + help="Limit number of rows to process.", + required=False, + ) + + def handle(self, *args, **options): + filepath = options["filepath"] + delay = options["delay"] + dry_run = options["dry_run"] + limit = options["limit"] + start_row = options["start_row"] + + if not filepath: + raise CommandError( + "Filepath is required. Use --filepath to specify the CSV file location." + ) + + process_csv(filepath, delay, dry_run, limit, start_row) diff --git a/cl/corpus_importer/signals.py b/cl/corpus_importer/signals.py index d2443b62f3..08254d7d85 100644 --- a/cl/corpus_importer/signals.py +++ b/cl/corpus_importer/signals.py @@ -76,6 +76,10 @@ def update_latest_case_id_and_schedule_iquery_sweep(docket: Docket) -> None: countdown=task_scheduled_countdown, queue=settings.CELERY_IQUERY_QUEUE, ) + logger.info( + f"Enqueued iquery docket case ID: {iquery_pacer_case_id_current} " + f"for court {court_id} with countdown {task_scheduled_countdown}" + ) # Update the iquery_pacer_case_id_current in Redis r.hset( diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index 8ed46333f7..bfa21e43b5 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -25,6 +25,7 @@ from httpx import ( HTTPStatusError, NetworkError, + ReadError, RemoteProtocolError, TimeoutException, ) @@ -598,6 +599,7 @@ def process_free_opinion_result( ConnectionError, ReadTimeout, RedisConnectionError, + ReadError, ), max_retries=15, interval_start=5, diff --git a/cl/corpus_importer/tests.py b/cl/corpus_importer/tests.py index 7a76435ded..5b3d858897 100644 --- a/cl/corpus_importer/tests.py +++ b/cl/corpus_importer/tests.py @@ -62,6 +62,9 @@ log_added_items_to_redis, merge_rss_data, ) +from cl.corpus_importer.management.commands.update_casenames_wl_dataset import ( + check_case_names_match, +) from cl.corpus_importer.signals import ( handle_update_latest_case_id_and_schedule_iquery_sweep, update_latest_case_id_and_schedule_iquery_sweep, @@ -3343,7 +3346,7 @@ def test_merger(self): @patch("cl.corpus_importer.tasks.get_or_cache_pacer_cookies") @override_settings( - IQUERY_PROBE_DAEMON_ENABLED=True, + IQUERY_CASE_PROBE_DAEMON_ENABLED=True, IQUERY_SWEEP_UPLOADS_SIGNAL_ENABLED=True, EGRESS_PROXY_HOSTS=["http://proxy_1:9090", "http://proxy_2:9090"], ) @@ -4078,3 +4081,56 @@ def test_probe_iquery_pages_daemon_court_got_stuck( f"iquery:court_empty_probe_attempts:{self.court_cacd.pk}" ) self.assertEqual(int(court_empty_attempts), 0) + + +class CaseNamesTest(SimpleTestCase): + def test_check_case_names_match(self) -> None: + """Can we check if the case names match?""" + case_names_tests = ( + ( + "U.S. v. Smith", + "United States v. Smith", + True, + ), + ( + "United States v. Guerrero-Martinez", # 736793 + "United States v. 
Hector Guerrero-Martinez, AKA Hector Guerrero AKA Hector Martinez-Guerrero", + True, + ), + ( + "In re CP", # 2140442 + "In Re CP", + True, + ), + ( + "Dennis v. City of Easton", # 730246 + "Richard Dennis, Penelope Dennis, Loretta M. Dennis v. City of Easton, Edward J. Ferraro, Robet S. Stein, Doris Asteak, Paul Schleuter, Howard B. White, Easton Board of Health", + True, + ), + ( + "Parmelee v. Bruggeman", # 736598 + "Allan Parmelee v. Milford Bruggeman Janine Bruggeman Friend of the Court for the State of Michigan Nancy Rose, Employee of the State of Michigan for the Friend of the Court Glenda Friday, Employee of the State of Michigan for the Friend of the Court Karen Dunn, Employee of the State of Michigan for the Friend of the Court Thomas Kreckman, Employee of the State of Michigan for the Friend of the Court State of Michigan", + True, + ), + ( + "Automobile Assur. Financial Corp. v. Syrett Corp.", # 735935 + "Automobile Assurance Financial Corporation, a Utah Corporation Venuti and Associates, Inc., a Utah Corporation Venuti Partners, Ltd., a Utah Limited Partnership Frank P. Venuti, an Individual, Parker M. Nielson v. Syrett Corporation, a Delaware Corporation, Formerly a Utah Corporation, John R. Riley, an Individual, Third-Party-Defendant", + True, + ), + ( + "Christopher Ambroze, M.D., PC v. Aetna Health Plans of New York, Inc.", # 735476 + "Christopher Ambroze, M.D., P.C., Rockville Anesthesia Group, Llp, Harvey Finkelstein, Plainview Anesthesiologists, P.C., Joseph A. Singer, Atlantic Anesthesia Associates, P.C. v. Aetna Health Plans of New York, Inc., Aetna Health Management, Inc., Aetna Life and Casualty Company, C. Frederick Berger, and Gregg Stolzberg", + True, + ), + ( + "O'Neal v. Merkel", # 730350 + "Terence Kenneth O'Neal v. T.E. Merkel Nurse Cashwell Nurse Allen Nurse Davis Mr. Conn, and Franklin E. Freeman, Jr. Gary Dixon Doctor Lowy Doctor Shaw Doctor Castalloe Harry Allsbrook Mr. Cherry", + True, + ), + ) + for wl_casename, cl_casename, overlap in case_names_tests: + self.assertEqual( + check_case_names_match(wl_casename, cl_casename), + overlap, + msg=f"Case names don't match: {wl_casename} - {cl_casename}", + ) diff --git a/cl/custom_filters/templatetags/extras.py b/cl/custom_filters/templatetags/extras.py index 39d535b2df..6532ca2881 100644 --- a/cl/custom_filters/templatetags/extras.py +++ b/cl/custom_filters/templatetags/extras.py @@ -1,7 +1,7 @@ import random import re import urllib.parse -from datetime import datetime +from datetime import datetime, timezone import waffle from django import template @@ -337,6 +337,21 @@ def format_date(date_str: str) -> str: return date_str +@register.filter +def datetime_in_utc(date_obj) -> str: + """Formats a datetime object in UTC with timezone displayed. + For example: 'Nov. 25, 2024, 01:28 p.m. UTC'""" + if date_obj is None: + return "" + try: + return date_filter( + date_obj.astimezone(timezone.utc), + "M. 
j, Y, h:i a T", + ) + except (ValueError, TypeError): + return date_obj + + @register.filter def build_docket_id_q_param(request_q: str, docket_id: str) -> str: """Build a query string that includes the docket ID and any existing query diff --git a/cl/disclosures/api_views.py b/cl/disclosures/api_views.py index 1c1be6f3a4..64ce52bac4 100644 --- a/cl/disclosures/api_views.py +++ b/cl/disclosures/api_views.py @@ -1,4 +1,5 @@ from rest_framework import viewsets +from rest_framework.permissions import DjangoModelPermissionsOrAnonReadOnly from cl.api.api_permissions import V3APIPermission from cl.api.utils import LoggingMixin @@ -40,7 +41,10 @@ class AgreementViewSet(LoggingMixin, viewsets.ModelViewSet): queryset = Agreement.objects.all().order_by("-id") serializer_class = AgreementSerializer - permission_classes = [V3APIPermission] + permission_classes = [ + DjangoModelPermissionsOrAnonReadOnly, + V3APIPermission, + ] ordering_fields = ("id", "date_created", "date_modified") filterset_class = AgreementFilter # Default cursor ordering key @@ -56,7 +60,10 @@ class AgreementViewSet(LoggingMixin, viewsets.ModelViewSet): class DebtViewSet(LoggingMixin, viewsets.ModelViewSet): queryset = Debt.objects.all().order_by("-id") serializer_class = DebtSerializer - permission_classes = [V3APIPermission] + permission_classes = [ + DjangoModelPermissionsOrAnonReadOnly, + V3APIPermission, + ] ordering_fields = ("id", "date_created", "date_modified") filterset_class = DebtFilter # Default cursor ordering key @@ -87,7 +94,10 @@ class FinancialDisclosureViewSet(LoggingMixin, viewsets.ModelViewSet): ) serializer_class = FinancialDisclosureSerializer filterset_class = FinancialDisclosureFilter - permission_classes = [V3APIPermission] + permission_classes = [ + DjangoModelPermissionsOrAnonReadOnly, + V3APIPermission, + ] ordering_fields = ("id", "date_created", "date_modified") # Default cursor ordering key ordering = "-id" @@ -103,7 +113,10 @@ class GiftViewSet(LoggingMixin, viewsets.ModelViewSet): queryset = Gift.objects.all().order_by("-id") serializer_class = GiftSerializer filterset_class = GiftFilter - permission_classes = [V3APIPermission] + permission_classes = [ + DjangoModelPermissionsOrAnonReadOnly, + V3APIPermission, + ] ordering_fields = ("id", "date_created", "date_modified") # Default cursor ordering key ordering = "-id" @@ -119,7 +132,10 @@ class InvestmentViewSet(LoggingMixin, viewsets.ModelViewSet): queryset = Investment.objects.all().order_by("-id") serializer_class = InvestmentSerializer filterset_class = InvestmentFilter - permission_classes = [V3APIPermission] + permission_classes = [ + DjangoModelPermissionsOrAnonReadOnly, + V3APIPermission, + ] ordering_fields = ("id", "date_created", "date_modified") # Default cursor ordering key ordering = "-id" @@ -135,7 +151,10 @@ class NonInvestmentIncomeViewSet(LoggingMixin, viewsets.ModelViewSet): queryset = NonInvestmentIncome.objects.all().order_by("-id") serializer_class = NonInvestmentIncomeSerializer filterset_class = NonInvestmentIncomeFilter - permission_classes = [V3APIPermission] + permission_classes = [ + DjangoModelPermissionsOrAnonReadOnly, + V3APIPermission, + ] ordering_fields = ("id", "date_created", "date_modified") # Default cursor ordering key ordering = "-id" @@ -151,7 +170,10 @@ class PositionViewSet(LoggingMixin, viewsets.ModelViewSet): queryset = Position.objects.all().order_by("-id") serializer_class = PositionSerializer filterset_class = PositionFilter - permission_classes = [V3APIPermission] + permission_classes = [ + 
DjangoModelPermissionsOrAnonReadOnly, + V3APIPermission, + ] ordering_fields = ("id", "date_created", "date_modified") # Default cursor ordering key ordering = "-id" @@ -167,7 +189,10 @@ class ReimbursementViewSet(LoggingMixin, viewsets.ModelViewSet): queryset = Reimbursement.objects.all().order_by("-id") serializer_class = ReimbursementSerializer filterset_class = ReimbursementFilter - permission_classes = [V3APIPermission] + permission_classes = [ + DjangoModelPermissionsOrAnonReadOnly, + V3APIPermission, + ] ordering_fields = ("id", "date_created", "date_modified") # Default cursor ordering key ordering = "-id" @@ -183,7 +208,10 @@ class SpouseIncomeViewSet(LoggingMixin, viewsets.ModelViewSet): queryset = SpouseIncome.objects.all().order_by("-id") serializer_class = SpouseIncomeSerializer filterset_class = SpouseIncomeFilter - permission_classes = [V3APIPermission] + permission_classes = [ + DjangoModelPermissionsOrAnonReadOnly, + V3APIPermission, + ] ordering_fields = ("id", "date_created", "date_modified") # Default cursor ordering key ordering = "-id" diff --git a/cl/favorites/tests.py b/cl/favorites/tests.py index e2aa34ab56..9518884d95 100644 --- a/cl/favorites/tests.py +++ b/cl/favorites/tests.py @@ -14,6 +14,7 @@ from django.utils.timezone import make_naive, now from selenium.webdriver.common.by import By from timeout_decorator import timeout_decorator +from waffle.testutils import override_flag from cl.custom_filters.templatetags.pacer import price from cl.favorites.factories import NoteFactory, PrayerFactory @@ -107,6 +108,7 @@ def setUp(self) -> None: super().setUp() @timeout_decorator.timeout(SELENIUM_TIMEOUT) + @override_flag("ui_flag_for_o", False) def test_anonymous_user_is_prompted_when_favoriting_an_opinion( self, ) -> None: @@ -167,6 +169,7 @@ def test_anonymous_user_is_prompted_when_favoriting_an_opinion( modal_title = self.browser.find_element(By.ID, "save-note-title") self.assertIn("Save Note", modal_title.text) + @override_flag("ui_flag_for_o", False) @timeout_decorator.timeout(SELENIUM_TIMEOUT) def test_logged_in_user_can_save_note(self) -> None: # Meta: assure no Faves even if part of fixtures diff --git a/cl/lib/command_utils.py b/cl/lib/command_utils.py index 2c3797f9f5..ee86463812 100644 --- a/cl/lib/command_utils.py +++ b/cl/lib/command_utils.py @@ -3,6 +3,8 @@ from django.core.management import BaseCommand, CommandError +from cl.lib.juriscraper_utils import get_module_by_court_id + logger = logging.getLogger(__name__) @@ -22,6 +24,40 @@ def handle(self, *args, **options): juriscraper_logger.setLevel(logging.DEBUG) +class ScraperCommand(VerboseCommand): + """Base class for cl.scrapers commands that use Juriscraper + + Implements the `--courts` argument to lookup for a Site object + """ + + # To be used on get_module_by_court_id + # Defined by inheriting classes + juriscraper_module_type = "" + + def add_arguments(self, parser): + parser.add_argument( + "--courts", + dest="court_id", + metavar="COURTID", + type=lambda s: ( + s + if "." in s + else get_module_by_court_id(s, self.juriscraper_module_type) + ), + required=True, + help=( + "The court(s) to scrape and extract. One of: " + "1. a python module or package import from the Juriscraper library, e.g." + "'juriscraper.opinions.united_states.federal_appellate.ca1' " + "or simply 'juriscraper.opinions' to do all opinions." + "" + "2. a court_id, to be used to lookup for a full module path" + "An error will be raised if the `court_id` matches more than " + "one module path. 
In that case, use the full path" + ), + ) + + class CommandUtils: """A mixin to give some useful methods to sub classes.""" diff --git a/cl/lib/elasticsearch_utils.py b/cl/lib/elasticsearch_utils.py index 129115ff20..93d15948ad 100644 --- a/cl/lib/elasticsearch_utils.py +++ b/cl/lib/elasticsearch_utils.py @@ -4,6 +4,7 @@ import re import time import traceback +from collections import defaultdict from copy import deepcopy from dataclasses import fields from functools import reduce, wraps @@ -175,22 +176,45 @@ async def build_more_like_this_query(related_ids: list[str]) -> Query: exclusions for specific opinion clusters. """ - document_list = [{"_id": f"o_{id}"} for id in related_ids] + opinion_cluster_pairs = [ + opinion_pair + for opinion_id in related_ids + if ( + opinion_pair := await Opinion.objects.filter(pk=opinion_id) + .values("pk", "cluster_id") + .afirst() + ) + ] + unique_clusters = {pair["cluster_id"] for pair in opinion_cluster_pairs} + + document_list = [ + { + "_id": f'o_{pair["pk"]}', + "routing": pair["cluster_id"], + # Important to match documents in the production cluster + } + for pair in opinion_cluster_pairs + ] or [ + {"_id": f"o_{pk}"} for pk in related_ids + ] # Fallback in case IDs are not found in the database. + # The user might have provided non-existent Opinion IDs. + # This ensures that the query does not raise an error and instead returns + # no results. + more_like_this_fields = SEARCH_MLT_OPINION_QUERY_FIELDS.copy() mlt_query = Q( "more_like_this", fields=more_like_this_fields, like=document_list, - min_term_freq=1, - max_query_terms=12, + min_term_freq=settings.RELATED_MLT_MINTF, + max_query_terms=settings.RELATED_MLT_MAXQT, + min_word_length=settings.RELATED_MLT_MINWL, + max_word_length=settings.RELATED_MLT_MAXWL, + max_doc_freq=settings.RELATED_MLT_MAXDF, + analyzer="search_analyzer_exact", ) # Exclude opinion clusters to which the related IDs to query belong. - cluster_ids_to_exclude = ( - OpinionCluster.objects.filter(sub_opinions__pk__in=related_ids) - .distinct("pk") - .values_list("pk", flat=True) - ) - cluster_ids_list = [pk async for pk in cluster_ids_to_exclude.aiterator()] + cluster_ids_list = list(unique_clusters) exclude_cluster_ids = [Q("terms", cluster_id=cluster_ids_list)] bool_query = Q("bool", must=[mlt_query], must_not=exclude_cluster_ids) return bool_query @@ -1239,7 +1263,7 @@ def build_es_base_query( {"opinion": []}, [], mlt_query, - child_highlighting=False, + child_highlighting=True, api_version=api_version, ) ) @@ -1281,6 +1305,7 @@ def build_es_base_query( mlt_query, child_highlighting=child_highlighting, api_version=api_version, + alerts=alerts, ) ) @@ -2964,9 +2989,10 @@ def do_es_api_query( child documents. """ + alerts = True if hl_tag == ALERTS_HL_TAG else False try: es_queries = build_es_base_query( - search_query, cd, cd["highlight"], api_version + search_query, cd, cd["highlight"], api_version, alerts=alerts ) s = es_queries.search_query child_docs_query = es_queries.child_query @@ -3047,7 +3073,7 @@ def do_es_api_query( # parameters as in the frontend. Only switch highlighting according # to the user request. 
main_query = add_es_highlighting( - s, cd, highlighting=cd["highlight"] + s, cd, alerts=alerts, highlighting=cd["highlight"] ) return main_query, child_docs_query @@ -3081,7 +3107,7 @@ def build_cardinality_count(count_query: Search, unique_field: str) -> Search: def do_collapse_count_query( search_type: str, main_query: Search, query: Query -) -> int | None: +) -> int: """Execute an Elasticsearch count query for queries that uses collapse. Uses a query with aggregation to determine the number of unique opinions based on the 'cluster_id' or 'docket_id' according to the search_type. @@ -3106,7 +3132,7 @@ def do_collapse_count_query( f"Error on count query request: {search_query.to_dict()}" ) logger.warning(f"Error was: {e}") - total_results = None + total_results = 0 return total_results @@ -3216,18 +3242,20 @@ def do_es_sweep_alert_query( multi_search = multi_search.add(main_query) if parent_query: parent_search = search_query.query(parent_query) + # Ensure accurate tracking of total hit count for up to 10,001 query results parent_search = parent_search.extra( - from_=0, size=settings.SCHEDULED_ALERT_HITS_LIMIT + from_=0, + track_total_hits=settings.ELASTICSEARCH_MAX_RESULT_COUNT + 1, ) parent_search = parent_search.source(includes=["docket_id"]) multi_search = multi_search.add(parent_search) if child_query: child_search = child_search_query.query(child_query) + # Ensure accurate tracking of total hit count for up to 10,001 query results child_search = child_search.extra( from_=0, - size=settings.SCHEDULED_ALERT_HITS_LIMIT - * settings.RECAP_CHILD_HITS_PER_RESULT, + track_total_hits=settings.ELASTICSEARCH_MAX_RESULT_COUNT + 1, ) child_search = child_search.source(includes=["id"]) multi_search = multi_search.add(child_search) @@ -3241,15 +3269,45 @@ def do_es_sweep_alert_query( if child_query: rd_results = responses[2] + # Re-run parent query to fetch potentially missed docket IDs due to large + # result sets. + should_repeat_parent_query = ( + docket_results + and docket_results.hits.total.value + >= settings.ELASTICSEARCH_MAX_RESULT_COUNT + ) + if should_repeat_parent_query: + docket_ids = [int(d.docket_id) for d in main_results] + # Adds extra filter to refine results. + parent_query.filter.append(Q("terms", docket_id=docket_ids)) + parent_search = search_query.query(parent_query) + parent_search = parent_search.source(includes=["docket_id"]) + docket_results = parent_search.execute() + limit_inner_hits({}, main_results, cd["type"]) set_results_highlights(main_results, cd["type"]) - for result in main_results: - child_result_objects = [] - if hasattr(result, "child_docs"): - for child_doc in result.child_docs: - child_result_objects.append(child_doc.to_dict()) - result["child_docs"] = child_result_objects + # This block addresses a potential issue where the initial child query + # might not return all expected results, especially when the result set is + # large. To ensure complete data retrieval, it extracts child document IDs + # from the main results and refines the child query filter with these IDs. + # Finally, it re-executes the child search. 
+ should_repeat_child_query = ( + rd_results + and rd_results.hits.total.value + >= settings.ELASTICSEARCH_MAX_RESULT_COUNT + ) + if should_repeat_child_query: + rd_ids = [ + int(rd["_source"]["id"]) + for docket in main_results + if hasattr(docket, "child_docs") + for rd in docket.child_docs + ] + child_query.filter.append(Q("terms", id=rd_ids)) + child_search = child_search_query.query(child_query) + child_search = child_search.source(includes=["id"]) + rd_results = child_search.execute() return main_results, docket_results, rd_results @@ -3279,3 +3337,45 @@ def simplify_estimated_count(search_count: int) -> int: zeroes = (len(search_count_str) - 2) * "0" return int(first_two + zeroes) return search_count + + +def set_child_docs_and_score( + results: list[Hit] | list[dict[str, Any]] | Response, + merge_highlights: bool = False, + merge_score: bool = False, +) -> None: + """Process and attach child documents to the main search results. + + :param results: A list of search results, which can be ES Hit objects + or a list of dicts. + :param merge_highlights: A boolean indicating whether to merge + highlight data into the results. + :param merge_score: A boolean indicating whether to merge + the BM25 score into the results. + :return: None. Results are modified in place. + """ + + for result in results: + result_is_dict = isinstance(result, dict) + if result_is_dict: + # If the result is a dictionary, do nothing, or assign [] to + # child_docs if it is not present. + result["child_docs"] = result.get("child_docs", []) + else: + # Process child hits if the result is an ES AttrDict instance, + # so they can be properly serialized. + child_docs = getattr(result, "child_docs", []) + result["child_docs"] = [ + defaultdict(lambda: None, doc["_source"].to_dict()) + for doc in child_docs + ] + + # Optionally merges highlights. Used for integrating percolator + # highlights into the percolated document. + if merge_highlights and result_is_dict: + meta_hl = result.get("meta", {}).get("highlight", {}) + merge_highlights_into_result(meta_hl, result) + + # Optionally merges the BM25 score for display in the API. + if merge_score and isinstance(result, AttrDict): + result["bm25_score"] = result.meta.score diff --git a/cl/lib/juriscraper_utils.py b/cl/lib/juriscraper_utils.py index ae8c090f41..f2484e8b86 100644 --- a/cl/lib/juriscraper_utils.py +++ b/cl/lib/juriscraper_utils.py @@ -5,6 +5,12 @@ import juriscraper +def walk_juriscraper(): + return pkgutil.walk_packages( + juriscraper.__path__, f"{juriscraper.__name__}." + ) + + def get_scraper_object_by_name(court_id: str, juriscraper_module: str = ""): """Identify and instantiate a Site() object given the name of a court @@ -25,9 +31,7 @@ def get_scraper_object_by_name(court_id: str, juriscraper_module: str = ""): return importlib.import_module(juriscraper_module).Site() - for _, full_module_path, _ in pkgutil.walk_packages( - juriscraper.__path__, f"{juriscraper.__name__}." - ): + for _, full_module_path, _ in walk_juriscraper(): # Get the module name from the full path and trim # any suffixes like _p, _u module_name = full_module_path.rsplit(".", 1)[1].rsplit("_", 1)[0] @@ -42,3 +46,45 @@ def get_scraper_object_by_name(court_id: str, juriscraper_module: str = ""): # has been stripped off it. In any case, just ignore it when # this happens. continue + + +def get_module_by_court_id(court_id: str, module_type: str) -> str: + """Given a `court_id` return a juriscraper module path + + Some court_ids match multiple scraper files. 
These will force the user + to use the full module path. For example, "lactapp_1" and "lactapp_5" + match the same `court_id`, but scrape totally different sites, and + their Site objects are expected to have different `extract_from_text` + behavior + + :param court_id: court id to look for + :param module_type: 'opinions' or 'oral_args'. Without this, some + court_ids may match the 2 classes of scrapers + + :raises: ValueError if there is no match or there is more than 1 match + :return: the full module path + """ + if module_type not in ["opinions", "oral_args"]: + raise ValueError( + "module_type has to be one of ['opinions', 'oral_args']" + ) + + matches = [] + for _, module_string, _ in walk_juriscraper(): + if module_string.count(".") != 4 or module_type not in module_string: + # Skip folder and lib modules. Skip type + continue + + module_court_id = module_string.rsplit(".", 1)[1].rsplit("_", 1)[0] + if module_court_id == court_id: + matches.append(module_string) + + if len(matches) == 1: + return matches[0] + elif len(matches) == 0: + raise ValueError(f"'{court_id}' doesn't match any juriscraper module") + else: + raise ValueError( + f"'{court_id}' matches more than 1 juriscraper module." + f"Use a full module path. Matches: '{matches}'" + ) diff --git a/cl/lib/utils.py b/cl/lib/utils.py index 223056420f..592f8876d0 100644 --- a/cl/lib/utils.py +++ b/cl/lib/utils.py @@ -248,7 +248,7 @@ def cleanup_main_query(query_string: str) -> str: """ inside_a_phrase = False cleaned_items = [] - for item in re.split(r'([^a-zA-Z0-9_\-~":]+)', query_string): + for item in re.split(r'([^a-zA-Z0-9_\-^~":]+)', query_string): if not item: continue diff --git a/cl/opinion_page/templates/includes/add_download_button.html b/cl/opinion_page/templates/includes/add_download_button.html new file mode 100644 index 0000000000..bcd7a508ea --- /dev/null +++ b/cl/opinion_page/templates/includes/add_download_button.html @@ -0,0 +1,46 @@ +
    + + +
    diff --git a/cl/opinion_page/templates/includes/add_note_button.html b/cl/opinion_page/templates/includes/add_note_button.html index c5392897e8..f107802383 100644 --- a/cl/opinion_page/templates/includes/add_note_button.html +++ b/cl/opinion_page/templates/includes/add_note_button.html @@ -1,6 +1,8 @@ - +
    + +
    \ No newline at end of file diff --git a/cl/opinion_page/templates/includes/opinion_tabs.html b/cl/opinion_page/templates/includes/opinion_tabs.html new file mode 100644 index 0000000000..e5eaf1b7c0 --- /dev/null +++ b/cl/opinion_page/templates/includes/opinion_tabs.html @@ -0,0 +1,390 @@ +{% load humanize %} +{% load text_filters %} + +{% if tab == "authorities" %} +{# Table of Authorities #} +
    + +
    +
    + {% for authority in authorities_with_data %} +
    +

    + + {{ authority.caption|safe|v_wrapper }} + +

    +
    +
    + + +
    +
    + + +
    +
    + + +
    +
    +
    +
    + +
    +
    +
    +
    + {% endfor %} +
    +
    +{#{% elif tab == "details" %}#} +{# {% include "includes/tab_details.html" %}#} +{% elif tab == "summaries" %} + {# Summaries #} +
    + +
    +
    +
      + {% for group in parenthetical_groups %} + {% with representative=group.representative %} + {% with representative_cluster=representative.describing_opinion.cluster %} +
      +

      + + {{ representative_cluster|best_case_name|safe }} + +

      +
      +
      + + +
      +
      + + +
      +
      + + +
      +
      +
      +
    • + {{ representative.text|capfirst }} -- +
      + +
    • +
        + {% for summary in group.parentheticals.all %} + {% with describing_cluster=summary.describing_opinion.cluster %} + {% if summary != representative %} +
      • + {{ summary.text|capfirst }} +
        + {{ describing_cluster.date_filed }} + + + {{ describing_cluster|best_case_name|safe }} + + + {{ describing_cluster.docket.court }} +
      • + {% endif %} + {% endwith %} + {% endfor %} +
      + {% endwith %} + {% endwith %} + {% endfor %} +
    +
    +
    +{% elif tab == "cited-by" %} + {# Cited By #} +
    + +
    + + {% if citing_cluster_count > 0 %} + {% for citing_cluster in citing_clusters %} + + {% endfor %} + {% else %} +

    This case has not yet been cited in our system.

    + {% endif %} + +
    +

    + View Citing Opinions +

    +
    + +{% elif tab == "related-cases" %} + {# Related Cases #} + + +{% elif tab == "pdf" %} + {# PDF #} +
    +
    + +
    +
    +
    + +
    +
    +
    +

    Oops! Your browser does not support embedded PDF viewing.

    +
    + {% include "includes/rd_download_button.html" %} +
    +
    +
    +
    +
    +
    +{% else %} + + {# The section of the document I refer to as headmatter goes here #} +
    +
    + {% with opinion_count=cluster.sub_opinions.all.count %} + + {% if cluster.headmatter %} + +
    +
    + {{ cluster.headmatter|safe }} +
    + {% else %} + {% if cluster.correction %} + +
    +
    + {{ cluster.correction|safe }} +
    + {% endif %} + + {% if cluster.attorneys %} + +
    +
    +

    {{ cluster.attorneys|safe|linebreaksbr }}

    +
    + {% endif %} + + {% if cluster.headnotes %} + +
    +

    {{ cluster.headnotes | safe}}

    + {% endif %} + + {% if cluster.syllabus %} + +
    +
    + {{ cluster.syllabus|safe }} +
    + {% endif %} + + {% if cluster.summary %} + +
    +
    + {{ cluster.summary|safe }} +
    + {% endif %} + {% if cluster.history %} + +
    +
    + {{ cluster.history|safe }} +
    + {% endif %} + + {% if cluster.disposition %} + +
    +
    + {{ cluster.disposition|safe }} +
    + {% endif %} + {% endif %} + + {% for sub_opinion in cluster.ordered_opinions %} + +
    + + {% if 'U' in cluster.source %} +
    + {% elif 'Z' in cluster.source %} +
    + {% elif 'L' in cluster.source %} +
    + {% elif 'R' in cluster.source %} +
    + {% else %} +
    + {% endif %} + +
    + {% if sub_opinion.xml_harvard and sub_opinion.html_with_citations %} +
    {{ sub_opinion.html_with_citations|safe }}
    + {% elif sub_opinion.xml_harvard %} +
    {{ sub_opinion.xml_harvard|safe }}
+ {% elif sub_opinion.html_with_citations %} + {% if cluster.source == "C" %} + {# It's a PDF with no HTML enrichment #} + {% if sub_opinion.html %} + {# for scraped HTML e.g. Colo, Okla we do not want to insert line breaks #} +
    {{ sub_opinion.html_with_citations|safe }}
    + {% else %} +
    {{ sub_opinion.html_with_citations|safe|linebreaksbr }}
    + {% endif %} + {% else %} +
    {{ sub_opinion.html_with_citations|safe }}
    + {% endif %} + {% elif sub_opinion.html_columbia %} +
    {{ sub_opinion.html_columbia|safe }}
    + {% elif sub_opinion.html_lawbox %} +
    {{ sub_opinion.html_lawbox|safe }}
    + {% elif sub_opinion.html_anon_2020 %} +
    {{ sub_opinion.html_anon_2020|safe }}
    + {% elif sub_opinion.html %} +
    {{sub_opinion.html|safe}}
    + {% else %} +
    {{sub_opinion.plain_text}}
    + {% endif %} +
    + + {% endfor %} + {% endwith %} +
    +
    + +{% endif %} \ No newline at end of file diff --git a/cl/opinion_page/templates/opinion.html b/cl/opinion_page/templates/opinion.html index 16a33820fd..a0c4c797c7 100644 --- a/cl/opinion_page/templates/opinion.html +++ b/cl/opinion_page/templates/opinion.html @@ -100,7 +100,7 @@

    Summaries ({{ summaries_count|intcomma }})

    {% endfor %}

    - View All Summaries diff --git a/cl/opinion_page/templates/opinions.html b/cl/opinion_page/templates/opinions.html new file mode 100644 index 0000000000..3cb9746763 --- /dev/null +++ b/cl/opinion_page/templates/opinions.html @@ -0,0 +1,337 @@ +{% extends "base.html" %} +{% load extras %} +{% load humanize %} +{% load static %} +{% load text_filters %} + + +{% block canonical %}{% get_canonical_element %}{% endblock %} +{% block title %}{{ title }} – CourtListener.com{% endblock %} +{% block og_title %}{{ title }} – CourtListener.com{% endblock %} +{% block description %}{{ title }} — Brought to you by Free Law Project, a non-profit dedicated to creating high quality open legal information.{% endblock %} +{% block og_description %}{{ cluster|best_case_name }}{% if summaries_count > 0 %} — {{ top_parenthetical_groups.0.representative.text|capfirst }}{% else %} — Brought to you by Free Law Project, a non-profit dedicated to creating high quality open legal information.{% endif %} +{% endblock %} + +{% block head %} + + + +{% endblock %} + + +{% block navbar-o %}active{% endblock %} + + +{% block sidebar %} + +{% endblock %} + +{% block body-classes %}opinion-body{% endblock %} + +{% block content %} + +
    +
    + +
    +
    + {{ cluster.date_filed }} +
    + + {% if pdf_path %} + {% include "includes/add_download_button.html" %} + {% endif %} + {% include "includes/add_note_button.html" with form_instance_id=note_form.instance.cluster_id %} +
    +
    + + +

    {{ cluster.docket.court }}

    +
    +
    +
      +
    • Citations: {{ cluster.citation_string|default:"None known" }}
    • + {% if cluster.case_name_full != cluster|best_case_name %} +
    • Full Case Name: + {{ cluster.case_name_full }} +
    • + {% endif %} + + {% if cluster.docket.court_id != "olc" %} +
    • Docket Number: {{ cluster.docket.docket_number|default:"Unknown" }}
    • + {% endif %} + + {% if cluster.get_precedential_status_display != "Precedential" %} +
    • Precedential Status: {{ cluster.get_precedential_status_display|default:"Unknown" }}
• + {% endif %} + + {% if cluster.docket.court_id == 'scotus' and cluster.scdb_id %} +
    • Supreme Court DB ID: + + {{ cluster.scdb_id }} + +
    • + {% endif %} + + {% if cluster.panel.all.count > 0 %} +
    • Panel: + {% for p in cluster.panel.all %} + {{ p.name_full }}{% if not forloop.last %}, {% endif %} + {% endfor %} +
    • + {% endif %} + + {% if cluster.judges %} +
    • Judges: {{ cluster.judges }}
    • + {% endif %} + + {% if opinion.author %} +
    • Author: {{ opinion.author.name_full }}
    • + {% endif %} + + {% if opinion.joined_by.all.count > 0 %} +
    • Joined By: + {% for p in opinion.joined_by.all %} + {{ p.name_full }}{% if not forloop.last %}, {% endif %} + {% endfor %} +
    • + {% endif %} + + {% if cluster.nature_of_suit %} +
    • Nature of Suit: {{ cluster.nature_of_suit }}
• + {% endif %} + + {% if cluster.posture %} +
    • Posture: {{ cluster.posture }}
• + {% endif %} + + {% if cluster.other_dates %} +
    • Other Dates: {{ cluster.other_dates }}
    • + {% endif %} + + {% if cluster.disposition %} +
    • Disposition: {{ cluster.disposition }}
    • + {% endif %} +
    +
    +
    + + +
    + {% include "includes/opinion_tabs.html" %} + {% include "includes/notes_modal.html" %} + +
    +{% endblock %} + + +{% block footer-scripts %} + + + {% if request.user.is_staff %} + + {% if DEBUG %} + + {% else %} + + {% endif %} + {% endif %} + +{% endblock %} diff --git a/cl/opinion_page/tests.py b/cl/opinion_page/tests.py index ddc8a1e14e..1c1bd8c60c 100644 --- a/cl/opinion_page/tests.py +++ b/cl/opinion_page/tests.py @@ -114,6 +114,7 @@ async def test_simple_rd_page(self) -> None: self.assertEqual(response.status_code, HTTPStatus.OK) +@override_flag("ui_flag_for_o", False) class OpinionPageLoadTest( ESIndexTestCase, CourtTestCase, @@ -652,6 +653,7 @@ async def test_volume_pagination(self) -> None: self.assertEqual(volume_next, None) @override_flag("o-es-active", False) + @override_flag("ui_flag_for_o", False) def test_full_citation_redirect(self) -> None: """Do we get redirected to the correct URL when we pass in a full citation?""" @@ -740,7 +742,7 @@ async def test_can_filter_out_non_case_law_citation(self): ) self.assertEqual(r.status_code, HTTPStatus.OK) - self.assertTemplateUsed(r, "opinion.html") + self.assertTemplateUsed(r, "opinions.html") self.assertIn(str(chests_of_tea), r.content.decode()) async def test_show_error_for_non_opinion_citations(self): diff --git a/cl/opinion_page/urls.py b/cl/opinion_page/urls.py index 5e7a9e1a54..994b257993 100644 --- a/cl/opinion_page/urls.py +++ b/cl/opinion_page/urls.py @@ -12,14 +12,17 @@ download_docket_entries_csv, redirect_docket_recap, redirect_og_lookup, - view_authorities, view_docket, view_docket_feed, view_opinion, + view_opinion_authorities, + view_opinion_cited_by, + view_opinion_pdf, + view_opinion_related_cases, + view_opinion_summaries, view_parties, view_recap_authorities, view_recap_document, - view_summaries, ) urlpatterns = [ @@ -31,16 +34,6 @@ name="court_publish_page", ), # Opinion pages - path( - "opinion///summaries/", - view_summaries, # type: ignore[arg-type] - name="view_summaries", - ), - path( - "opinion///authorities/", - view_authorities, # type: ignore[arg-type] - name="view_authorities", - ), path( "opinion///visualizations/", cluster_visualizations, # type: ignore[arg-type] @@ -52,6 +45,31 @@ name="docket_feed", ), path("opinion///", view_opinion, name="view_case"), # type: ignore[arg-type] + path( + "opinion///authorities/", + view_opinion_authorities, + name="view_case_authorities", + ), # with the tab + path( + "opinion///cited-by/", + view_opinion_cited_by, + name="view_case_cited_by", + ), # with the tab + path( + "opinion///summaries/", + view_opinion_summaries, + name="view_case_summaries", + ), # with the tab + path( + "opinion///related-cases/", + view_opinion_related_cases, + name="view_case_related_cases", + ), # with the tab + path( + "opinion///pdf/", + view_opinion_pdf, + name="view_case_pdf", + ), # with the tab path( "docket//download/", download_docket_entries_csv, # type: ignore[arg-type] diff --git a/cl/opinion_page/utils.py b/cl/opinion_page/utils.py index 160453bb1f..33e0682211 100644 --- a/cl/opinion_page/utils.py +++ b/cl/opinion_page/utils.py @@ -157,8 +157,19 @@ async def build_cites_clusters_query( cluster_cites_query = cluster_search.query(cites_query) search_query = ( cluster_cites_query.sort({"citeCount": {"order": "desc"}}) - .source(includes=["absolute_url", "caseName", "dateFiled"]) - .extra(size=5, track_total_hits=True) + .source( + includes=[ + "absolute_url", + "caseName", + "cluster_id", + "docketNumber", + "citation", + "status", + "dateFiled", + ] + ) + .extra(size=20, track_total_hits=True) + .collapse(field="cluster_id") ) return search_query @@ -166,13 
+177,11 @@ async def build_cites_clusters_query( async def build_related_clusters_query( cluster_search: Search, sub_opinion_pks: list[str], - search_params: dict[str, str], ) -> Search: """Build the ES related clusters query based on sub-opinion IDs. :param cluster_search: The Elasticsearch DSL Search object :param sub_opinion_pks: A list of IDs representing sub-opinions to be queried. - :param search_params: A dict of parameters used to form the query. :return: The ES DSL Search object representing the query to find the related clusters. """ @@ -192,8 +201,18 @@ async def build_related_clusters_query( cluster_related_query = cluster_search.query(main_query) search_query = ( cluster_related_query.sort({"_score": {"order": "desc"}}) - .source(includes=["absolute_url", "caseName", "cluster_id"]) - .extra(size=5) + .source( + includes=[ + "absolute_url", + "caseName", + "cluster_id", + "docketNumber", + "citations", + "status", + "dateFiled", + ] + ) + .extra(size=20) .collapse(field="cluster_id") ) return search_query @@ -211,6 +230,186 @@ class RelatedCitingResults: timeout: bool = False +@dataclass +class RelatedClusterResults: + related_clusters: list[OpinionClusterDocument] = field( + default_factory=list + ) + sub_opinion_pks: list[int] = field(default_factory=list) + url_search_params: dict[str, str] = field(default_factory=dict) + timeout: bool = False + + +async def es_get_related_clusters_with_cache( + cluster: OpinionCluster, + request: HttpRequest, +) -> RelatedClusterResults: + """Elastic Related Clusters Search or Cache + + :param cluster:The cluster to use + :param request:The user request + :return:Related Cluster Data + """ + cache = caches["db_cache"] + mlt_cache_key = f"clusters-mlt-es:{cluster.pk}" + # By default, all statuses are included. Retrieve the PRECEDENTIAL_STATUS + # attributes (since they're indexed in ES) instead of the NAMES values. + search_params: CleanData = {} + url_search_params = { + f"stat_{v[0]}": "on" for v in PRECEDENTIAL_STATUS.NAMES + } + sub_opinion_pks = [ + str(pk) + async for pk in cluster.sub_opinions.values_list("pk", flat=True) + ] + if settings.RELATED_FILTER_BY_STATUS: + # Filter results by status (e.g., Precedential) + # Update URL parameters accordingly + search_params[ + f"stat_{PRECEDENTIAL_STATUS.get_status_value(settings.RELATED_FILTER_BY_STATUS)}" + ] = True + url_search_params = { + f"stat_{PRECEDENTIAL_STATUS.get_status_value(settings.RELATED_FILTER_BY_STATUS)}": "on" + } + + related_cluster_result = RelatedClusterResults( + url_search_params=url_search_params + ) + + if is_bot(request) or not sub_opinion_pks: + return related_cluster_result + + cached_related_clusters, timeout_related = ( + await cache.aget(mlt_cache_key) or (None, False) + if settings.RELATED_USE_CACHE + else (None, False) + ) + + # Prepare related cluster query if not cached results. 
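Stripped of the ES specifics, the caching flow this helper follows is a read-through cache keyed per cluster; roughly (a sketch only, with run_related_query standing in for the MLT query built below):

from django.conf import settings
from django.core.cache import caches

async def related_with_cache(cluster_pk, run_related_query):
    cache = caches["db_cache"]
    key = f"clusters-mlt-es:{cluster_pk}"
    cached, timed_out = await cache.aget(key) or (None, False)
    if cached is not None:
        return cached
    results, timed_out = await run_related_query()
    if not timed_out:
        # Only cache responses from queries that completed.
        await cache.aset(key, (results, timed_out), settings.RELATED_CACHE_TIMEOUT)
    return results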
+ cluster_search = OpinionClusterDocument.search() + + if cached_related_clusters is not None: + related_cluster_result.related_clusters = cached_related_clusters + related_cluster_result.timeout = timeout_related + related_cluster_result.sub_opinion_pks = list( + map(int, sub_opinion_pks) + ) + related_cluster_result.url_search_params = url_search_params + return related_cluster_result + + related_query = await build_related_clusters_query( + cluster_search, sub_opinion_pks + ) + + related_query = related_query.params( + timeout=f"{settings.ELASTICSEARCH_FAST_QUERIES_TIMEOUT}s" + ) + related_query = related_query.extra( + size=settings.RELATED_COUNT, track_total_hits=False + ) + try: + # Execute the Related Query if needed + response = related_query.execute() + timeout_related = False + except (ConnectionError, RequestError, ApiError) as e: + logger.warning("Error getting cited and related clusters: %s", e) + if settings.DEBUG is True: + traceback.print_exc() + return related_cluster_result + except ConnectionTimeout as e: + logger.warning( + "ConnectionTimeout getting cited and related clusters: %s", e + ) + response = None + timeout_related = True + + related_cluster_result.related_clusters = ( + response if response is not None else cached_related_clusters or [] + ) + related_cluster_result.timeout = False + related_cluster_result.sub_opinion_pks = list(map(int, sub_opinion_pks)) + + if timeout_related == False: + await cache.aset( + mlt_cache_key, + (related_cluster_result.related_clusters, timeout_related), + settings.RELATED_CACHE_TIMEOUT, + ) + return related_cluster_result + + +async def es_get_cited_clusters_with_cache( + cluster: OpinionCluster, + request: HttpRequest, +) -> RelatedCitingResults: + """Elastic cited by cluster search or cache + + :param cluster:The cluster to check + :param request:The user request + :return:The cited by data + """ + cache = caches["db_cache"] + cache_citing_key = f"clusters-cited-es:{cluster.pk}" + + sub_opinion_pks = [ + str(pk) + async for pk in cluster.sub_opinions.values_list("pk", flat=True) + ] + cluster_results = RelatedCitingResults() + if is_bot(request) or not sub_opinion_pks: + return cluster_results + + cached_citing_results, cached_citing_clusters_count, timeout_cited = ( + await cache.aget(cache_citing_key) or (None, False, False) + if settings.RELATED_USE_CACHE + else (None, False, False) + ) + + if cached_citing_results is not None: + cluster_results.citing_clusters = cached_citing_results + cluster_results.citing_cluster_count = cached_citing_clusters_count + cluster_results.timeout = timeout_cited + return cluster_results + + cluster_search = OpinionClusterDocument.search() + cited_query = await build_cites_clusters_query( + cluster_search, sub_opinion_pks + ) + try: + # Execute the Related Query if needed + response = cited_query.execute() + timeout_cited = False + except (ConnectionError, RequestError, ApiError) as e: + logger.warning("Error getting cited and related clusters: %s", e) + if settings.DEBUG is True: + traceback.print_exc() + return cluster_results + except ConnectionTimeout as e: + logger.warning( + "ConnectionTimeout getting cited and related clusters: %s", e + ) + response = None + timeout_cited = True + + citing_clusters = list(response) if not timeout_cited else [] + cluster_results.citing_clusters = citing_clusters + cluster_results.citing_cluster_count = ( + response.hits.total.value if response is not None else 0 + ) + cluster_results.timeout = False if citing_clusters else timeout_cited + if not 
cluster_results.timeout: + await cache.aset( + cache_citing_key, + ( + cluster_results.citing_clusters, + cluster_results.citing_cluster_count, + cluster_results.timeout, + ), + settings.RELATED_CACHE_TIMEOUT, + ) + return cluster_results + + async def es_get_citing_and_related_clusters_with_cache( cluster: OpinionCluster, request: HttpRequest, @@ -251,9 +450,12 @@ async def es_get_citing_and_related_clusters_with_cache( if is_bot(request) or not sub_opinion_pks: return RelatedCitingResults(url_search_params=url_search_params) - cached_citing_results, cached_citing_cluster_count, timeout_cited = ( - await cache.aget(cache_citing_key) or (None, 0, False) - ) + ( + cached_citing_results, + cached_citing_cluster_count, + timeout_cited, + ) = await cache.aget(cache_citing_key) or (None, 0, False) + cached_related_clusters, timeout_related = ( await cache.aget(mlt_cache_key) or (None, False) if settings.RELATED_USE_CACHE @@ -267,10 +469,11 @@ async def es_get_citing_and_related_clusters_with_cache( related_index = citing_index = None if cached_related_clusters is None: related_query = await build_related_clusters_query( - cluster_search, sub_opinion_pks, search_params + cluster_search, sub_opinion_pks ) related_query = related_query.extra( - size=settings.RELATED_COUNT, track_total_hits=False + size=settings.RELATED_COUNT, + track_total_hits=False, ) multi_search = multi_search.add(related_query) related_index = response_index @@ -340,3 +543,77 @@ async def es_get_citing_and_related_clusters_with_cache( results.timeout = any([timeout_cited, timeout_related]) results.sub_opinion_pks = list(map(int, sub_opinion_pks)) return results + + +async def es_cited_case_count( + cluster_id: int, sub_opinion_pks: list[str] +) -> int: + """Elastic quick cited by count query + + :param cluster_id: The cluster id to search with + :param sub_opinion_pks: The subopinion ids of the cluster + :return: Opinion Cited Count + """ + cache = caches["db_cache"] + cache_cited_by_key = f"cited-by-count-es:{cluster_id}" + cached_cited_by_count = await cache.aget(cache_cited_by_key) or None + if cached_cited_by_count is not None: + return cached_cited_by_count + + cluster_search = OpinionClusterDocument.search() + cites_query = Q( + "bool", + filter=[ + Q("match", cluster_child="opinion"), + Q("terms", **{"cites": sub_opinion_pks}), + ], + ) + cluster_cites_query = cluster_search.query(cites_query) + cited_by_count = cluster_cites_query.count() + + await cache.aset( + cache_cited_by_key, + cited_by_count, + settings.RELATED_CACHE_TIMEOUT, + ) + + return cited_by_count + + +async def es_related_case_count(cluster_id, sub_opinion_pks: list[str]) -> int: + """Elastic quick related cases count + + :param cluster_id: The cluster id of the object + :param sub_opinion_pks: The sub opinion ids of the cluster + :return: The count of related cases in elastic + """ + cache = caches["db_cache"] + cache_related_cases_key = f"related-cases-count-es:{cluster_id}" + cached_related_cases_count = ( + await cache.aget(cache_related_cases_key) or None + ) + if cached_related_cases_count is not None: + return cached_related_cases_count + + cluster_search = OpinionClusterDocument.search() + mlt_query = await build_more_like_this_query(sub_opinion_pks) + parent_filters = await sync_to_async(build_join_es_filters)( + {"type": SEARCH_TYPES.OPINION, "stat_published": True} + ) + default_parent_filter = [Q("match", cluster_child="opinion")] + parent_filters.extend(default_parent_filter) + main_query = Q( + "bool", + filter=default_parent_filter, + 
should=mlt_query, + minimum_should_match=1, + ) + cluster_related_query = cluster_search.query(main_query) + related_cases_count = cluster_related_query.count() + await cache.aset( + cache_related_cases_key, + related_cases_count, + settings.RELATED_CACHE_TIMEOUT, + ) + + return related_cases_count diff --git a/cl/opinion_page/views.py b/cl/opinion_page/views.py index 0eaf0addb9..38e129ef5f 100644 --- a/cl/opinion_page/views.py +++ b/cl/opinion_page/views.py @@ -10,7 +10,7 @@ from django.contrib import messages from django.core.exceptions import ObjectDoesNotExist, PermissionDenied from django.core.paginator import EmptyPage, PageNotAnInteger, Paginator -from django.db.models import IntegerField, Prefetch +from django.db.models import IntegerField, Prefetch, QuerySet from django.db.models.functions import Cast from django.http import HttpRequest, HttpResponseRedirect from django.http.response import ( @@ -78,7 +78,11 @@ from cl.opinion_page.types import AuthoritiesContext from cl.opinion_page.utils import ( core_docket_data, + es_cited_case_count, + es_get_cited_clusters_with_cache, es_get_citing_and_related_clusters_with_cache, + es_get_related_clusters_with_cache, + es_related_case_count, generate_docket_entries_csv_data, get_case_title, ) @@ -358,7 +362,6 @@ async def fetch_docket_entries(docket): async def view_docket( request: HttpRequest, pk: int, slug: str ) -> HttpResponse: - sort_order_asc = True form = DocketEntryFilterForm(request.GET, request=request) docket, context = await core_docket_data(request, pk) @@ -806,7 +809,9 @@ async def view_recap_authorities( @never_cache -async def view_opinion(request: HttpRequest, pk: int, _: str) -> HttpResponse: +async def view_opinion_old( + request: HttpRequest, pk: int, _: str +) -> HttpResponse: """Using the cluster ID, return the cluster of opinions. 
We also test if the cluster ID has a user note, and send data @@ -891,7 +896,7 @@ async def view_opinion(request: HttpRequest, pk: int, _: str) -> HttpResponse: sponsored = True view_authorities_url = reverse( - "view_authorities", args=[cluster.pk, cluster.slug] + "view_case_authorities", args=[cluster.pk, cluster.slug] ) authorities_context: AuthoritiesContext = AuthoritiesContext( citation_record=cluster, @@ -932,6 +937,155 @@ async def view_opinion(request: HttpRequest, pk: int, _: str) -> HttpResponse: ) +async def setup_opinion_context( + cluster: OpinionCluster, request: HttpRequest, tab: str +) -> dict[str, Any]: + """Generate the basic page information we need to load the page + + :param cluster: The opinion cluster + :param request: The HTTP request from the user + :param tab: The tab to load + :return: The opinion page context used to generate the page + """ + title = ", ".join( + [ + s + for s in [ + trunc(best_case_name(cluster), 100, ellipsis="..."), + await cluster.acitation_string(), + ] + if s.strip() + ] + ) + has_downloads = False + pdf_path = None + if cluster.filepath_pdf_harvard: + has_downloads = True + pdf_path = cluster.filepath_pdf_harvard + else: + async for sub_opinion in cluster.sub_opinions.all(): + if str(sub_opinion.local_path).endswith(".pdf"): + has_downloads = True + pdf_path = sub_opinion.local_path.url + break + elif sub_opinion.download_url: + has_downloads = True + pdf_path = None + + get_string = make_get_string(request) + + sub_opinion_pks = [ + str(opinion.pk) async for opinion in cluster.sub_opinions.all() + ] + + es_has_cited_opinions = await es_cited_case_count( + cluster.id, sub_opinion_pks + ) + es_has_related_opinions = await es_related_case_count( + cluster.id, sub_opinion_pks + ) + + try: + note = await Note.objects.aget( + cluster_id=cluster.pk, + user=await request.auser(), # type: ignore[attr-defined] + # type: ignore[attr-defined] + ) + except (ObjectDoesNotExist, TypeError): + # Not note or anonymous user + note_form = NoteForm( + initial={ + "cluster_id": cluster.pk, + "name": trunc(best_case_name(cluster), 100, ellipsis="..."), + } + ) + else: + note_form = NoteForm(instance=note) + + # Identify opinions updated/added in partnership with v|lex for 3 years + sponsored = False + if ( + cluster.date_created.date() > datetime.datetime(2022, 6, 1).date() + and cluster.filepath_json_harvard + ): + sponsored = True + + context = { + "tab": tab, + "title": title, + "caption": await cluster.acaption(), + "cluster": cluster, + "has_downloads": has_downloads, + "pdf_path": pdf_path, + "note_form": note_form, + "get_string": get_string, + "private": cluster.blocked, + "sponsored": sponsored, + "summaries_count": await cluster.parentheticals.acount(), + "authorities_count": await cluster.aauthority_count(), + "related_cases_count": es_has_related_opinions, + "cited_by_count": es_has_cited_opinions, + } + + return context + + +async def get_opinions_base_queryset() -> QuerySet: + return OpinionCluster.objects.prefetch_related( + "sub_opinions__opinions_cited", "citations" + ).select_related("docket__court") + + +async def render_opinion_view( + request: HttpRequest, + cluster: OpinionCluster, + tab: str, + additional_context: dict = {}, +) -> HttpResponse: + """Helper function to render opinion views with common context. 
+ + :param request: The HttpRequest object + :param pk: The primary key for the OpinionCluster + :param tab: The selected tab + :param additional_context: Any additional context to be passed to the view + :return: HttpResponse + """ + ui_flag_for_o = await sync_to_async(waffle.flag_is_active)( + request, "ui_flag_for_o" + ) + + if not any([ui_flag_for_o]): + return await view_opinion_old(request, cluster.pk, "str") + + context = await setup_opinion_context(cluster, request, tab=tab) + + if additional_context: + context.update(additional_context) + + # Just redirect if people attempt to URL hack to pages without content + tab_count_mapping = { + "pdf": "has_downloads", + "authorities": "authorities_count", + "cited-by": "cited_by_count", + "related-by": "related_by_count", + "summaries": "summaries_count", + } + + # Check if the current tab needs a redirect based on the mapping + if context["tab"] in tab_count_mapping: + count_key = tab_count_mapping[context["tab"]] + if not context[count_key]: + return HttpResponseRedirect( + reverse("view_case", args=[cluster.pk, cluster.slug]) + ) + + return TemplateResponse( + request, + "opinions.html", + context, + ) + + async def view_summaries( request: HttpRequest, pk: int, slug: str ) -> HttpResponse: @@ -984,6 +1138,174 @@ async def view_authorities( ) +@never_cache +async def view_opinion(request: HttpRequest, pk: int, _: str) -> HttpResponse: + """View Opinions + + :param request: HTTP request + :param pk: The cluster PK + :param _: url slug + :return: The old or new opinion HTML + """ + ui_flag_for_o = await sync_to_async(waffle.flag_is_active)( + request, "ui_flag_for_o" + ) + if not ui_flag_for_o: + return await view_opinion_old(request, pk, "str") + + cluster: OpinionCluster = await aget_object_or_404( + await get_opinions_base_queryset(), pk=pk + ) + return await render_opinion_view(request, cluster, "opinions") + + +async def view_opinion_pdf( + request: HttpRequest, pk: int, _: str +) -> HttpResponse: + """View Opinion PDF Tab + + :param request: HTTP request + :param pk: The cluster PK + :param _: url slug + :return: Opinion PDF tab + """ + cluster: OpinionCluster = await aget_object_or_404( + await get_opinions_base_queryset(), pk=pk + ) + return await render_opinion_view(request, cluster, "pdf") + + +async def view_opinion_authorities( + request: HttpRequest, pk: int, _: str +) -> HttpResponse: + """View Opinion Table of Authorities + + :param request: HTTP request + :param pk: The cluster PK + :param _: url slug + :return: Table of Authorities tab + """ + ui_flag_for_o = await sync_to_async(waffle.flag_is_active)( + request, "ui_flag_for_o" + ) + if not ui_flag_for_o: + # Old page to load for people outside the flag + return await view_authorities( + request=request, pk=pk, slug="authorities" + ) + + cluster: OpinionCluster = await aget_object_or_404( + await get_opinions_base_queryset(), pk=pk + ) + + additional_context = { + "authorities_with_data": await cluster.aauthorities_with_data(), + } + return await render_opinion_view( + request, cluster, "authorities", additional_context + ) + + +async def view_opinion_cited_by( + request: HttpRequest, pk: int, _: str +) -> HttpResponse: + """View Cited By Tab + + :param request: HTTP request + :param pk: The cluster PK + :param _: url slug + :return: Cited By tab + """ + cluster: OpinionCluster = await aget_object_or_404( + await get_opinions_base_queryset(), pk=pk + ) + cited_query = await es_get_cited_clusters_with_cache(cluster, request) + additional_context = { + "citing_clusters": 
cited_query.citing_clusters, + "citing_cluster_count": cited_query.citing_cluster_count, + } + return await render_opinion_view( + request, cluster, "cited-by", additional_context + ) + + +async def view_opinion_summaries( + request: HttpRequest, pk: int, _: str +) -> HttpResponse: + """View Opinion Summaries tab + + :param request: HTTP request + :param pk: The cluster PK + :param _: url slug + :return: Summaries tab + """ + ui_flag_for_o = await sync_to_async(waffle.flag_is_active)( + request, "ui_flag_for_o" + ) + if not ui_flag_for_o: + # Old page to load for people outside the flag + return await view_summaries(request=request, pk=pk, slug="summaries") + + cluster: OpinionCluster = await aget_object_or_404( + await get_opinions_base_queryset(), pk=pk + ) + parenthetical_groups_qs = await get_or_create_parenthetical_groups(cluster) + parenthetical_groups = [ + parenthetical_group + async for parenthetical_group in parenthetical_groups_qs.prefetch_related( + Prefetch( + "parentheticals", + queryset=Parenthetical.objects.order_by("-score"), + ), + "parentheticals__describing_opinion__cluster__citations", + "parentheticals__describing_opinion__cluster__docket__court", + "representative__describing_opinion__cluster__citations", + "representative__describing_opinion__cluster__docket__court", + ) + ] + ui_flag_for_o = await sync_to_async(waffle.flag_is_active)( + request, "ui_flag_for_o" + ) + if not ui_flag_for_o: + # Old page to load for people outside the flag + return await view_summaries(request=request, pk=pk, slug="summaries") + additional_context = { + "parenthetical_groups": parenthetical_groups, + "ui_flag_for_o": ui_flag_for_o, + } + return await render_opinion_view( + request, cluster, "summaries", additional_context + ) + + +async def view_opinion_related_cases( + request: HttpRequest, pk: int, _: str +) -> HttpResponse: + """View Related Cases Tab + + :param request: HTTP request + :param pk: The cluster PK + :param _: url slug + :return: Related Cases tab + """ + cluster: OpinionCluster = await aget_object_or_404( + await get_opinions_base_queryset(), pk=pk + ) + related_cluster_object = await es_get_related_clusters_with_cache( + cluster, request + ) + additional_context = { + "related_algorithm": "mlt", + "related_clusters": related_cluster_object.related_clusters, + "sub_opinion_ids": related_cluster_object.sub_opinion_pks, + "related_search_params": f"&{urlencode(related_cluster_object.url_search_params)}", + "queries_timeout": related_cluster_object.timeout, + } + return await render_opinion_view( + request, cluster, "related-cases", additional_context + ) + + async def cluster_visualizations( request: HttpRequest, pk: int, slug: str ) -> HttpResponse: diff --git a/cl/people_db/api_views.py b/cl/people_db/api_views.py index c593c1789a..7675ef32b6 100644 --- a/cl/people_db/api_views.py +++ b/cl/people_db/api_views.py @@ -1,5 +1,6 @@ from django.db.models import Exists, OuterRef, Prefetch from rest_framework import viewsets +from rest_framework.permissions import DjangoModelPermissionsOrAnonReadOnly from cl.api.api_permissions import V3APIPermission from cl.api.pagination import TinyAdjustablePagination @@ -90,7 +91,10 @@ class PersonDisclosureViewSet(viewsets.ModelViewSet): serializer_class = PersonDisclosureSerializer filterset_class = PersonDisclosureFilter pagination_class = TinyAdjustablePagination - permission_classes = [V3APIPermission] + permission_classes = [ + DjangoModelPermissionsOrAnonReadOnly, + V3APIPermission, + ] ordering_fields = ( "id", "date_created", @@ 
-122,7 +126,10 @@ class PersonViewSet(LoggingMixin, viewsets.ModelViewSet): ) serializer_class = PersonSerializer filterset_class = PersonFilter - permission_classes = [V3APIPermission] + permission_classes = [ + DjangoModelPermissionsOrAnonReadOnly, + V3APIPermission, + ] ordering_fields = ( "id", "date_created", @@ -145,7 +152,10 @@ class PositionViewSet(LoggingMixin, viewsets.ModelViewSet): queryset = Position.objects.all().order_by("-id") serializer_class = PositionSerializer filterset_class = PositionFilter - permission_classes = [V3APIPermission] + permission_classes = [ + DjangoModelPermissionsOrAnonReadOnly, + V3APIPermission, + ] ordering_fields = ( "id", "date_created", @@ -175,7 +185,10 @@ class RetentionEventViewSet(LoggingMixin, viewsets.ModelViewSet): queryset = RetentionEvent.objects.all().order_by("-id") serializer_class = RetentionEventSerializer filterset_class = RetentionEventFilter - permission_classes = [V3APIPermission] + permission_classes = [ + DjangoModelPermissionsOrAnonReadOnly, + V3APIPermission, + ] ordering_fields = ("id", "date_created", "date_modified", "date_retention") # Default cursor ordering key ordering = "-id" @@ -191,7 +204,10 @@ class EducationViewSet(LoggingMixin, viewsets.ModelViewSet): queryset = Education.objects.all().order_by("-id") serializer_class = EducationSerializer filterset_class = EducationFilter - permission_classes = [V3APIPermission] + permission_classes = [ + DjangoModelPermissionsOrAnonReadOnly, + V3APIPermission, + ] ordering_fields = ("id", "date_created", "date_modified") # Default cursor ordering key ordering = "-id" @@ -207,7 +223,10 @@ class SchoolViewSet(LoggingMixin, viewsets.ModelViewSet): queryset = School.objects.all().order_by("-id") serializer_class = SchoolSerializer filterset_class = SchoolFilter - permission_classes = [V3APIPermission] + permission_classes = [ + DjangoModelPermissionsOrAnonReadOnly, + V3APIPermission, + ] ordering_fields = ("id", "date_created", "date_modified", "name") # Default cursor ordering key ordering = "-id" @@ -223,7 +242,10 @@ class PoliticalAffiliationViewSet(LoggingMixin, viewsets.ModelViewSet): queryset = PoliticalAffiliation.objects.all().order_by("-id") serializer_class = PoliticalAffiliationSerializer filterset_class = PoliticalAffiliationFilter - permission_classes = [V3APIPermission] + permission_classes = [ + DjangoModelPermissionsOrAnonReadOnly, + V3APIPermission, + ] ordering_fields = ( "id", "date_created", @@ -245,7 +267,10 @@ class SourceViewSet(LoggingMixin, viewsets.ModelViewSet): queryset = Source.objects.all().order_by("-id") serializer_class = SourceSerializer filterset_class = SourceFilter - permission_classes = [V3APIPermission] + permission_classes = [ + DjangoModelPermissionsOrAnonReadOnly, + V3APIPermission, + ] ordering_fields = ( "id", "date_modified", @@ -261,7 +286,10 @@ class ABARatingViewSet(LoggingMixin, viewsets.ModelViewSet): queryset = ABARating.objects.all().order_by("-id") serializer_class = ABARatingSerializer filterset_class = ABARatingFilter - permission_classes = [V3APIPermission] + permission_classes = [ + DjangoModelPermissionsOrAnonReadOnly, + V3APIPermission, + ] ordering_fields = ( "id", "date_created", diff --git a/cl/recap/tasks.py b/cl/recap/tasks.py index 396314a25d..a2b6898cd0 100644 --- a/cl/recap/tasks.py +++ b/cl/recap/tasks.py @@ -2166,6 +2166,12 @@ def fetch_docket(self, fq_pk): newly_enqueued = enqueue_docket_alert(d_pk) if newly_enqueued: send_alert_and_webhook(d_pk, start_time) + + # Link docket to fq if not previously linked + if 
not fq.docket_id: + fq.docket_id = d_pk + fq.save() + return result diff --git a/cl/recap/tests.py b/cl/recap/tests.py index 099855883a..f0a4c77ae0 100644 --- a/cl/recap/tests.py +++ b/cl/recap/tests.py @@ -37,7 +37,10 @@ retry_webhook_events, ) from cl.api.models import Webhook, WebhookEvent, WebhookEventType -from cl.api.utils import get_next_webhook_retry_date +from cl.api.utils import ( + get_next_webhook_retry_date, + get_webhook_deprecation_date, +) from cl.lib.pacer import is_pacer_court_accessible, lookup_and_save from cl.lib.recap_utils import needs_ocr from cl.lib.redis_utils import get_redis_interface @@ -1127,6 +1130,7 @@ def test_fetch_docket_by_docket_number( result.get() fq.refresh_from_db() + self.assertEqual(fq.docket, self.docket) self.assertEqual(fq.status, PROCESSING_STATUS.SUCCESSFUL) rds = RECAPDocument.objects.all() self.assertEqual(rds.count(), 1) @@ -1143,6 +1147,7 @@ def test_fetch_docket_by_pacer_case_id( result = do_pacer_fetch(fq) result.get() fq.refresh_from_db() + self.assertEqual(fq.docket, self.docket) self.assertEqual(fq.status, PROCESSING_STATUS.SUCCESSFUL) rds = RECAPDocument.objects.all() self.assertEqual(rds.count(), 1) @@ -1179,6 +1184,8 @@ def test_fetch_docket_send_alert( result.get() self.assertEqual(len(mail.outbox), 1) self.assertIn(fakes.CASE_NAME, mail.outbox[0].subject) + fq.refresh_from_db() + self.assertEqual(fq.docket, self.docket) @mock.patch("cl.recap.api_serializers.get_or_cache_pacer_cookies") @@ -3806,12 +3813,14 @@ def setUpTestData(cls): event_type=WebhookEventType.DOCKET_ALERT, url="https://example.com/", enabled=True, + version=1, ) cls.webhook_2 = WebhookFactory( user=cls.user_profile_2.user, event_type=WebhookEventType.DOCKET_ALERT, url="https://example.com/", enabled=True, + version=2, ) test_dir = Path(settings.INSTALL_ROOT) / "cl" / "recap" / "test_assets" with ( @@ -4086,6 +4095,19 @@ async def test_new_recap_email_case_auto_subscription_prev_user( WEBHOOK_EVENT_STATUS.SUCCESSFUL, ) + with mock.patch("cl.users.signals.notify_new_or_updated_webhook"): + webhook_2_1 = await sync_to_async(WebhookFactory)( + user=self.user_profile.user, + event_type=WebhookEventType.DOCKET_ALERT, + url="https://example.com/", + enabled=True, + version=2, + ) + self.assertEqual( + await Webhook.objects.all().acount(), + 3, + msg="Wrong number of webhook endpoints", + ) # Trigger a new recap.email notification, same case, different document # from testing_1@recap.email, auto-subscription option enabled await self.async_client.post(self.path, self.data, format="json") @@ -4111,10 +4133,14 @@ async def test_new_recap_email_case_auto_subscription_prev_user( self.assertEqual(message_sent.to, [self.recipient_user.user.email]) self.assertEqual(len(mail.outbox), 3) - # Two more webhooks should be triggered, one for testing_2@recap.email - # and one for testing_1@recap.email + # 3 more webhooks should be triggered, one for testing_2@recap.email + # and 2 for testing_1@recap.email webhooks_triggered = WebhookEvent.objects.filter() - self.assertEqual(await webhooks_triggered.acount(), 3) + self.assertEqual( + await webhooks_triggered.acount(), + 4, + msg="Wrong number of webhooks.", + ) async for webhook_sent in webhooks_triggered: self.assertEqual( @@ -4124,6 +4150,36 @@ async def test_new_recap_email_case_auto_subscription_prev_user( self.assertEqual(await webhook_user_2.acount(), 2) webhook_user_1 = WebhookEvent.objects.filter(webhook=self.webhook) self.assertEqual(await webhook_user_1.acount(), 1) + webhook_2_user_1 = 
WebhookEvent.objects.filter(webhook=webhook_2_1) + self.assertEqual(await webhook_2_user_1.acount(), 1) + + # Confirm webhook versions. + version_1_webhook = await webhook_user_1.afirst() + webhook_version = version_1_webhook.content["webhook"]["version"] + self.assertEqual(webhook_version, 1) + + version_2_webhook = await webhook_2_user_1.afirst() + webhook_version = version_2_webhook.content["webhook"]["version"] + self.assertEqual(webhook_version, 2) + + version_2_webhook = await webhook_user_2.afirst() + webhook_version = version_2_webhook.content["webhook"]["version"] + self.assertEqual(webhook_version, 2) + + # Confirm deprecation date webhooks according the version. + v1_webhook_event = await WebhookEvent.objects.filter( + webhook=self.webhook + ).afirst() + v2_webhook_event = await WebhookEvent.objects.filter( + webhook=webhook_2_1 + ).afirst() + self.assertEqual( + v1_webhook_event.content["webhook"]["deprecation_date"], + get_webhook_deprecation_date(settings.WEBHOOK_V1_DEPRECATION_DATE), + ) + self.assertEqual( + v2_webhook_event.content["webhook"]["deprecation_date"], None + ) @mock.patch( "cl.recap.tasks.download_pdf_by_magic_number", @@ -7807,11 +7863,19 @@ class RecapFetchWebhooksTest(TestCase): def setUpTestData(cls): cls.court = CourtFactory(id="canb", jurisdiction="FB") cls.user_profile = UserProfileWithParentsFactory() - cls.webhook_enabled = WebhookFactory( + cls.webhook_v1_enabled = WebhookFactory( user=cls.user_profile.user, event_type=WebhookEventType.RECAP_FETCH, url="https://example.com/", enabled=True, + version=1, + ) + cls.webhook_v2_enabled = WebhookFactory( + user=cls.user_profile.user, + event_type=WebhookEventType.RECAP_FETCH, + url="https://example.com/", + enabled=True, + version=2, ) cls.user_profile_2 = UserProfileWithParentsFactory() @@ -7865,14 +7929,18 @@ def test_recap_fetch_docket_webhook( self.assertEqual(dockets.count(), 2) - # Only one webhook event should be triggered for user_profile since + # Two webhook events (v1, v2) should be triggered for user_profile since # user_profile_2 webhook endpoint is disabled. webhook_events = WebhookEvent.objects.all() - self.assertEqual(len(webhook_events), 1) + self.assertEqual(len(webhook_events), 2) self.assertEqual( webhook_events[0].webhook.user, self.user_profile.user, ) + self.assertEqual( + webhook_events[1].webhook.user, + self.user_profile.user, + ) content = webhook_events[0].content # Compare the webhook event payload self.assertEqual( @@ -7885,6 +7953,27 @@ def test_recap_fetch_docket_webhook( ) self.assertNotEqual(content["payload"]["date_completed"], None) + # Confirm webhooks for V1 and V2 are properly triggered. + webhook_versions = { + webhook.content["webhook"]["version"] for webhook in webhook_events + } + self.assertEqual(webhook_versions, {1, 2}) + + # Confirm deprecation date webhooks according the version. 
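For orientation, the fragment these assertions inspect has roughly the following shape; only names visible in this diff are used, and the V1 value comes from the helper imported at the top of the test module:

from django.conf import settings
from cl.api.utils import get_webhook_deprecation_date

# Sketch of content["webhook"] for a V1 endpoint; a V2 endpoint carries
# version 2 and a deprecation_date of None.
webhook_fragment = {
    "version": 1,
    "deprecation_date": get_webhook_deprecation_date(
        settings.WEBHOOK_V1_DEPRECATION_DATE
    ),
}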
+ v1_webhook_event = WebhookEvent.objects.filter( + webhook=self.webhook_v1_enabled + ).first() + v2_webhook_event = WebhookEvent.objects.filter( + webhook=self.webhook_v2_enabled + ).first() + self.assertEqual( + v1_webhook_event.content["webhook"]["deprecation_date"], + get_webhook_deprecation_date(settings.WEBHOOK_V1_DEPRECATION_DATE), + ) + self.assertEqual( + v2_webhook_event.content["webhook"]["deprecation_date"], None + ) + @mock.patch( "cl.recap.mergers.AttachmentPage", new=fakes.FakeAttachmentPage, @@ -7923,10 +8012,10 @@ def test_recap_attachment_page_webhook( fq.refresh_from_db() self.assertEqual(fq.status, PROCESSING_STATUS.SUCCESSFUL) - # Only one webhook event should be triggered for user_profile since + # Two webhook events (v1, v2) should be triggered for user_profile since # user_profile_2 webhook endpoint is disabled. webhook_events = WebhookEvent.objects.all() - self.assertEqual(len(webhook_events), 1) + self.assertEqual(len(webhook_events), 2) self.assertEqual( webhook_events[0].webhook.user, @@ -7984,10 +8073,10 @@ def test_recap_pacer_doc_webhook( fq.refresh_from_db() self.assertEqual(fq.status, PROCESSING_STATUS.SUCCESSFUL) - # Only one webhook event should be triggered for user_profile since + # Two webhook events (v1, v2) should be triggered for user_profile since # user_profile_2 webhook endpoint is disabled. webhook_events = WebhookEvent.objects.all() - self.assertEqual(len(webhook_events), 1) + self.assertEqual(len(webhook_events), 2) self.assertEqual( webhook_events[0].webhook.user, diff --git a/cl/recap/views.py b/cl/recap/views.py index 9bb70cb6cf..f2383b8868 100644 --- a/cl/recap/views.py +++ b/cl/recap/views.py @@ -3,7 +3,10 @@ from asgiref.sync import async_to_sync, sync_to_async from django.contrib.auth.models import User from rest_framework.exceptions import ValidationError -from rest_framework.permissions import IsAuthenticatedOrReadOnly +from rest_framework.permissions import ( + DjangoModelPermissionsOrAnonReadOnly, + IsAuthenticatedOrReadOnly, +) from rest_framework.viewsets import ModelViewSet from cl.api.api_permissions import V3APIPermission @@ -179,7 +182,10 @@ class FjcIntegratedDatabaseViewSet(LoggingMixin, ModelViewSet): queryset = FjcIntegratedDatabase.objects.all().order_by("-id") serializer_class = FjcIntegratedDatabaseSerializer filterset_class = FjcIntegratedDatabaseFilter - permission_classes = [V3APIPermission] + permission_classes = [ + DjangoModelPermissionsOrAnonReadOnly, + V3APIPermission, + ] ordering_fields = ( "id", "date_created", diff --git a/cl/scrapers/admin.py b/cl/scrapers/admin.py index 694fb79d17..56ec54df03 100644 --- a/cl/scrapers/admin.py +++ b/cl/scrapers/admin.py @@ -1,4 +1,5 @@ from django.contrib import admin +from django.db import models from cl.scrapers.models import ( PACERFreeDocumentLog, @@ -29,3 +30,39 @@ class PACERFreeDocumentRowAdmin(admin.ModelAdmin): admin.site.register(UrlHash) + + +class MVLatestOpinion(models.Model): + """ + Model linked to materialized view for monitoring scrapers + + The SQL for creating the view is on it's migration file. 
+ + Must use `REFRESH MATERIALIZED VIEW scrapers_mv_latest_opinion` + periodically + """ + + # a django model must have a primary key + court_id = models.TextField(primary_key=True) + latest_creation_date = models.DateTimeField() + time_since = models.TextField() + view_last_updated = models.DateTimeField() + + class Meta: + managed = False + db_table = "scrapers_mv_latest_opinion" + + +@admin.register(MVLatestOpinion) +class MVLatestOpinionAdmin(admin.ModelAdmin): + """Admin page to look at the latest opinion for each court + + Use this to monitor silently failing scrapers + """ + + list_display = [ + "court_id", + "latest_creation_date", + "time_since", + "view_last_updated", + ] diff --git a/cl/scrapers/management/commands/cl_back_scrape_citations.py b/cl/scrapers/management/commands/cl_back_scrape_citations.py index b2da0a4581..a445df9438 100644 --- a/cl/scrapers/management/commands/cl_back_scrape_citations.py +++ b/cl/scrapers/management/commands/cl_back_scrape_citations.py @@ -24,6 +24,7 @@ class Command(cl_back_scrape_opinions.Command): scrape_target_descr = "citations" + juriscraper_module_type = "opinions" def scrape_court( self, diff --git a/cl/scrapers/management/commands/cl_scrape_opinions.py b/cl/scrapers/management/commands/cl_scrape_opinions.py index 67dac880ab..8fe42e893a 100644 --- a/cl/scrapers/management/commands/cl_scrape_opinions.py +++ b/cl/scrapers/management/commands/cl_scrape_opinions.py @@ -18,7 +18,7 @@ from cl.alerts.models import RealTimeQueue from cl.citations.utils import map_reporter_db_cite_type -from cl.lib.command_utils import VerboseCommand, logger +from cl.lib.command_utils import ScraperCommand, logger from cl.lib.crypto import sha1 from cl.lib.string_utils import trunc from cl.people_db.lookup_utils import lookup_judges_by_messy_str @@ -217,14 +217,16 @@ def save_everything( ) -class Command(VerboseCommand): +class Command(ScraperCommand): help = "Runs the Juriscraper toolkit against one or many jurisdictions." + juriscraper_module_type = "opinions" scrape_target_descr = "opinions" # for logging purposes def __init__(self, stdout=None, stderr=None, no_color=False): super().__init__(stdout=None, stderr=None, no_color=False) def add_arguments(self, parser): + super().add_arguments(parser) parser.add_argument( "--daemon", action="store_true", @@ -246,20 +248,6 @@ def add_arguments(self, parser): "is 30 minutes." ), ) - parser.add_argument( - "--courts", - type=str, - dest="court_id", - metavar="COURTID", - required=True, - help=( - "The court(s) to scrape and extract. This should be " - "in the form of a python module or package import " - "from the Juriscraper library, e.g. " - '"juriscraper.opinions.united_states.federal_appellate.ca1" ' - 'or simply "opinions" to do all opinions.' 
- ), - ) parser.add_argument( "--fullcrawl", dest="full_crawl", diff --git a/cl/scrapers/management/commands/cl_scrape_oral_arguments.py b/cl/scrapers/management/commands/cl_scrape_oral_arguments.py index ad284381f4..62377a98ec 100644 --- a/cl/scrapers/management/commands/cl_scrape_oral_arguments.py +++ b/cl/scrapers/management/commands/cl_scrape_oral_arguments.py @@ -107,6 +107,7 @@ def make_objects( class Command(cl_scrape_opinions.Command): scrape_target_descr = "oral arguments" + juriscraper_module_type = "oral_args" def ingest_a_case( self, diff --git a/cl/scrapers/management/commands/refresh_scrapers_status_view.py b/cl/scrapers/management/commands/refresh_scrapers_status_view.py new file mode 100644 index 0000000000..e0bf692f30 --- /dev/null +++ b/cl/scrapers/management/commands/refresh_scrapers_status_view.py @@ -0,0 +1,17 @@ +from django.db import connection + +from cl.lib.command_utils import VerboseCommand, logger + + +class Command(VerboseCommand): + help = """Refreshes the `scrapers_mv_latest_opinion` materialized view. + + Check the cl.scrapers.admin.py file for more info about the view + """ + + def handle(self, *args, **options): + query = "REFRESH MATERIALIZED VIEW scrapers_mv_latest_opinion;" + with connection.cursor() as cursor: + cursor.execute(query) + + logger.info("View refresh completed successfully") diff --git a/cl/scrapers/management/commands/update_from_text.py b/cl/scrapers/management/commands/update_from_text.py new file mode 100644 index 0000000000..ee093d9e01 --- /dev/null +++ b/cl/scrapers/management/commands/update_from_text.py @@ -0,0 +1,206 @@ +import traceback +from datetime import datetime + +from django.db import transaction + +from cl.lib.command_utils import ScraperCommand, logger +from cl.scrapers.tasks import update_document_from_text +from cl.search.models import ( + PRECEDENTIAL_STATUS, + SOURCES, + Opinion, + OpinionCluster, +) + + +def rerun_extract_from_text( + opinion: Opinion, juriscraper_module: str, stats: dict[str, int] +): + """ + Reruns `update_document_from_text` from the scraper flow, saving changes + + `update_document_from_text` calls `Site.extract_from_text` and assigns + any changes to the proper objets, in place, but they are not saved. + This method saves the ones with actual changes + + :param opinion: the Opinion on which to apply extract_from_text + :param juriscraper_module: the scraper module path + :param stats: dict to accumulate counts for reporting. Modified in place + + :return None + """ + if not opinion.plain_text and not opinion.html: + # May be an opinion entirely from a merged corpus + # or an error during text extraction + logger.info( + "Opinion %s has no `plain_text` or `html` to extract from", + opinion.id, + ) + stats["No text to extract from"] += 1 + return + + with transaction.atomic(): + try: + changes = update_document_from_text(opinion, juriscraper_module) + except: + # Probably a bad implementation of `extract_from_text` + logger.debug( + "`update_document_from_text` failed for opinion %s. 
Traceback: %s", + opinion.id, + traceback.format_exc(), + ) + stats["Error"] += 1 + return + + if not changes: + logger.info("Did not get any metadata for opinion %s", opinion.id) + stats["No metadata extracted"] += 1 + return + + logger.info("Processing opinion %s", opinion.id) + + # Check if changes exist before saving, to prevent unnecessary DB queries + if changes.get("Docket"): + opinion.cluster.docket.save() + logger.debug( + "Docket %s updated with data %s", + opinion.cluster.docket.id, + changes["Docket"], + ) + stats["Docket"] += 1 + + if changes.get("OpinionCluster"): + opinion.cluster.save() + logger.debug( + "OpinionCluster %s updated with data %s", + opinion.cluster.id, + changes["OpinionCluster"], + ) + stats["OpinionCluster"] += 1 + + if changes.get("Opinion"): + opinion.save() + logger.debug("Opinion updated with data %s", changes["Opinion"]) + stats["Opinion"] += 1 + + if changes.get("Citation"): + if changes["Citation"].get("citation_created"): + logger.info( + "Citation created with data %s", changes["Citation"] + ) + stats["Citation"] += 1 + else: + logger.debug( + "Citation not created. Data %s", changes["Citation"] + ) + + +class Command(ScraperCommand): + help = """Updates objects by running Site.extract_from_text + over extracted content found on Opinion.plain_text or Opinion.html. + + If `--opinion-ids` is used, filters will be ignored. + If not, the 2 date filters will be required, to prevent triggering + unwanted reprocessing of the whole court's dataset + + Recommended use is to run over a sample of the target time period + and check if updates over Docket, OpinionCluster, Opinion and + Citation are as expected + """ + # For aggregate reporting at the end of the command + stats = { + "Docket": 0, + "OpinionCluster": 0, + "Opinion": 0, + "Citation": 0, + "No text to extract from": 0, + "No metadata extracted": 0, + "Error": 0, + } + juriscraper_module_type = "opinions" + + def add_arguments(self, parser): + super().add_arguments(parser) + parser.add_argument( + "--opinion-ids", + nargs="+", + type=int, + help="""The Opinion ids to re-process. + May be more than one. If this argument is used, + other filters will be ignored""", + ) + parser.add_argument( + "--date-filed-gte", + default="", + type=self.parse_input_date, + help=r"""A filter value in %Y-%m-%d or %Y/%m/%d format. + OpinionCluster.date_filed will have to be greater or equal""", + ) + parser.add_argument( + "--date-filed-lte", + default="", + type=self.parse_input_date, + help=r"""A filter value in %Y-%m-%d or %Y/%m/%d format. + OpinionCluster.date_filed will have to be less or equal""", + ) + parser.add_argument( + "--cluster-status", + default="", + choices=[value for value, name in PRECEDENTIAL_STATUS.NAMES], + help="""A value of OpinionCluster.precedential_status. 
To be + used for filtering the Opinions to be processed + """, + ) + + def handle(self, *args, **options): + super().handle(*args, **options) + juriscraper_module = options["court_id"] + + if options["opinion_ids"]: + opinions = Opinion.objects.filter(id__in=options["opinion_ids"]) + for op in opinions: + rerun_extract_from_text(op, juriscraper_module, self.stats) + + logger.info("Modified objects counts: %s", self.stats) + return + + if not (options["date_filed_gte"] and options["date_filed_lte"]): + raise ValueError( + "Both `date-filed-gte` and `date-filed-lte` arguments should have values" + ) + + court_id = juriscraper_module.split(".")[-1].split("_")[0] + query = { + "docket__court_id": court_id, + "date_filed__gte": options["date_filed_gte"], + "date_filed__lte": options["date_filed_lte"], + "source__contains": SOURCES.COURT_WEBSITE, + } + + if options["cluster_status"]: + query["precedential_status"] = options["cluster_status"] + + qs = OpinionCluster.objects.filter(**query).prefetch_related( + "sub_opinions" + ) + logger.debug("Found %s objects matching query %s", qs.count(), query) + + for cluster in qs: + opinions = cluster.sub_opinions.all() + for op in opinions: + rerun_extract_from_text(op, juriscraper_module, self.stats) + + logger.info("Modified objects counts: %s", self.stats) + + def parse_input_date(self, date_string: str) -> datetime | str: + """Parses a date string in accepted formats + + :param date_string: the date string in "%Y/%m/%d" or "%Y-%m-%d" + :return: an empty string if the input was empty; or the date object + """ + parsed_date = "" + if "/" in date_string: + parsed_date = datetime.strptime(date_string, "%Y/%m/%d") + elif "-" in date_string: + parsed_date = datetime.strptime(date_string, "%Y-%m-%d") + return parsed_date diff --git a/cl/scrapers/migrations/0004_create_mv_latest_opinion.py b/cl/scrapers/migrations/0004_create_mv_latest_opinion.py new file mode 100644 index 0000000000..4570c75d97 --- /dev/null +++ b/cl/scrapers/migrations/0004_create_mv_latest_opinion.py @@ -0,0 +1,69 @@ +# Generated by Django 5.1.2 on 2024-11-25 15:27 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("scrapers", "0003_delete_errorlog"), + ] + + operations = [ + migrations.CreateModel( + name="MVLatestOpinion", + fields=[ + ( + "court_id", + models.TextField(primary_key=True, serialize=False), + ), + ("latest_creation_date", models.DateTimeField()), + ("time_since", models.TextField()), + ("view_last_updated", models.DateTimeField()), + ], + options={ + "db_table": "scrapers_mv_latest_opinion", + "managed": False, + }, + ), + migrations.RunSQL(""" + CREATE MATERIALIZED VIEW IF NOT EXISTS + scrapers_mv_latest_opinion + AS + ( + SELECT + court_id, + max(so.date_created) as latest_creation_date, + DATE_TRUNC('minutes', (now() - max(so.date_created)))::text as time_since, + now() as view_last_updated + FROM + ( + SELECT id, court_id + FROM search_docket + WHERE court_id IN ( + SELECT id + FROM search_court + /* + Only check courts with scrapers in use + */ + WHERE + has_opinion_scraper + AND in_use + ) + ) sd + INNER JOIN + (SELECT id, docket_id FROM search_opinioncluster) soc ON soc.docket_id = sd.id + INNER JOIN + search_opinion so ON so.cluster_id = soc.id + GROUP BY + sd.court_id + HAVING + /* + Only return results for courts with no updates in a week + */ + now() - max(so.date_created) > interval '7 days' + ORDER BY + 2 DESC + ) + """) + ] diff --git a/cl/scrapers/migrations/0004_create_mv_latest_opinion.sql 
b/cl/scrapers/migrations/0004_create_mv_latest_opinion.sql new file mode 100644 index 0000000000..45c212298e --- /dev/null +++ b/cl/scrapers/migrations/0004_create_mv_latest_opinion.sql @@ -0,0 +1,49 @@ +BEGIN; +-- +-- Create model MVLatestOpinion +-- +-- (no-op) +-- +-- Raw SQL operation +-- + + CREATE MATERIALIZED VIEW IF NOT EXISTS + scrapers_mv_latest_opinion + AS + ( + SELECT + court_id, + max(so.date_created) as latest_creation_date, + DATE_TRUNC('minutes', (now() - max(so.date_created)))::text as time_since, + now() as view_last_updated + FROM + ( + SELECT id, court_id + FROM search_docket + WHERE court_id IN ( + SELECT id + FROM search_court + /* + Only check courts with scrapers in use + */ + WHERE + has_opinion_scraper + AND in_use + ) + ) sd + INNER JOIN + (SELECT id, docket_id FROM search_opinioncluster) soc ON soc.docket_id = sd.id + INNER JOIN + search_opinion so ON so.cluster_id = soc.id + GROUP BY + sd.court_id + HAVING + /* + Only return results for courts with no updates in a week + */ + now() - max(so.date_created) > interval '7 days' + ORDER BY + 2 DESC + ) + ; +COMMIT; diff --git a/cl/scrapers/tasks.py b/cl/scrapers/tasks.py index c60971c572..7bbc8bb40b 100644 --- a/cl/scrapers/tasks.py +++ b/cl/scrapers/tasks.py @@ -30,6 +30,7 @@ from cl.lib.string_utils import trunc from cl.lib.utils import is_iter from cl.recap.mergers import save_iquery_to_docket +from cl.scrapers.utils import scraped_citation_object_is_valid from cl.search.models import Docket, Opinion, RECAPDocument logger = logging.getLogger(__name__) @@ -39,7 +40,7 @@ def update_document_from_text( opinion: Opinion, juriscraper_module: str = "" -) -> None: +) -> dict: """Extract additional metadata from document text We use this code with BIA decisions. Previously Tax. @@ -54,12 +55,13 @@ def update_document_from_text( :param opinion: Opinion object :param juriscraper_module: full module to get Site object - :return: None + :return: the extracted data dictionary """ court = opinion.cluster.docket.court.pk site = get_scraper_object_by_name(court, juriscraper_module) if site is None: - return + logger.debug("No site found %s", juriscraper_module) + return {} metadata_dict = site.extract_from_text(opinion.plain_text or opinion.html) for model_name, data in metadata_dict.items(): @@ -70,7 +72,9 @@ def update_document_from_text( opinion.cluster.__dict__.update(data) elif model_name == "Citation": data["cluster_id"] = opinion.cluster_id - ModelClass.objects.get_or_create(**data) + if scraped_citation_object_is_valid(data): + _, citation_created = ModelClass.objects.get_or_create(**data) + metadata_dict["Citation"]["created"] = citation_created elif model_name == "Opinion": opinion.__dict__.update(data) else: @@ -78,6 +82,8 @@ def update_document_from_text( f"Object type of {model_name} not yet supported." 
) + + return metadata_dict + + @app.task( + bind=True, diff --git a/cl/scrapers/test_assets/test_opinion_scraper.py b/cl/scrapers/test_assets/test_opinion_scraper.py index 508be0dfec..18a28d71de 100644 --- a/cl/scrapers/test_assets/test_opinion_scraper.py +++ b/cl/scrapers/test_assets/test_opinion_scraper.py @@ -1,3 +1,4 @@ +import re from datetime import datetime from os.path import join @@ -53,3 +54,23 @@ def _get_nature_of_suit(self): def _get_judges(self): path = "//judge/text()" return list(self.html.xpath(path)) + + def extract_from_text(self, scraped_text): + metadata = {} + docket_regex = r"Docket Number: (?P<docket>\d+-\d+)" + disposition_regex = r"Disposition: (?P<disposition>\w+)" + citation_regex = r"(?P<volume>20\d{2}) (?P<reporter>VT) (?P<page>\d+)" + if docket_match := re.search(docket_regex, scraped_text): + metadata["Docket"] = { + "docket_number": docket_match.group("docket") + } + + if disposition_match := re.search(disposition_regex, scraped_text): + metadata["OpinionCluster"] = { + "disposition": disposition_match.group("disposition") + } + + if citation_match := re.search(citation_regex, scraped_text): + metadata["Citation"] = {**citation_match.groupdict(), "type": 8} + + return metadata diff --git a/cl/scrapers/tests.py b/cl/scrapers/tests.py index 66ce4e9457..232a6564b4 100644 --- a/cl/scrapers/tests.py +++ b/cl/scrapers/tests.py @@ -1,5 +1,5 @@ import os -from datetime import datetime, timedelta +from datetime import date, datetime, timedelta from http import HTTPStatus from pathlib import Path from unittest import TestCase, mock @@ -17,6 +17,7 @@ from cl.api.models import WebhookEvent, WebhookEventType from cl.audio.factories import AudioWithParentsFactory from cl.audio.models import Audio +from cl.lib.juriscraper_utils import get_module_by_court_id from cl.lib.microservice_utils import microservice from cl.lib.test_helpers import generate_docket_target_sources from cl.scrapers.DupChecker import DupChecker @@ -29,6 +30,7 @@ cl_back_scrape_citations, cl_scrape_opinions, cl_scrape_oral_arguments, + update_from_text, ) from cl.scrapers.models import UrlHash from cl.scrapers.tasks import extract_doc_content, process_audio_file @@ -38,6 +40,7 @@ get_binary_content, get_existing_docket, get_extension, + scraped_citation_object_is_valid, update_or_create_docket, ) from cl.search.factories import ( @@ -46,7 +49,7 @@ OpinionClusterFactory, OpinionFactory, ) -from cl.search.models import Citation, Court, Docket, Opinion +from cl.search.models import SOURCES, Citation, Court, Docket, Opinion from cl.settings import MEDIA_ROOT from cl.tests.cases import ESIndexTestCase, SimpleTestCase, TestCase from cl.tests.fixtures import ONE_SECOND_MP3_BYTES, SMALL_WAV_BYTES @@ -863,3 +866,168 @@ def test_federal_jurisdictions(self): self.assertEqual( docket, self.ca2_docket, "Should match using docket number core" ) + + +class UpdateFromTextCommandTest(TestCase): + """Test the input processing and DB querying for the command""" + + def setUp(self): + self.vt = CourtFactory(id="vt") + self.sc = CourtFactory(id="sc") + self.docket_sc = DocketFactory(court=self.sc, docket_number="20") + + # Different dates, status and courts to test command behaviour + self.opinion_2020 = OpinionFactory( + cluster=OpinionClusterFactory( + docket=DocketFactory(court=self.vt, docket_number="12"), + date_filed=date(2020, 6, 1), + precedential_status="Published", + source=SOURCES.COURT_M_HARVARD, + ), + plain_text="""Docket Number: 2020-12 + Disposition: Affirmed + 2020 VT 11""", + ) + self.opinion_2020_unpub = OpinionFactory( + cluster=OpinionClusterFactory( +
docket=DocketFactory(court=self.vt, docket_number="13"), + date_filed=date(2020, 7, 1), + precedential_status="Unpublished", + source=SOURCES.COURT_WEBSITE, + ), + plain_text="Docket Number: 2020-13\nDisposition: Affirmed", + ) + + self.opinion_sc = OpinionFactory( + cluster=OpinionClusterFactory( + docket=self.docket_sc, + date_filed=date(2021, 6, 1), + precedential_status="Published", + source=SOURCES.COURT_WEBSITE, + ), + plain_text="Some text with no matches", + id=101, + ) + + self.opinion_2022 = OpinionFactory( + cluster=OpinionClusterFactory( + docket=DocketFactory(court=self.vt, docket_number="13"), + date_filed=date(2022, 6, 1), + precedential_status="Unpublished", + source=SOURCES.COURT_WEBSITE, + ), + id=100, + plain_text="Docket Number: 2022-13\n2022 VT 11", + ) + + def test_inputs(self): + """Do all command inputs work properly?""" + + # will target a single opinion, for which extract_from_text + # extracts no metadata. No object should be updated + cmd = update_from_text.Command() + with mock.patch( + "cl.scrapers.tasks.get_scraper_object_by_name", + return_value=test_opinion_scraper.Site(), + ): + cmd.handle(court_id="somepath.sc", opinion_ids=[101]) + + self.assertFalse( + any( + [ + cmd.stats["Docket"], + cmd.stats["OpinionCluster"], + cmd.stats["Citation"], + cmd.stats["Opinion"], + ] + ), + "No object should be modified", + ) + + # will target 1 opinion, there are 2 in the time period + # and 3 for the court + with mock.patch( + "cl.scrapers.tasks.get_scraper_object_by_name", + return_value=test_opinion_scraper.Site(), + ): + update_from_text.Command().handle( + court_id="somepath.vt", + opinion_ids=[], + date_filed_gte=datetime(2020, 5, 1), + date_filed_lte=datetime(2021, 6, 1), + cluster_status="Published", + ) + + # Test that objects were actually updated / created + self.assertEqual( + Citation.objects.filter(cluster=self.opinion_2020.cluster).count(), + 1, + "There should be a single citation for this cluster", + ) + self.opinion_2020.refresh_from_db() + self.opinion_2020.cluster.refresh_from_db() + self.opinion_2020.cluster.docket.refresh_from_db() + self.assertEqual( + self.opinion_2020.cluster.disposition, + "Affirmed", + "OpinionCluster.disposition was not updated", + ) + self.assertEqual( + self.opinion_2020.cluster.docket.docket_number, + "2020-12", + "Docket.docket_number was not updated", + ) + + # Check that other objects in the time period and court + # were not modified. Meaning, the filter worked + self.assertEqual( + self.opinion_2020_unpub.cluster.docket.docket_number, + "13", + "Unpublished docket should not be modified", + ) + + def test_scraped_citation_object_is_valid(self): + """Can we validate Citation dicts got from `Site.extract_from_text`""" + bad_type = {"reporter": "WI", "type": Citation.FEDERAL} + self.assertFalse( + scraped_citation_object_is_valid(bad_type), + "Citation should be marked as invalid. Type does not match reporter", + ) + + bad_reporter = {"reporter": "Some text"} + self.assertFalse( + scraped_citation_object_is_valid(bad_reporter), + "Citation should be marked as invalid. 
Reporter does not exist", + ) + + valid_citation = {"reporter": "WI", "type": Citation.NEUTRAL} + self.assertTrue( + scraped_citation_object_is_valid(valid_citation), + "Citation object should be marked as valid", + ) + + +class CommandInputTest(TestCase): + def test_get_module_by_court_id(self): + """Test if get_module_by_court_id helper is working properly""" + try: + get_module_by_court_id("lactapp", "opinions") + self.fail("Court id matches more than 1 Site object, should fail") + except ValueError: + pass + + try: + get_module_by_court_id("ca1", "something") + self.fail("Invalid module type, should fail") + except ValueError: + pass + + # same court, different type + self.assertEqual( + "juriscraper.opinions.united_states.federal_appellate.ca1", + get_module_by_court_id("ca1", "opinions"), + ) + self.assertEqual( + "juriscraper.oral_args.united_states.federal_appellate.ca1", + get_module_by_court_id("ca1", "oral_args"), + ) diff --git a/cl/scrapers/utils.py b/cl/scrapers/utils.py index 31134ce3d2..370e2a4542 100644 --- a/cl/scrapers/utils.py +++ b/cl/scrapers/utils.py @@ -1,5 +1,4 @@ import os -import sys from datetime import date from typing import Optional, Tuple from urllib.parse import urljoin @@ -9,15 +8,16 @@ from asgiref.sync import async_to_sync from courts_db import find_court_by_id, find_court_ids_by_name from django.conf import settings -from django.db.models import Q, QuerySet +from django.db.models import Q from juriscraper import AbstractSite from juriscraper.AbstractSite import logger from juriscraper.lib.test_utils import MockRequest from lxml import html +from reporters_db import REPORTERS from requests import Response, Session +from cl.citations.utils import map_reporter_db_cite_type from cl.corpus_importer.utils import winnow_case_name -from cl.lib.celery_utils import CeleryThrottle from cl.lib.decorators import retry from cl.lib.microservice_utils import microservice from cl.recap.mergers import find_docket_object @@ -26,8 +26,7 @@ NoDownloadUrlError, UnexpectedContentTypeError, ) -from cl.scrapers.tasks import extract_recap_pdf -from cl.search.models import Court, Docket, RECAPDocument +from cl.search.models import Court, Docket def get_child_court(child_court_name: str, court_id: str) -> Optional[Court]: @@ -242,53 +241,6 @@ def signal_handler(signal, frame): die_now = True -def extract_recap_documents( - docs: QuerySet, - ocr_available: bool = True, - order_by: Optional[str] = None, - queue: Optional[str] = None, -) -> None: - """Loop over RECAPDocuments and extract their contents. Use OCR if requested. - - :param docs: A queryset containing the RECAPDocuments to be processed. - :type docs: Django Queryset - :param ocr_available: Whether OCR should be completed (True) or whether items - should simply be updated to have status OCR_NEEDED. - :type ocr_available: Bool - :param order_by: An optimization parameter. You may opt to order the - processing by 'small-first' or 'big-first'. - :type order_by: str - :param queue: The celery queue to send the content to. - :type queue: str - """ - docs = docs.exclude(filepath_local="") - if ocr_available: - # We're doing OCR. Only work with those items that require it. - docs = docs.filter(ocr_status=RECAPDocument.OCR_NEEDED) - else: - # Focus on the items that we don't know if they need OCR. 
- docs = docs.filter(ocr_status=None) - - if order_by is not None: - if order_by == "small-first": - docs = docs.order_by("page_count") - elif order_by == "big-first": - docs = docs.order_by("-page_count") - - count = docs.count() - throttle = CeleryThrottle(queue_name=queue) - for i, pk in enumerate(docs.values_list("pk", flat=True)): - throttle.maybe_wait() - extract_recap_pdf.apply_async( - (pk, ocr_available), priority=5, queue=queue - ) - if i % 1000 == 0: - msg = f"Sent {i + 1}/{count} tasks to celery so far." - logger.info(msg) - sys.stdout.write(f"\r{msg}") - sys.stdout.flush() - - def get_existing_docket( court_id: str, docket_number: str, appeal_from_str: str = "" ) -> Docket | None: @@ -466,3 +418,31 @@ def update_or_create_docket( setattr(docket, field, value) return docket + + +def scraped_citation_object_is_valid(citation_object: dict) -> bool: + """Validate Citation objects from `Site.extract_from_text` + + Check that the parsed `Citation.reporter` exists in reporters-db + and that the `Citation.type` matches the reporters-db type + + :param citation_object: dict got from `Site.extract_from_text` + :return: True if the parsed reporter and type match with reporters-db + False otherwise + """ + parsed_reporter = citation_object["reporter"] + try: + reporter = REPORTERS[parsed_reporter] + mapped_type = map_reporter_db_cite_type(reporter[0].get("cite_type")) + if mapped_type == citation_object["type"]: + return True + logger.error( + "Citation.type '%s' from `extract_from_text` does not match reporters-db type '%s' for reporter '%s'", + citation_object["type"], + mapped_type, + parsed_reporter, + ) + except KeyError: + logger.error("Parsed reporter '%s' does not exist", parsed_reporter) + + return False diff --git a/cl/search/api_serializers.py b/cl/search/api_serializers.py index f27053e95d..31752a79af 100644 --- a/cl/search/api_serializers.py +++ b/cl/search/api_serializers.py @@ -619,7 +619,7 @@ class Meta: ) -class OpinionClusterESResultSerializer(MainMetaMixin, DocumentSerializer): +class OpinionClusterBaseESResultSerializer(DocumentSerializer): """The serializer for OpinionCluster Search results.""" opinions = OpinionDocumentESResultSerializer( @@ -649,6 +649,20 @@ class Meta: ) +class OpinionClusterESResultSerializer( + OpinionClusterBaseESResultSerializer, MainMetaMixin +): + """The serializer for OpinionCluster Search results.""" + + +class OpinionClusterWebhookResultSerializer( + OpinionClusterBaseESResultSerializer +): + """The serializer class for OpinionCluster search Webhooks results.""" + + meta = BaseMetaDataSerializer(source="*", read_only=True) + + class PositionESResultSerializer(ChildMetaMixin, DocumentSerializer): """The serializer for Positions Search results.""" diff --git a/cl/search/api_utils.py b/cl/search/api_utils.py index 09afbd2653..f5c22e388a 100644 --- a/cl/search/api_utils.py +++ b/cl/search/api_utils.py @@ -19,6 +19,7 @@ do_es_api_query, limit_inner_hits, merge_unavailable_fields_on_parent_document, + set_child_docs_and_score, set_results_highlights, ) from cl.lib.scorched_utils import ExtraSolrInterface @@ -474,18 +475,7 @@ def process_results(self, results: Response) -> None: "v4", self.clean_data["highlight"], ) - for result in results: - child_result_objects = [] - if hasattr(result, "child_docs"): - for child_doc in result.child_docs: - child_result_objects.append( - defaultdict( - lambda: None, child_doc["_source"].to_dict() - ) - ) - result["child_docs"] = child_result_objects - # Include the ES main document score as bm25_score. 
- result["bm25_score"] = result.meta.score + set_child_docs_and_score(results, merge_score=True) if self.reverse: # If doing backward pagination, reverse the results of the current diff --git a/cl/search/api_views.py b/cl/search/api_views.py index 2a2ca2eeeb..df798edd6f 100644 --- a/cl/search/api_views.py +++ b/cl/search/api_views.py @@ -4,6 +4,7 @@ from rest_framework import pagination, permissions, response, viewsets from rest_framework.exceptions import NotFound from rest_framework.pagination import PageNumberPagination +from rest_framework.permissions import DjangoModelPermissionsOrAnonReadOnly from cl.api.api_permissions import V3APIPermission from cl.api.pagination import ESCursorPagination @@ -65,7 +66,10 @@ class OriginatingCourtInformationViewSet(viewsets.ModelViewSet): serializer_class = OriginalCourtInformationSerializer - permission_classes = [V3APIPermission] + permission_classes = [ + DjangoModelPermissionsOrAnonReadOnly, + V3APIPermission, + ] # Default cursor ordering key ordering = "-id" # Additional cursor ordering fields @@ -80,7 +84,10 @@ class OriginatingCourtInformationViewSet(viewsets.ModelViewSet): class DocketViewSet(LoggingMixin, viewsets.ModelViewSet): serializer_class = DocketSerializer filterset_class = DocketFilter - permission_classes = [V3APIPermission] + permission_classes = [ + DjangoModelPermissionsOrAnonReadOnly, + V3APIPermission, + ] ordering_fields = ( "id", "date_created", @@ -171,7 +178,10 @@ class RECAPDocumentViewSet( class CourtViewSet(LoggingMixin, viewsets.ModelViewSet): serializer_class = CourtSerializer filterset_class = CourtFilter - permission_classes = [V3APIPermission] + permission_classes = [ + DjangoModelPermissionsOrAnonReadOnly, + V3APIPermission, + ] ordering_fields = ( "id", "date_modified", @@ -191,7 +201,10 @@ class CourtViewSet(LoggingMixin, viewsets.ModelViewSet): class OpinionClusterViewSet(LoggingMixin, viewsets.ModelViewSet): serializer_class = OpinionClusterSerializer filterset_class = OpinionClusterFilter - permission_classes = [V3APIPermission] + permission_classes = [ + DjangoModelPermissionsOrAnonReadOnly, + V3APIPermission, + ] ordering_fields = ( "id", "date_created", @@ -216,7 +229,10 @@ class OpinionClusterViewSet(LoggingMixin, viewsets.ModelViewSet): class OpinionViewSet(LoggingMixin, viewsets.ModelViewSet): serializer_class = OpinionSerializer filterset_class = OpinionFilter - permission_classes = [V3APIPermission] + permission_classes = [ + DjangoModelPermissionsOrAnonReadOnly, + V3APIPermission, + ] ordering_fields = ( "id", "date_created", @@ -240,7 +256,10 @@ class OpinionViewSet(LoggingMixin, viewsets.ModelViewSet): class OpinionsCitedViewSet(LoggingMixin, viewsets.ModelViewSet): serializer_class = OpinionsCitedSerializer filterset_class = OpinionsCitedFilter - permission_classes = [V3APIPermission] + permission_classes = [ + DjangoModelPermissionsOrAnonReadOnly, + V3APIPermission, + ] # Default cursor ordering key ordering = "-id" # Additional cursor ordering fields diff --git a/cl/search/constants.py b/cl/search/constants.py index 333dfbca6c..f7e76cb8fb 100644 --- a/cl/search/constants.py +++ b/cl/search/constants.py @@ -110,10 +110,10 @@ "syllabus", ] SEARCH_MLT_OPINION_QUERY_FIELDS = [ - "procedural_history", - "posture", - "syllabus", - "text", + "procedural_history.exact", + "posture.exact", + "syllabus.exact", + "text.exact", ] # ES fields that are used for highlighting diff --git a/cl/search/filters.py b/cl/search/filters.py index d7f11e472c..6f1d6f6603 100644 --- a/cl/search/filters.py +++ 
b/cl/search/filters.py @@ -28,6 +28,10 @@ class CourtFilter(NoEmptyFilterSet): "cl.search.filters.DocketFilter", queryset=Docket.objects.all() ) jurisdiction = filters.MultipleChoiceFilter(choices=Court.JURISDICTIONS) + parent_court = filters.CharFilter( + field_name="parent_court__id", + lookup_expr="exact", + ) class Meta: model = Court diff --git a/cl/search/management/commands/sweep_indexer.py b/cl/search/management/commands/sweep_indexer.py index 4cc7b0bc4f..fe2bb96e79 100644 --- a/cl/search/management/commands/sweep_indexer.py +++ b/cl/search/management/commands/sweep_indexer.py @@ -359,7 +359,7 @@ def process_queryset( processed_count = 0 accumulated_chunk = 0 throttle = CeleryThrottle( - poll_interval=10, + poll_interval=settings.ELASTICSEARCH_SWEEP_INDEXER_POLL_INTERVAL, # type: ignore min_items=self.chunk_size, queue_name=self.queue, ) @@ -405,8 +405,17 @@ def process_queryset( ).set(queue=self.queue).apply_async() accumulated_chunk += len(chunk) + if not testing_mode: + # Wait for 1/ELASTICSEARCH_SWEEP_INDEXER_WAIT_BETWEEN_CHUNKS + # before processing the next chunk. + # e.g: With a poll interval of 10 and a chunk size of 10, + # it will wait for 0.1 seconds for every 10 documents processed, + # maintaining an index rate of 100 documents per second. + time.sleep( + 1 / settings.ELASTICSEARCH_SWEEP_INDEXER_WAIT_BETWEEN_CHUNKS # type: ignore + ) self.stdout.write( - "\rProcessed {}/{}, ({:.0%}), last {} PK indexed: {},".format( + "\rProcessed {}/{}, ({:.0%}), last {} ID indexed: {},".format( processed_count, count, processed_count * 1.0 / count, diff --git a/cl/search/models.py b/cl/search/models.py index 9d24542957..e4c13e116b 100644 --- a/cl/search/models.py +++ b/cl/search/models.py @@ -18,6 +18,7 @@ from django.urls import NoReverseMatch, reverse from django.utils import timezone from django.utils.encoding import force_str +from django.utils.functional import cached_property from django.utils.text import slugify from eyecite import get_citations from eyecite.tokenizers import HyperscanTokenizer @@ -2836,9 +2837,8 @@ async def acaption(self): else: caption += f", {citations[0]}" - cluster = await OpinionCluster.objects.aget(pk=self.pk) - docket = await Docket.objects.aget(id=cluster.docket_id) - court = await Court.objects.aget(pk=docket.court_id) + docket = await sync_to_async(lambda: self.docket)() + court = await sync_to_async(lambda: docket.court)() if docket.court_id != "scotus": court = re.sub(" ", " ", court.citation_string) # Strftime fails before 1900. Do it this way instead. @@ -2890,6 +2890,30 @@ def caption(self): caption += f" ({court} {year})" return caption + @property + def display_citation(self): + """Find favorite citation to display + + Identify the proper or favorite citation(s) to display on the front end + but don't wrap it together with a title + :return: The citation if applicable + """ + citation_list = [citation for citation in self.citations.all()] + citations = sorted(citation_list, key=sort_cites) + if not citations: + citation = "" + elif citations[0].type == Citation.NEUTRAL: + citation = citations[0] + elif ( + len(citations) >= 2 + and citations[0].type == Citation.WEST + and citations[1].type == Citation.LEXIS + ): + citation = f"{citations[0]}, {citations[1]}" + else: + citation = citations[0] + return citation + @property def citation_string(self): """Make a citation string, joined by commas""" @@ -3002,7 +3026,13 @@ async def aauthorities_with_data(self): The returned list is sorted by that citation count field. 
""" authorities_with_data = [] - async for authority in await self.aauthorities(): + authorities_base = await self.aauthorities() + authorities_qs = ( + authorities_base.prefetch_related("citations") + .select_related("docket__court") + .order_by("-citation_count", "-date_filed") + ) + async for authority in authorities_qs: authority.citation_depth = ( await get_citation_depth_between_clusters( citing_cluster_pk=self.pk, cited_cluster_pk=authority.pk @@ -3029,6 +3059,19 @@ def __str__(self) -> str: def get_absolute_url(self) -> str: return reverse("view_case", args=[self.pk, self.slug]) + @cached_property + def ordered_opinions(self): + # Fetch all sub-opinions ordered by ordering_key + sub_opinions = self.sub_opinions.all().order_by("ordering_key") + + # Check if there is more than one sub-opinion + if sub_opinions.count() > 1: + # Return only sub-opinions with an ordering key + return sub_opinions.exclude(ordering_key__isnull=True) + + # If there's only one or no sub-opinions, return the main opinion + return sub_opinions + def save( self, update_fields=None, diff --git a/cl/search/tests/tests.py b/cl/search/tests/tests.py index fe59be528d..891650c6e8 100644 --- a/cl/search/tests/tests.py +++ b/cl/search/tests/tests.py @@ -2,6 +2,7 @@ import io import os from datetime import date +from http import HTTPStatus from pathlib import Path from unittest import mock from urllib.parse import parse_qs @@ -1076,6 +1077,33 @@ def test_round_estimated_search_counts(self) -> None: with self.subTest(test=test, msg="Test estimated search counts."): self.assertEqual(simplify_estimated_count(test[0]), test[1]) + def test_avoid_wrapping_boosted_numbers_in_quotes(self) -> None: + """Confirm that numbers in boost queries are not wrapped in quotes + that makes the query to fail. + """ + search_params = { + "type": SEARCH_TYPES.ORAL_ARGUMENT, + "q": "Jose^3", + } + r = self.client.get( + reverse("show_results"), + search_params, + ) + self.assertNotIn("encountered an error", r.content.decode()) + + def test_raise_forbidden_error_on_depth_pagination(self) -> None: + """Confirm that a 403 Forbidden error is raised on depth pagination.""" + search_params = { + "type": SEARCH_TYPES.OPINION, + "q": "Lorem", + "page": 101, + } + r = self.client.get( + reverse("show_results"), + search_params, + ) + self.assertEqual(r.status_code, HTTPStatus.FORBIDDEN) + class SearchAPIV4CommonTest(ESIndexTestCase, TestCase): """Common tests for the Search API V4 endpoints.""" @@ -1143,6 +1171,7 @@ async def test_handle_unbalanced_parentheses(self) -> None: ) +@override_flag("ui_flag_for_o", False) class OpinionSearchFunctionalTest(AudioTestCase, BaseSeleniumTest): """ Test some of the primary search functionality of CL: searching opinions. 
@@ -1283,6 +1312,7 @@ def test_search_and_facet_docket_numbers(self) -> None: for result in search_results.find_elements(By.TAG_NAME, "article"): self.assertIn("1337", result.text) + @override_flag("ui_flag_for_o", False) @timeout_decorator.timeout(SELENIUM_TIMEOUT) def test_opinion_search_result_detail_page(self) -> None: # Dora navitages to CL and does a simple wild card search @@ -1643,35 +1673,6 @@ def test_search_query_saving(self) -> None: "Repeated query not marked as having hit cache", ) - # Force Solr use - @override_flag("oa-es-active", False) - @override_flag("r-es-active", False) - @override_flag("p-es-active", False) - @override_flag("o-es-active", False) - def test_search_query_saving_solr(self) -> None: - """Are queries saved when using solr search (do_search)""" - for query in self.searches: - url = f"{reverse('show_results')}?{query}" - self.client.get(url) - last_query = SearchQuery.objects.last() - expected_query = self.normalize_query(query, replace_space=True) - stored_query = self.normalize_query(last_query.get_params) - self.assertEqual( - expected_query, - stored_query, - f"Query was not saved properly. Expected {expected_query}, got {stored_query}", - ) - self.assertEqual( - last_query.engine, - SearchQuery.SOLR, - f"Saved wrong `engine` value, expected {SearchQuery.SOLR}", - ) - self.assertEqual( - last_query.source, - SearchQuery.WEBSITE, - self.source_error_message, - ) - def test_failed_es_search_queries(self) -> None: """Do we flag failed ElasticSearch queries properly?""" query = "type=r&q=contains/sproximity token" @@ -1772,36 +1773,6 @@ def test_failed_es_search_v3_api_queries(self) -> None: f"Saved wrong `engine` value, expected {SearchQuery.ELASTICSEARCH}", ) - @override_flag("oa-es-active", False) - @override_flag("oa-es-activate", False) - @override_flag("r-es-search-api-active", False) - @override_flag("p-es-active", False) - @override_flag("o-es-search-api-active", False) - def test_search_solr_api_v3_query_saving(self) -> None: - """Do we save queries on all V3 Search Solr endpoints""" - for query in self.base_searches: - url = f"{reverse("search-list", kwargs={"version": "v3"})}?{query}" - self.client.get(url) - # Compare parsed query strings; - last_query = SearchQuery.objects.last() - expected_query = self.normalize_query(query, replace_space=True) - stored_query = self.normalize_query(last_query.get_params) - self.assertEqual( - expected_query, - stored_query, - f"Query was not saved properly. Expected {expected_query}, got {stored_query}", - ) - self.assertEqual( - last_query.engine, - SearchQuery.SOLR, - f"Saved wrong `engine` value, expected {SearchQuery.ELASTICSEARCH}", - ) - self.assertEqual( - last_query.source, - SearchQuery.API, - self.source_error_message, - ) - class CaptionTest(TestCase): """Can we make good looking captions?""" diff --git a/cl/search/tests/tests_es_opinion.py b/cl/search/tests/tests_es_opinion.py index c7d9c2568d..0ccfb866b4 100644 --- a/cl/search/tests/tests_es_opinion.py +++ b/cl/search/tests/tests_es_opinion.py @@ -547,6 +547,23 @@ def test_o_results_api_pagination(self) -> None: for created_opinion in created_opinions: created_opinion.delete() + async def test_bad_syntax_error(self) -> None: + """Can we properly raise the ElasticServerError exception?""" + + # Bad syntax due to the / char in the query. 
+ params = { + "type": SEARCH_TYPES.OPINION, + "q": "This query contains bad/syntax query", + } + r = await self.async_client.get( + reverse("search-list", kwargs={"version": "v3"}), params + ) + self.assertEqual(r.status_code, HTTPStatus.INTERNAL_SERVER_ERROR) + self.assertEqual( + r.data["detail"], + "Internal Server Error. Please try again later or review your query.", + ) + class OpinionV4APISearchTest( OpinionSearchAPICommonTests, @@ -2253,6 +2270,8 @@ def test_uses_exact_version_for_case_name_field(self) -> None: cluster_2.delete() +@override_flag("ui_flag_for_o", False) +@override_settings(RELATED_MLT_MINTF=1) class RelatedSearchTest( ESIndexTestCase, CourtTestCase, PeopleTestCase, SearchTestCase, TestCase ): @@ -2357,6 +2376,9 @@ def test_more_like_this_opinion(self) -> None: < r.content.decode().index("/opinion/%i/" % expected_second_pk), msg="'Howard v. Honda' should come AFTER 'case name cluster 3'.", ) + # Confirm that results contain a snippet + self.assertIn("plain", r.content.decode()) + # Confirm "related to" cluster legend is within the results' header. h2_element = html.fromstring(r.content.decode()).xpath( '//h2[@id="result-count"]' diff --git a/cl/search/tests/tests_es_person.py b/cl/search/tests/tests_es_person.py index 6c59b01cf6..eb82285286 100644 --- a/cl/search/tests/tests_es_person.py +++ b/cl/search/tests/tests_es_person.py @@ -616,6 +616,7 @@ async def test_results_api_fields(self) -> None: search_params = { "type": SEARCH_TYPES.PEOPLE, "q": f"id:{self.person_2.pk} AND nomination_process:(U.S. Senate)", + "order_by": "score desc", } # API r = await self._test_api_results_count(search_params, 1, "API fields") @@ -662,6 +663,7 @@ def test_results_api_empty_fields(self) -> None: search_params = { "type": SEARCH_TYPES.PEOPLE, "q": f"id:{person.pk}", + "order_by": "score desc", } # API r = async_to_sync(self._test_api_results_count)( @@ -869,6 +871,7 @@ async def test_results_api_highlighted_fields(self) -> None: "q": f"id:{self.person_2.pk} name:Sheindlin dob_city:Brookyln nomination_process:(U.S. Senate) political_affiliation:Democratic", "school": "New York Law School", "dob_state": "NY", + "order_by": "score desc", } # Judged Search type HL disabled. 
diff --git a/cl/search/views.py b/cl/search/views.py index 10f3f4b7f9..e545b4a9c7 100644 --- a/cl/search/views.py +++ b/cl/search/views.py @@ -729,6 +729,7 @@ def do_es_search( query_citation = None facet_fields = [] missing_citations_str = [] + error = True search_form = SearchForm(get_params, is_es_form=True, courts=courts) match get_params.get("type", SEARCH_TYPES.OPINION): @@ -827,8 +828,6 @@ def do_es_search( cd if not error else {"type": cd["type"]}, search_form, ) - else: - error = True courts, court_count_human, court_count = merge_form_with_courts( courts, search_form diff --git a/cl/settings/misc.py b/cl/settings/misc.py index fb30282ab4..6448bfcac6 100644 --- a/cl/settings/misc.py +++ b/cl/settings/misc.py @@ -63,3 +63,8 @@ CAP_R2_ACCESS_KEY_ID = env("CAP_R2_ACCESS_KEY_ID", default="") CAP_R2_SECRET_ACCESS_KEY = env("CAP_R2_SECRET_ACCESS_KEY", default="") CAP_R2_BUCKET_NAME = env("CAP_R2_BUCKET_NAME", default="cap-static") + +# Webhooks +WEBHOOK_V1_DEPRECATION_DATE = env( + "WEBHOOK_V1_DEPRECATION_DATE", default="2024-11-18" +) diff --git a/cl/settings/project/corpus_importer.py b/cl/settings/project/corpus_importer.py index dc81d21978..f2f375845f 100644 --- a/cl/settings/project/corpus_importer.py +++ b/cl/settings/project/corpus_importer.py @@ -1,8 +1,8 @@ import environ env = environ.FileAwareEnv() -IQUERY_PROBE_DAEMON_ENABLED = env.int( - "IQUERY_PROBE_DAEMON_ENABLED", default=False +IQUERY_CASE_PROBE_DAEMON_ENABLED = env.bool( + "IQUERY_CASE_PROBE_DAEMON_ENABLED", default=False ) IQUERY_PROBE_ITERATIONS = env.int("IQUERY_PROBE_ITERATIONS", default=9) IQUERY_PROBE_WAIT = env.int("IQUERY_PROBE_WAIT", default=300) diff --git a/cl/settings/third_party/elasticsearch.py b/cl/settings/third_party/elasticsearch.py index c62e575d2e..7a1ec6b779 100644 --- a/cl/settings/third_party/elasticsearch.py +++ b/cl/settings/third_party/elasticsearch.py @@ -264,6 +264,12 @@ ELASTICSEARCH_SWEEP_INDEXER_HEADS_RATE = env( "ELASTICSEARCH_SWEEP_INDEXER_HEADS_RATE", default=60 ) +ELASTICSEARCH_SWEEP_INDEXER_POLL_INTERVAL = env( + "ELASTICSEARCH_SWEEP_INDEXER_POLL_INTERVAL", default=10 +) +ELASTICSEARCH_SWEEP_INDEXER_WAIT_BETWEEN_CHUNKS = env( + "ELASTICSEARCH_SWEEP_INDEXER_WAIT_BETWEEN_CHUNKS", default=3 +) ELASTICSEARCH_SWEEP_INDEXER_MODELS = env( "ELASTICSEARCH_SWEEP_INDEXER_MODELS", default=[ diff --git a/cl/settings/third_party/rest_framework.py b/cl/settings/third_party/rest_framework.py index 9e4365b4ce..2c0f04163e 100644 --- a/cl/settings/third_party/rest_framework.py +++ b/cl/settings/third_party/rest_framework.py @@ -29,8 +29,12 @@ "OVERRIDE_THROTTLE_RATES": { # Throttling down. # Unresponsive + "projecttesting": "1/hour", "SAGW": "1/hour", # Bounced + "riwiko8259": "1/hour", + "xicaro7027": "1/hour", + "nayibij851": "1/hour", "testname2024": "1/hour", "cadebe2258": "1/hour", # Disposable email diff --git a/cl/tests/cases.py b/cl/tests/cases.py index 2f0db20e88..8b23dea418 100644 --- a/cl/tests/cases.py +++ b/cl/tests/cases.py @@ -270,6 +270,11 @@ async def _compare_field( set(meta_expected_value.keys()), f"The keys in field '{meta_field}' do not match.", ) + for score_value in meta_value.values(): + self.assertIsNotNone( + score_value, f"The score value can't be None." 
+ ) + else: self.assertEqual( meta_value, @@ -412,7 +417,7 @@ def _test_page_variables( return next_page, previous_page, current_page -class RECAPAlertsAssertions: +class SearchAlertsAssertions: @staticmethod def get_html_content_from_email(email_content): @@ -506,7 +511,9 @@ def _count_alert_hits_and_child_hits( case_text_cleaned = self.clean_case_title(case_text) if case_title == case_text_cleaned: child_hit_count = len( - case.xpath("following-sibling::ul[1]/li/a") + case.xpath( + "following-sibling::ul[1]/li/a | following-sibling::ul[1]/li/strong" + ) ) self.assertEqual( child_hit_count, @@ -535,8 +542,8 @@ def extract_child_descriptions(case_item): child_documents = case_item.xpath("./following-sibling::ul[1]/li") results = [] for li in child_documents: - a_tag = li.xpath(".//a")[0] - full_text = a_tag.text_content() + child_tag = li.xpath(".//a | .//strong")[0] + full_text = child_tag.text_content() first_part = full_text.split("\u2014")[0].strip() results.append(first_part) @@ -563,6 +570,7 @@ def _count_webhook_hits_and_child_hits( expected_hits, case_title, expected_child_hits, + nested_field="recap_documents", ): """Confirm the following assertions for the search alert webhook: - An specific alert webhook was triggered. @@ -570,6 +578,8 @@ def _count_webhook_hits_and_child_hits( - The specified case contains the expected number of child hits. """ + matched_alert_name = None + matched_case_title = None for webhook in webhooks: if webhook["payload"]["alert"]["name"] == alert_title: webhook_cases = webhook["payload"]["results"] @@ -579,14 +589,21 @@ def _count_webhook_hits_and_child_hits( msg=f"Did not get the right number of hits for the alert %s. " % alert_title, ) + matched_alert_name = True for case in webhook["payload"]["results"]: if case_title == strip_tags(case["caseName"]): + matched_case_title = True + if nested_field is None: + self.assertTrue(nested_field not in case) + continue self.assertEqual( - len(case["recap_documents"]), + len(case[nested_field]), expected_child_hits, msg=f"Did not get the right number of child documents for the case %s. 
" % case_title, ) + self.assertTrue(matched_alert_name, msg="Alert name didn't match") + self.assertTrue(matched_case_title, msg="Case title didn't match") def _count_percolator_webhook_hits_and_child_hits( self, @@ -651,6 +668,7 @@ def _assert_webhook_hit_hl( field_name, hl_expected, child_field, + nested_field="recap_documents", ): """Assert Hl in webhook fields.""" for webhook in webhooks: @@ -659,10 +677,10 @@ def _assert_webhook_hit_hl( if child_field: self.assertNotIn( "score", - hit["recap_documents"][0]["meta"], + hit[nested_field][0]["meta"], msg="score shouldn't be present on webhook nested documents", ) - child_field_content = hit["recap_documents"][0][field_name] + child_field_content = hit[nested_field][0][field_name] self.assertIn( hl_expected, child_field_content, diff --git a/cl/tests/test_feeds.py b/cl/tests/test_feeds.py index a9fb9c8c7c..17b42ab235 100644 --- a/cl/tests/test_feeds.py +++ b/cl/tests/test_feeds.py @@ -64,7 +64,8 @@ def test_feeds_page_shows_jurisdiction_links(self) -> None: link.get_attribute("href"), f"{self.live_server_url}/feed/court/{court.pk}/", ) - link.click() + with self.wait_for_page_load(timeout=10): + link.click() print("clicked...", end=" ") self.assertIn( 'feed xml:lang="en-us" xmlns="http://www.w3.org/2005/Atom"', diff --git a/cl/users/api_views.py b/cl/users/api_views.py index f35bad827a..865188911d 100644 --- a/cl/users/api_views.py +++ b/cl/users/api_views.py @@ -10,7 +10,12 @@ from rest_framework.viewsets import ModelViewSet from cl.api.api_permissions import IsOwner -from cl.api.models import Webhook, WebhookEvent, WebhookEventType +from cl.api.models import ( + Webhook, + WebhookEvent, + WebhookEventType, + WebhookVersions, +) from cl.api.tasks import send_test_webhook_event from cl.users.filters import WebhookEventViewFilter from cl.users.forms import WebhookForm @@ -115,50 +120,59 @@ def test_webhook(self, request, *args, **kwargs): webhook = self.get_object() event_type = webhook.event_type + version = webhook.version match event_type: case WebhookEventType.DOCKET_ALERT: event_template = loader.get_template( "includes/docket_alert_webhook_dummy.txt" ) - event_dummy_content = event_template.render().strip() + event_dummy_content = event_template.render( + {"webhook_version": version} + ).strip() event_curl_template = loader.get_template( "includes/docket_alert_webhook_dummy_curl.txt" ) event_dummy_curl = event_curl_template.render( - {"endpoint_url": webhook.url} + {"endpoint_url": webhook.url, "webhook_version": version} ).strip() case WebhookEventType.SEARCH_ALERT: event_template = loader.get_template( "includes/search_alert_webhook_dummy.txt" ) - event_dummy_content = event_template.render().strip() + event_dummy_content = event_template.render( + {"webhook_version": version} + ).strip() event_curl_template = loader.get_template( "includes/search_alert_webhook_dummy_curl.txt" ) event_dummy_curl = event_curl_template.render( - {"endpoint_url": webhook.url} + {"endpoint_url": webhook.url, "webhook_version": version} ).strip() case WebhookEventType.OLD_DOCKET_ALERTS_REPORT: event_template = loader.get_template( "includes/old_alerts_report_webhook_dummy.txt" ) - event_dummy_content = event_template.render().strip() + event_dummy_content = event_template.render( + {"webhook_version": version} + ).strip() event_curl_template = loader.get_template( "includes/old_alerts_report_webhook_dummy_curl.txt" ) event_dummy_curl = event_curl_template.render( - {"endpoint_url": webhook.url} + {"endpoint_url": webhook.url, "webhook_version": version} 
).strip() case WebhookEventType.RECAP_FETCH: event_template = loader.get_template( "includes/recap_fetch_webhook_dummy.txt" ) - event_dummy_content = event_template.render().strip() + event_dummy_content = event_template.render( + {"webhook_version": version} + ).strip() event_curl_template = loader.get_template( "includes/recap_fetch_webhook_dummy_curl.txt" ) event_dummy_curl = event_curl_template.render( - {"endpoint_url": webhook.url} + {"endpoint_url": webhook.url, "webhook_version": version} ).strip() case _: # Webhook types with no support yet. @@ -193,6 +207,38 @@ def test_webhook(self, request, *args, **kwargs): status=HTTPStatus.OK, ) + @action(detail=False, methods=["get"]) + def get_available_versions(self, request, *args, **kwargs): + """Render the webhook version field containing available versions for + the select event type. + """ + + event_type = request.GET.get("event_type") + htmx_template = "includes/webhooks_htmx/webhook-version-select.html" + context = {"version_choices": []} + if not event_type: + return render(request, htmx_template, context) + + # Get user webhooks for this event type + existing_webhooks = Webhook.objects.filter( + user=request.user, event_type=event_type + ) + used_versions = set( + existing_webhooks.values_list("version", flat=True) + ) + # Get available webhook versions, excluding used ones + version_choices = [ + (v, label) + for v, label in WebhookVersions.choices + if v not in used_versions + ] + context["version_choices"] = version_choices + return render( + request, + htmx_template, + context, + ) + class WebhookEventViewSet(ModelViewSet): """ diff --git a/cl/users/forms.py b/cl/users/forms.py index 022b08a632..7ee724b104 100644 --- a/cl/users/forms.py +++ b/cl/users/forms.py @@ -18,7 +18,7 @@ from localflavor.us.forms import USStateField, USZipCodeField from localflavor.us.us_states import STATE_CHOICES -from cl.api.models import Webhook, WebhookEventType +from cl.api.models import Webhook, WebhookEventType, WebhookVersions from cl.lib.types import EmailType from cl.users.models import UserProfile from cl.users.utils import emails @@ -352,18 +352,31 @@ def __init__(self, update=None, request_user=None, *args, **kwargs): for i in WebhookEventType.choices if i[0] == self.instance.event_type ] + instance_version = [ + i + for i in WebhookVersions.choices + if i[0] == self.instance.version + ] self.fields["event_type"].choices = instance_type self.fields["event_type"].widget.attrs["readonly"] = True + self.fields["version"].choices = instance_version + self.fields["version"].widget.attrs["readonly"] = True + else: # If we're creating a new webhook, show the webhook type options # that are available for the user. One webhook for each event type # is allowed. 
webhooks = request_user.webhooks.all() - used_types = [w.event_type for w in webhooks] - available_choices = [ - i for i in WebhookEventType.choices if i[0] not in used_types + used_version_types = [ + f"{w.event_type}_{w.version}" for w in webhooks ] - self.fields["event_type"].choices = available_choices + available_type_choices = { + w_type + for w_type in WebhookEventType.choices + for w_version in WebhookVersions.choices + if f"{w_type[0]}_{w_version[0]}" not in used_version_types + } + self.fields["event_type"].choices = available_type_choices class Meta: model = Webhook @@ -371,6 +384,7 @@ class Meta: "url", "event_type", "enabled", + "version", ) widgets = { "event_type": forms.Select( @@ -382,4 +396,7 @@ class Meta: "enabled": forms.CheckboxInput( attrs={"class": "webhook-checkbox"}, ), + "version": forms.Select( + attrs={"class": "form-control"}, + ), } diff --git a/cl/users/templates/includes/docket_alert_webhook_dummy.txt b/cl/users/templates/includes/docket_alert_webhook_dummy.txt index d49c02e207..870b1ba068 100644 --- a/cl/users/templates/includes/docket_alert_webhook_dummy.txt +++ b/cl/users/templates/includes/docket_alert_webhook_dummy.txt @@ -65,7 +65,7 @@ ] }, "webhook":{ - "version":1, + "version":{{ webhook_version }}, "event_type":1, "date_created":"2022-10-11T14:21:40.855097-07:00", "deprecation_date":null diff --git a/cl/users/templates/includes/docket_alert_webhook_dummy_curl.txt b/cl/users/templates/includes/docket_alert_webhook_dummy_curl.txt index 9b95f40133..c5447d55ee 100644 --- a/cl/users/templates/includes/docket_alert_webhook_dummy_curl.txt +++ b/cl/users/templates/includes/docket_alert_webhook_dummy_curl.txt @@ -69,7 +69,7 @@ curl --request POST \ ] }, "webhook":{ - "version":1, + "version":{{ webhook_version }}, "event_type":1, "date_created":"2022-10-11T14:21:40.855097-07:00", "deprecation_date":null diff --git a/cl/users/templates/includes/old_alerts_report_webhook_dummy.txt b/cl/users/templates/includes/old_alerts_report_webhook_dummy.txt index b70c2b2566..91a7107d0b 100644 --- a/cl/users/templates/includes/old_alerts_report_webhook_dummy.txt +++ b/cl/users/templates/includes/old_alerts_report_webhook_dummy.txt @@ -22,7 +22,7 @@ ] }, "webhook":{ - "version":1, + "version":{{ webhook_version }}, "event_type":4, "date_created":"2022-12-28T14:21:40.855097-07:00", "deprecation_date":null diff --git a/cl/users/templates/includes/old_alerts_report_webhook_dummy_curl.txt b/cl/users/templates/includes/old_alerts_report_webhook_dummy_curl.txt index c737326672..e2d6d7c2ce 100644 --- a/cl/users/templates/includes/old_alerts_report_webhook_dummy_curl.txt +++ b/cl/users/templates/includes/old_alerts_report_webhook_dummy_curl.txt @@ -26,7 +26,7 @@ curl --request POST \ ] }, "webhook":{ - "version":1, + "version":{{ webhook_version }}, "event_type":4, "date_created":"2022-12-28T14:21:40.855097-07:00", "deprecation_date":null diff --git a/cl/users/templates/includes/recap_fetch_webhook_dummy.txt b/cl/users/templates/includes/recap_fetch_webhook_dummy.txt index 91b3130cb8..c42684e2e7 100644 --- a/cl/users/templates/includes/recap_fetch_webhook_dummy.txt +++ b/cl/users/templates/includes/recap_fetch_webhook_dummy.txt @@ -21,7 +21,7 @@ "show_list_of_member_cases":false }, "webhook":{ - "version":1, + "version":{{ webhook_version }}, "event_type":3, "date_created":"2024-01-06T14:21:40.855097-07:00", "deprecation_date":null diff --git a/cl/users/templates/includes/recap_fetch_webhook_dummy_curl.txt b/cl/users/templates/includes/recap_fetch_webhook_dummy_curl.txt index 
8fa67e1257..2e25a45d38 100644 --- a/cl/users/templates/includes/recap_fetch_webhook_dummy_curl.txt +++ b/cl/users/templates/includes/recap_fetch_webhook_dummy_curl.txt @@ -25,7 +25,7 @@ curl --request POST \ "show_list_of_member_cases":false }, "webhook":{ - "version":1, + "version":{{ webhook_version }}, "event_type":3, "date_created":"2024-01-06T14:21:40.855097-07:00", "deprecation_date":null diff --git a/cl/users/templates/includes/search_alert_webhook_dummy.txt b/cl/users/templates/includes/search_alert_webhook_dummy.txt index 22e68940a0..0fe3076d17 100644 --- a/cl/users/templates/includes/search_alert_webhook_dummy.txt +++ b/cl/users/templates/includes/search_alert_webhook_dummy.txt @@ -63,7 +63,7 @@ } }, "webhook":{ - "version":1, + "version":{{ webhook_version }}, "event_type":2, "date_created":"2022-12-02T23:42:34.894411+00:00", "deprecation_date":"None" diff --git a/cl/users/templates/includes/search_alert_webhook_dummy_curl.txt b/cl/users/templates/includes/search_alert_webhook_dummy_curl.txt index ce1557976f..97b0786f18 100644 --- a/cl/users/templates/includes/search_alert_webhook_dummy_curl.txt +++ b/cl/users/templates/includes/search_alert_webhook_dummy_curl.txt @@ -67,7 +67,7 @@ curl --request POST \ } }, "webhook":{ - "version":1, + "version":{{ webhook_version }}, "event_type":2, "date_created":"2022-12-02T23:42:34.894411+00:00", "deprecation_date":"None" diff --git a/cl/users/templates/includes/webhook-event-detail.html b/cl/users/templates/includes/webhook-event-detail.html index 20f631fb53..9f70262daa 100644 --- a/cl/users/templates/includes/webhook-event-detail.html +++ b/cl/users/templates/includes/webhook-event-detail.html @@ -1,4 +1,5 @@ {% extends "profile/webhooks_base.html" %} +{% load extras %} {% load static %} {% load waffle_tags %} {% load humanize %} @@ -13,11 +14,11 @@

    Webhook Event Details{% if webhook_event.debug %} (

    {% if webhook_event.webhook.enabled %} Enabled {% else %} Disabled {% endif %}

    {{ webhook_event.webhook.get_event_type_display }}

    {{ webhook_event.event_id }}

    -

    {{ webhook_event.date_created }}

    +

    {{ webhook_event.date_created|datetime_in_utc }}

    {% if webhook_event.status_code %}{{ webhook_event.status_code }} {{ webhook_event.get_status_code_display }} {% else %}-{% endif %}

    {{ webhook_event.get_event_status_display }}

    {{ webhook_event.retry_counter }}

    -

    {% if not webhook_event.debug %}{% if webhook_event.next_retry_date %}{{ webhook_event.next_retry_date }}{% else %}-{% endif %}{% else %}Test events will not be retried{% endif %}

    +

    {% if not webhook_event.debug %}{% if webhook_event.next_retry_date %}{{ webhook_event.next_retry_date|datetime_in_utc }}{% else %}-{% endif %}{% else %}Test events will not be retried{% endif %}

    diff --git a/cl/users/templates/includes/webhooks_htmx/webhook-logs-list.html b/cl/users/templates/includes/webhooks_htmx/webhook-logs-list.html index dc022dff94..a9f8596832 100644 --- a/cl/users/templates/includes/webhooks_htmx/webhook-logs-list.html +++ b/cl/users/templates/includes/webhooks_htmx/webhook-logs-list.html @@ -1,3 +1,4 @@ +{% load extras %} {% load widget_tweaks %} {% if results %} {% for webhook in results %} @@ -21,11 +22,11 @@ {% endif %}

    - {{ webhook.date_created }} + {{ webhook.date_created|datetime_in_utc }} {% if not webhook.debug %} {% if webhook.next_retry_date %} - {{ webhook.next_retry_date }} + {{ webhook.next_retry_date|datetime_in_utc }} {% else %} - {% endif %} diff --git a/cl/users/templates/includes/webhooks_htmx/webhook-version-select.html b/cl/users/templates/includes/webhooks_htmx/webhook-version-select.html new file mode 100644 index 0000000000..72639b3b10 --- /dev/null +++ b/cl/users/templates/includes/webhooks_htmx/webhook-version-select.html @@ -0,0 +1,8 @@ + diff --git a/cl/users/templates/includes/webhooks_htmx/webhooks-form-common.html b/cl/users/templates/includes/webhooks_htmx/webhooks-form-common.html index a5166284c6..e5c8f5709c 100644 --- a/cl/users/templates/includes/webhooks_htmx/webhooks-form-common.html +++ b/cl/users/templates/includes/webhooks_htmx/webhooks-form-common.html @@ -19,7 +19,22 @@
    - {{ webhook_form.event_type }} + {% with option_count=webhook_form.event_type.field.choices|length %} + + {% endwith %} {% if webhook_form.event_type.errors %}

    {% for error in webhook_form.event_type.errors %} @@ -31,6 +46,32 @@

    +
    +
    + +
    +
    + +
    + {% if webhook_form.version.errors %} +

    + {% for error in webhook_form.version.errors %} + {{ error|escape }} + {% endfor %} +

    + {% endif %} +
    +
    +
    +
    {{ webhook_form.enabled }} diff --git a/cl/users/templates/includes/webhooks_htmx/webhooks-list.html b/cl/users/templates/includes/webhooks_htmx/webhooks-list.html index d30891c455..be7ef17c35 100644 --- a/cl/users/templates/includes/webhooks_htmx/webhooks-list.html +++ b/cl/users/templates/includes/webhooks_htmx/webhooks-list.html @@ -6,6 +6,11 @@ {{ webhook.get_event_type_display }}

    + +

    + {{ webhook.get_version_display }} +

    +

    {% if webhook.enabled is True %} diff --git a/cl/users/templates/includes/webhooks_htmx/webhooks-test-webhook.html b/cl/users/templates/includes/webhooks_htmx/webhooks-test-webhook.html index 2d35dda8ad..b5db8fed00 100644 --- a/cl/users/templates/includes/webhooks_htmx/webhooks-test-webhook.html +++ b/cl/users/templates/includes/webhooks_htmx/webhooks-test-webhook.html @@ -1,5 +1,7 @@

    -

    {{ webhook.url }}

    +

    {{ webhook.url }}

    +

    {{ webhook.get_event_type_display }}

    +

    {{ webhook.get_version_display }}
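
A minimal usage sketch for the two new scraper management commands added in this diff, assuming they are run through manage.py as usual; the Juriscraper module path, dates, and status below are illustrative values, not taken from the diff:

    # Re-run Site.extract_from_text over court-website opinions in a date range
    python manage.py update_from_text \
        --courts juriscraper.opinions.united_states.state.vt \
        --date-filed-gte 2020-01-01 \
        --date-filed-lte 2020-12-31 \
        --cluster-status Published

    # Or target specific opinions, in which case the filters are ignored
    python manage.py update_from_text --courts juriscraper.opinions.united_states.state.vt --opinion-ids 100 101

    # Refresh the materialized view behind the MVLatestOpinion admin page
    python manage.py refresh_scrapers_status_view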