From e674781d72d545f2291a865bd592c2017fa2b863 Mon Sep 17 00:00:00 2001 From: Gianfranco Rossi Date: Wed, 6 Nov 2024 08:55:35 -0500 Subject: [PATCH 1/6] feat(scrapers.admin): create materialized view and admin page Concept for https://github.com/freelawproject/courtlistener/issues/3950 For this to work, the materialized view must be created directly on the DB --- cl/scrapers/admin.py | 68 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/cl/scrapers/admin.py b/cl/scrapers/admin.py index 694fb79d17..c7cc689a6e 100644 --- a/cl/scrapers/admin.py +++ b/cl/scrapers/admin.py @@ -1,4 +1,5 @@ from django.contrib import admin +from django.db import models from cl.scrapers.models import ( PACERFreeDocumentLog, @@ -29,3 +30,70 @@ class PACERFreeDocumentRowAdmin(admin.ModelAdmin): admin.site.register(UrlHash) + + +class MVLatestOpinions(models.Model): + """ + Model linked to materialized view for monitoring scrapers + + Must use `REFRESH MATERIALIZED VIEW scrapers_mv_latest_opinion` + periodically + """ + + query = """ + CREATE MATERIALIZED VIEW + scrapers_mv_latest_opinion + AS + ( + SELECT + court_id, + max(so.date_created) as latest_creation_date, + (now() - max(so.date_created))::text as time_since + FROM + ( + SELECT id, court_id + FROM search_docket + WHERE court_id IN ( + SELECT id + FROM search_court + /* + Only check courts with scrapers in use + */ + WHERE + has_opinion_scraper + AND in_use + ) + ) sd + INNER JOIN + (SELECT id, docket_id FROM search_opinioncluster) soc ON soc.docket_id = sd.id + INNER JOIN + search_opinion so ON so.cluster_id = soc.id + GROUP BY + sd.court_id + HAVING + /* + Only return results for courts with no updates in a week + */ + now() - max(so.date_created) > interval '7 days' + ORDER BY + 2 DESC + ) + """ + # a django model must have a primary key + court_id = models.TextField(primary_key=True) + latest_creation_date = models.DateField() + time_since = models.TextField() + + class Meta: + managed = 
False # ignore this model in migrations + db_table = "scrapers_mv_latest_opinion" + + +@admin.register(MVLatestOpinions) +class MVLatestOpinionsAdmin(admin.ModelAdmin): + """Admin page to look at the latest opinion for each court + + Use this to monitor silently failing scrapers + """ + + list_display = ["court_id", "latest_creation_date", "time_since"] From 3337f5bfb273cc2ba3846310f828e3962eeae602 Mon Sep 17 00:00:00 2001 From: Gianfranco Rossi Date: Mon, 25 Nov 2024 12:21:51 -0500 Subject: [PATCH 2/6] feat(scrapers.admin): create admin page for a scraper status page - Includes a migration file for the materialized view - MV will have to be refreshed manually or via a cronjob - MV considers only courts that have an active scraper, and that have no updates in a week --- cl/scrapers/admin.py | 59 ++++------------ .../0004_create_mv_latest_opinion.py | 69 +++++++++++++++++++ .../0004_create_mv_latest_opinion.sql | 49 +++++++++++++ ...004_create_mv_latest_opinion_customers.sql | 49 +++++++++++++ 4 files changed, 181 insertions(+), 45 deletions(-) create mode 100644 cl/scrapers/migrations/0004_create_mv_latest_opinion.py create mode 100644 cl/scrapers/migrations/0004_create_mv_latest_opinion.sql create mode 100644 cl/scrapers/migrations/0004_create_mv_latest_opinion_customers.sql diff --git a/cl/scrapers/admin.py b/cl/scrapers/admin.py index c7cc689a6e..56ec54df03 100644 --- a/cl/scrapers/admin.py +++ b/cl/scrapers/admin.py @@ -32,68 +32,37 @@ class PACERFreeDocumentRowAdmin(admin.ModelAdmin): admin.site.register(UrlHash) -class MVLatestOpinions(models.Model): +class MVLatestOpinion(models.Model): """ Model linked to materialized view for monitoring scrapers + The SQL for creating the view is on it's migration file. 
+ Must use `REFRESH MATERIALIZED VIEW scrapers_mv_latest_opinion` periodically """ - query = """ - CREATE MATERIALIZED VIEW - scrapers_mv_latest_opinion - AS - ( - SELECT - court_id, - max(so.date_created) as latest_creation_date, - (now() - max(so.date_created))::text as time_since - FROM - ( - SELECT id, court_id - FROM search_docket - WHERE court_id IN ( - SELECT id - FROM search_court - /* - Only check courts with scrapers in use - */ - WHERE - has_opinion_scraper - AND in_use - ) - ) sd - INNER JOIN - (SELECT id, docket_id FROM search_opinioncluster) soc ON soc.docket_id = sd.id - INNER JOIN - search_opinion so ON so.cluster_id = soc.id - GROUP BY - sd.court_id - HAVING - /* - Only return results for courts with no updates in a week - */ - now() - max(so.date_created) > interval '7 days' - ORDER BY - 2 DESC - ) - """ # a django model must have a primary key court_id = models.TextField(primary_key=True) - latest_creation_date = models.DateField() + latest_creation_date = models.DateTimeField() time_since = models.TextField() + view_last_updated = models.DateTimeField() class Meta: - managed = False # ignore this model in migrations + managed = False db_table = "scrapers_mv_latest_opinion" -@admin.register(MVLatestOpinions) -class MVLatestOpinionsAdmin(admin.ModelAdmin): +@admin.register(MVLatestOpinion) +class MVLatestOpinionAdmin(admin.ModelAdmin): """Admin page to look at the latest opinion for each court Use this to monitor silently failing scrapers """ - list_display = ["court_id", "latest_creation_date", "time_since"] + list_display = [ + "court_id", + "latest_creation_date", + "time_since", + "view_last_updated", + ] diff --git a/cl/scrapers/migrations/0004_create_mv_latest_opinion.py b/cl/scrapers/migrations/0004_create_mv_latest_opinion.py new file mode 100644 index 0000000000..4570c75d97 --- /dev/null +++ b/cl/scrapers/migrations/0004_create_mv_latest_opinion.py @@ -0,0 +1,69 @@ +# Generated by Django 5.1.2 on 2024-11-25 15:27 + +from django.db import 
migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("scrapers", "0003_delete_errorlog"), + ] + + operations = [ + migrations.CreateModel( + name="MVLatestOpinion", + fields=[ + ( + "court_id", + models.TextField(primary_key=True, serialize=False), + ), + ("latest_creation_date", models.DateTimeField()), + ("time_since", models.TextField()), + ("view_last_updated", models.DateTimeField()), + ], + options={ + "db_table": "scrapers_mv_latest_opinion", + "managed": False, + }, + ), + migrations.RunSQL(""" + CREATE MATERIALIZED VIEW IF NOT EXISTS + scrapers_mv_latest_opinion + AS + ( + SELECT + court_id, + max(so.date_created) as latest_creation_date, + DATE_TRUNC('minutes', (now() - max(so.date_created)))::text as time_since, + now() as view_last_updated + FROM + ( + SELECT id, court_id + FROM search_docket + WHERE court_id IN ( + SELECT id + FROM search_court + /* + Only check courts with scrapers in use + */ + WHERE + has_opinion_scraper + AND in_use + ) + ) sd + INNER JOIN + (SELECT id, docket_id FROM search_opinioncluster) soc ON soc.docket_id = sd.id + INNER JOIN + search_opinion so ON so.cluster_id = soc.id + GROUP BY + sd.court_id + HAVING + /* + Only return results for courts with no updates in a week + */ + now() - max(so.date_created) > interval '7 days' + ORDER BY + 2 DESC + ) + """) + ] diff --git a/cl/scrapers/migrations/0004_create_mv_latest_opinion.sql b/cl/scrapers/migrations/0004_create_mv_latest_opinion.sql new file mode 100644 index 0000000000..45c212298e --- /dev/null +++ b/cl/scrapers/migrations/0004_create_mv_latest_opinion.sql @@ -0,0 +1,49 @@ +BEGIN; +-- +-- Create model MVLatestOpinion +-- +-- (no-op) +-- +-- Raw SQL operation +-- + + CREATE MATERIALIZED VIEW IF NOT EXISTS + scrapers_mv_latest_opinion + AS + ( + SELECT + court_id, + max(so.date_created) as latest_creation_date, + DATE_TRUNC('minutes', (now() - max(so.date_created)))::text as time_since, + now() as view_last_updated + FROM + ( + SELECT id, 
court_id + FROM search_docket + WHERE court_id IN ( + SELECT id + FROM search_court + /* + Only check courts with scrapers in use + */ + WHERE + has_opinion_scraper + AND in_use + ) + ) sd + INNER JOIN + (SELECT id, docket_id FROM search_opinioncluster) soc ON soc.docket_id = sd.id + INNER JOIN + search_opinion so ON so.cluster_id = soc.id + GROUP BY + sd.court_id + HAVING + /* + Only return results for courts with no updates in a week + */ + now() - max(so.date_created) > interval '7 days' + ORDER BY + 2 DESC + ) + ; +COMMIT; diff --git a/cl/scrapers/migrations/0004_create_mv_latest_opinion_customers.sql b/cl/scrapers/migrations/0004_create_mv_latest_opinion_customers.sql new file mode 100644 index 0000000000..45c212298e --- /dev/null +++ b/cl/scrapers/migrations/0004_create_mv_latest_opinion_customers.sql @@ -0,0 +1,49 @@ +BEGIN; +-- +-- Create model MVLatestOpinion +-- +-- (no-op) +-- +-- Raw SQL operation +-- + + CREATE MATERIALIZED VIEW IF NOT EXISTS + scrapers_mv_latest_opinion + AS + ( + SELECT + court_id, + max(so.date_created) as latest_creation_date, + DATE_TRUNC('minutes', (now() - max(so.date_created)))::text as time_since, + now() as view_last_updated + FROM + ( + SELECT id, court_id + FROM search_docket + WHERE court_id IN ( + SELECT id + FROM search_court + /* + Only check courts with scrapers in use + */ + WHERE + has_opinion_scraper + AND in_use + ) + ) sd + INNER JOIN + (SELECT id, docket_id FROM search_opinioncluster) soc ON soc.docket_id = sd.id + INNER JOIN + search_opinion so ON so.cluster_id = soc.id + GROUP BY + sd.court_id + HAVING + /* + Only return results for courts with no updates in a week + */ + now() - max(so.date_created) > interval '7 days' + ORDER BY + 2 DESC + ) + ; +COMMIT; From 610f9ff198d56319c8b8e9d5d0af72417aa05211 Mon Sep 17 00:00:00 2001 From: Gianfranco Rossi Date: Mon, 2 Dec 2024 11:14:38 -0500 Subject: [PATCH 3/6] feat(scrapers.admin): add refresh_scrapers_status_view command Also, delete customer's SQL --- 
.../commands/refresh_scrapers_status_view.py | 16 ++++++ ...004_create_mv_latest_opinion_customers.sql | 49 ------------------- 2 files changed, 16 insertions(+), 49 deletions(-) create mode 100644 cl/scrapers/management/commands/refresh_scrapers_status_view.py delete mode 100644 cl/scrapers/migrations/0004_create_mv_latest_opinion_customers.sql diff --git a/cl/scrapers/management/commands/refresh_scrapers_status_view.py b/cl/scrapers/management/commands/refresh_scrapers_status_view.py new file mode 100644 index 0000000000..6fb52499e7 --- /dev/null +++ b/cl/scrapers/management/commands/refresh_scrapers_status_view.py @@ -0,0 +1,16 @@ +from cl.lib.command_utils import VerboseCommand,logger +from django.db import connection + +class Command(VerboseCommand): + help = """Refreshes the `scrapers_mv_latest_opinion` materialized view. + + Check the cl.scrapers.admin.py file for more info about the view + """ + + def handle(self, *args, **options): + query = "REFRESH MATERIALIZED VIEW scrapers_mv_latest_opinion;" + with connection.cursor() as cursor: + cursor.execute(query) + + logger.info("View refresh completed successfully") + \ No newline at end of file diff --git a/cl/scrapers/migrations/0004_create_mv_latest_opinion_customers.sql b/cl/scrapers/migrations/0004_create_mv_latest_opinion_customers.sql deleted file mode 100644 index 45c212298e..0000000000 --- a/cl/scrapers/migrations/0004_create_mv_latest_opinion_customers.sql +++ /dev/null @@ -1,49 +0,0 @@ -BEGIN; --- --- Create model MVLatestOpinion --- --- (no-op) --- --- Raw SQL operation --- - - CREATE MATERIALIZED VIEW IF NOT EXISTS - scrapers_mv_latest_opinion - AS - ( - SELECT - court_id, - max(so.date_created) as latest_creation_date, - DATE_TRUNC('minutes', (now() - max(so.date_created)))::text as time_since, - now() as view_last_updated - FROM - ( - SELECT id, court_id - FROM search_docket - WHERE court_id IN ( - SELECT id - FROM search_court - /* - Only check courts with scrapers in use - */ - WHERE - 
has_opinion_scraper - AND in_use - ) - ) sd - INNER JOIN - (SELECT id, docket_id FROM search_opinioncluster) soc ON soc.docket_id = sd.id - INNER JOIN - search_opinion so ON so.cluster_id = soc.id - GROUP BY - sd.court_id - HAVING - /* - Only return results for courts with no updates in a week - */ - now() - max(so.date_created) > interval '7 days' - ORDER BY - 2 DESC - ) - ; -COMMIT; From 48046e7afdbd40ff3c71681063f16e58438de710 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 2 Dec 2024 16:15:24 +0000 Subject: [PATCH 4/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../management/commands/refresh_scrapers_status_view.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cl/scrapers/management/commands/refresh_scrapers_status_view.py b/cl/scrapers/management/commands/refresh_scrapers_status_view.py index 6fb52499e7..e0bf692f30 100644 --- a/cl/scrapers/management/commands/refresh_scrapers_status_view.py +++ b/cl/scrapers/management/commands/refresh_scrapers_status_view.py @@ -1,9 +1,11 @@ -from cl.lib.command_utils import VerboseCommand,logger from django.db import connection +from cl.lib.command_utils import VerboseCommand, logger + + class Command(VerboseCommand): help = """Refreshes the `scrapers_mv_latest_opinion` materialized view. 
- + Check the cl.scrapers.admin.py file for more info about the view """ @@ -11,6 +13,5 @@ def handle(self, *args, **options): query = "REFRESH MATERIALIZED VIEW scrapers_mv_latest_opinion;" with connection.cursor() as cursor: cursor.execute(query) - + logger.info("View refresh completed successfully") - \ No newline at end of file From cb5fb64d186cff4ba2797babb1c57935e8203b01 Mon Sep 17 00:00:00 2001 From: William Palin Date: Mon, 2 Dec 2024 15:30:52 -0500 Subject: [PATCH 5/6] feat(opinions.html): Ensure Case Name displays If case name full is the only case name, use it and then don't display it in the metadata section --- cl/opinion_page/templates/opinions.html | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cl/opinion_page/templates/opinions.html b/cl/opinion_page/templates/opinions.html index 320dbb40d9..cc84454184 100644 --- a/cl/opinion_page/templates/opinions.html +++ b/cl/opinion_page/templates/opinions.html @@ -216,14 +216,13 @@

- +

{{ cluster.docket.court }}


  • Citations: {{ cluster.citation_string|default:"None known" }}
  • - - {% if cluster.case_name_full != cluster.case_name and cluster.case_name_full != "" %} + {% if cluster.case_name_full != cluster.case_name and cluster.case_name_full != "" and cluster.case_name != "" %}
  • Full Case Name: {{ cluster.case_name_full }}
  • From 1af9aab1405f8cf287282e270a3f7b0d318963ca Mon Sep 17 00:00:00 2001 From: William Palin Date: Tue, 3 Dec 2024 09:40:40 -0500 Subject: [PATCH 6/6] fix(opinions.html): Use best case name --- cl/opinion_page/templates/opinions.html | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cl/opinion_page/templates/opinions.html b/cl/opinion_page/templates/opinions.html index cc84454184..d627dbcd74 100644 --- a/cl/opinion_page/templates/opinions.html +++ b/cl/opinion_page/templates/opinions.html @@ -216,13 +216,13 @@

- +

{{ cluster.docket.court }}


  • Citations: {{ cluster.citation_string|default:"None known" }}
  • - {% if cluster.case_name_full != cluster.case_name and cluster.case_name_full != "" and cluster.case_name != "" %} + {% if cluster.case_name_full != cluster|best_case_name %}
  • Full Case Name: {{ cluster.case_name_full }}