Skip to content

Commit

Permalink
Merge pull request #4662 from freelawproject/scrapers_latest_opinion_…
Browse files Browse the repository at this point in the history
…admin

feat(scrapers.admin): create materialized view and admin page
  • Loading branch information
mlissner authored Dec 2, 2024
2 parents d8456db + 48046e7 commit 3245860
Show file tree
Hide file tree
Showing 4 changed files with 172 additions and 0 deletions.
37 changes: 37 additions & 0 deletions cl/scrapers/admin.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from django.contrib import admin
from django.db import models

from cl.scrapers.models import (
PACERFreeDocumentLog,
Expand Down Expand Up @@ -29,3 +30,39 @@ class PACERFreeDocumentRowAdmin(admin.ModelAdmin):


# No ModelAdmin subclass is passed, so UrlHash gets Django's default admin options
admin.site.register(UrlHash)


class MVLatestOpinion(models.Model):
    """Unmanaged model mapped onto the `scrapers_mv_latest_opinion`
    materialized view, used for monitoring scrapers.

    The SQL that creates the view lives in its migration file. The view is
    not refreshed automatically: run
    `REFRESH MATERIALIZED VIEW scrapers_mv_latest_opinion` periodically.
    """

    # Django requires every model to declare a primary key
    court_id = models.TextField(primary_key=True)
    latest_creation_date = models.DateTimeField()
    time_since = models.TextField()
    view_last_updated = models.DateTimeField()

    class Meta:
        # Django must not manage this relation; it is created by raw SQL
        # in the migration rather than from the model definition
        db_table = "scrapers_mv_latest_opinion"
        managed = False


@admin.register(MVLatestOpinion)
class MVLatestOpinionAdmin(admin.ModelAdmin):
    """Read-only admin page to look at the latest opinion for each court

    Use this to monitor silently failing scrapers.

    The underlying model is backed by a materialized view, which cannot
    be written to through the ORM. Add, change and delete are therefore
    disabled so the admin never offers an action that can only end in a
    database error.
    """

    list_display = [
        "court_id",
        "latest_creation_date",
        "time_since",
        "view_last_updated",
    ]

    def has_add_permission(self, request):
        """Rows come from the view's query; they cannot be inserted."""
        return False

    def has_change_permission(self, request, obj=None):
        """Rows of a materialized view cannot be updated."""
        return False

    def has_delete_permission(self, request, obj=None):
        """Rows of a materialized view cannot be deleted."""
        return False
17 changes: 17 additions & 0 deletions cl/scrapers/management/commands/refresh_scrapers_status_view.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from django.db import connection

from cl.lib.command_utils import VerboseCommand, logger


class Command(VerboseCommand):
    """Management command that refreshes the scrapers status view."""

    help = """Refreshes the `scrapers_mv_latest_opinion` materialized view.
Check the cl.scrapers.admin.py file for more info about the view
"""

    def handle(self, *args, **options):
        """Run the refresh statement on the default connection, then log.

        :param args: positional arguments from the command line; unused
        :param options: parsed command options; unused
        """
        refresh_sql = "REFRESH MATERIALIZED VIEW scrapers_mv_latest_opinion;"
        with connection.cursor() as db_cursor:
            db_cursor.execute(refresh_sql)

        # Only reached when the statement above did not raise
        logger.info("View refresh completed successfully")
69 changes: 69 additions & 0 deletions cl/scrapers/migrations/0004_create_mv_latest_opinion.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# Generated by Django 5.1.2 on 2024-11-25 15:27

from django.db import migrations, models


class Migration(migrations.Migration):
    """Register the unmanaged `MVLatestOpinion` model and create the
    `scrapers_mv_latest_opinion` materialized view it reads from.

    `CreateModel` only records model state (the model is `managed = False`);
    the view itself is created by the raw SQL operation. A `reverse_sql` is
    provided so the migration can be unapplied: it drops the view.
    """

    dependencies = [
        ("scrapers", "0003_delete_errorlog"),
    ]

    operations = [
        migrations.CreateModel(
            name="MVLatestOpinion",
            fields=[
                (
                    "court_id",
                    models.TextField(primary_key=True, serialize=False),
                ),
                ("latest_creation_date", models.DateTimeField()),
                ("time_since", models.TextField()),
                ("view_last_updated", models.DateTimeField()),
            ],
            options={
                "db_table": "scrapers_mv_latest_opinion",
                "managed": False,
            },
        ),
        migrations.RunSQL(
            """
CREATE MATERIALIZED VIEW IF NOT EXISTS
scrapers_mv_latest_opinion
AS
(
SELECT
court_id,
max(so.date_created) as latest_creation_date,
DATE_TRUNC('minutes', (now() - max(so.date_created)))::text as time_since,
now() as view_last_updated
FROM
(
SELECT id, court_id
FROM search_docket
WHERE court_id IN (
SELECT id
FROM search_court
/*
Only check courts with scrapers in use
*/
WHERE
has_opinion_scraper
AND in_use
)
) sd
INNER JOIN
(SELECT id, docket_id FROM search_opinioncluster) soc ON soc.docket_id = sd.id
INNER JOIN
search_opinion so ON so.cluster_id = soc.id
GROUP BY
sd.court_id
HAVING
/*
Only return results for courts with no updates in a week
*/
now() - max(so.date_created) > interval '7 days'
ORDER BY
2 DESC
)
""",
            # Without reverse_sql Django treats RunSQL as irreversible.
            # IF EXISTS mirrors the forward IF NOT EXISTS so that
            # unapplying is safe even if the view is already gone.
            reverse_sql=(
                "DROP MATERIALIZED VIEW IF EXISTS scrapers_mv_latest_opinion;"
            ),
        ),
    ]
49 changes: 49 additions & 0 deletions cl/scrapers/migrations/0004_create_mv_latest_opinion.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
-- SQL for scrapers migration 0004_create_mv_latest_opinion.
-- Keep the statement below in sync with the RunSQL string in the .py migration.
BEGIN;
--
-- Create model MVLatestOpinion
--
-- (no-op)
--
-- Raw SQL operation
--

-- One row per court (see GROUP BY) whose newest opinion is older than 7 days.
CREATE MATERIALIZED VIEW IF NOT EXISTS
scrapers_mv_latest_opinion
AS
(
SELECT
court_id,
-- creation timestamp of the newest opinion for the court
max(so.date_created) as latest_creation_date,
-- age of that opinion, truncated to minutes and stored as text
DATE_TRUNC('minutes', (now() - max(so.date_created)))::text as time_since,
-- now() is evaluated when the view's query runs, so this records the last create/refresh
now() as view_last_updated
FROM
(
-- dockets restricted to the courts selected below
SELECT id, court_id
FROM search_docket
WHERE court_id IN (
SELECT id
FROM search_court
/*
Only check courts with scrapers in use
*/
WHERE
has_opinion_scraper
AND in_use
)
) sd
INNER JOIN
-- docket -> cluster -> opinion join chain
(SELECT id, docket_id FROM search_opinioncluster) soc ON soc.docket_id = sd.id
INNER JOIN
search_opinion so ON so.cluster_id = soc.id
GROUP BY
sd.court_id
HAVING
/*
Only return results for courts with no updates in a week
*/
now() - max(so.date_created) > interval '7 days'
ORDER BY
-- ordinal 2 is latest_creation_date
2 DESC
)
;
COMMIT;

0 comments on commit 3245860

Please sign in to comment.