Feature/background prune 3 #2608

Closed
wants to merge 126 commits into from

126 commits
10330fb
first cut at redis
rkuo-danswer Aug 23, 2024
7442957
Merge branch 'main' of https://github.com/danswer-ai/danswer into fea…
rkuo-danswer Aug 23, 2024
09e95fe
some new helper functions for the db
rkuo-danswer Aug 23, 2024
7244bae
Merge from feature/redis
rkuo-danswer Aug 23, 2024
25f4b08
ignore kombu tables in alembic migrations (used by celery)
rkuo-danswer Aug 29, 2024
d4c06b3
multiline commands for readability, add vespa_metadata_sync queue to …
rkuo-danswer Aug 30, 2024
5100065
typo fix
rkuo-danswer Aug 30, 2024
34a8bde
fix returning tuple fields
rkuo-danswer Aug 30, 2024
d856297
add constants
rkuo-danswer Aug 30, 2024
93fb8ef
fix _get_access_for_document
rkuo-danswer Aug 30, 2024
8d09dcf
docstrings!
rkuo-danswer Aug 30, 2024
6d05e29
fix double function declaration and typing
rkuo-danswer Aug 30, 2024
024d4dd
fix type hinting
rkuo-danswer Aug 30, 2024
186d4f1
add a global redis pool
rkuo-danswer Aug 30, 2024
5f7ca15
Add get_document function
rkuo-danswer Aug 30, 2024
66fb6b1
use task_logger in various celery tasks
rkuo-danswer Aug 30, 2024
f06efb9
add celeryconfig.py to simplify configuration. Will be used in a subs…
rkuo-danswer Aug 30, 2024
e36fd3c
Add celery redis helper. used in a subsequent PR
rkuo-danswer Aug 30, 2024
ab9cf9a
kombu warning getting spammy since celery is not self managing its qu…
rkuo-danswer Aug 30, 2024
48963ed
add last_modified and last_synced to documents
rkuo-danswer Aug 30, 2024
b4b7801
fix task naming convention
rkuo-danswer Aug 30, 2024
1cfbfe1
use celeryconfig.py
rkuo-danswer Aug 30, 2024
2144617
the big one. adds queues and tasks, updates functions to use the queu…
rkuo-danswer Aug 30, 2024
eea178f
change vespa index log line to debug
rkuo-danswer Aug 30, 2024
27eae3d
mypy fixes
rkuo-danswer Aug 30, 2024
db252a6
Merge branch 'main' of https://github.com/danswer-ai/danswer into fea…
rkuo-danswer Aug 30, 2024
c174d00
update alembic migration
rkuo-danswer Aug 30, 2024
c0a7c7c
fix fence ordering, rename to "monitor", fix fetch_versioned_implemen…
rkuo-danswer Aug 31, 2024
d17f49f
mypy
rkuo-danswer Sep 1, 2024
838a7fb
Merge branch 'main' of https://github.com/danswer-ai/danswer into fea…
rkuo-danswer Sep 2, 2024
0652698
switch to monotonic time
rkuo-danswer Sep 2, 2024
9dbd29d
fix startup dependencies on redis
LostVector Sep 2, 2024
575db62
Merge branch 'feature/redis' of https://github.com/danswer-ai/danswer
LostVector Sep 2, 2024
015fab9
Merge branch 'main' of https://github.com/danswer-ai/danswer into fea…
LostVector Sep 2, 2024
bdcd553
rebase alembic migration
LostVector Sep 2, 2024
e731a33
Merge branch 'main' of https://github.com/danswer-ai/danswer into fea…
LostVector Sep 2, 2024
c388a81
kombu cleanup - fail silently
LostVector Sep 2, 2024
a9aca68
Merge branch 'feature/redis' of https://github.com/danswer-ai/danswer…
LostVector Sep 2, 2024
f0bdb39
mypy
LostVector Sep 2, 2024
e53ff25
Merge branch 'feature/redis' of https://github.com/danswer-ai/danswer…
LostVector Sep 2, 2024
076ed18
add redis_host environment override
LostVector Sep 3, 2024
aa1eec8
update REDIS_HOST env var in docker-compose.dev.yml
LostVector Sep 3, 2024
f358acf
update the rest of the docker files
LostVector Sep 3, 2024
1d4b8cf
in flight
LostVector Sep 3, 2024
6724a13
Merge branch 'main' of https://github.com/danswer-ai/danswer into fea…
rkuo-danswer Sep 4, 2024
69d762e
Merge branch 'main' of https://github.com/danswer-ai/danswer into fea…
rkuo-danswer Sep 4, 2024
a56e3d7
Merge branch 'feature/redis' of https://github.com/danswer-ai/danswer…
rkuo-danswer Sep 4, 2024
d6d3d24
harden indexing-status endpoint against db changes happening in the b…
rkuo-danswer Sep 4, 2024
6335f5b
allow no task syncs to run because we create certain objects with no …
rkuo-danswer Sep 4, 2024
4c5c4a2
add back writing to vespa on indexing
rkuo-danswer Sep 4, 2024
957bd55
Merge branch 'feature/background_processing' of https://github.com/da…
rkuo-danswer Sep 4, 2024
7133450
actually working connector deletion
rkuo-danswer Sep 5, 2024
590bd0f
Merge branch 'main' of https://github.com/danswer-ai/danswer into fea…
rkuo-danswer Sep 5, 2024
98ec0b7
update contributing guide
rkuo-danswer Sep 5, 2024
ab097ce
Merge branch 'main' of https://github.com/danswer-ai/danswer into fea…
rkuo-danswer Sep 5, 2024
f363c6b
Merge branch 'feature/redis' of https://github.com/danswer-ai/danswer…
rkuo-danswer Sep 5, 2024
daf46b3
backporting fixes from background_deletion
rkuo-danswer Sep 5, 2024
dafbe13
renaming cache to cache_volume
rkuo-danswer Sep 5, 2024
bd07908
add redis password to various deployments
rkuo-danswer Sep 5, 2024
0bb2e57
try setting up pr testing for helm
rkuo-danswer Sep 5, 2024
0b293dd
fix indent
rkuo-danswer Sep 5, 2024
82bd7f4
hopefully this release version actually exists
rkuo-danswer Sep 5, 2024
0988947
fix command line option to --chart-dirs
rkuo-danswer Sep 5, 2024
c20d921
fetch-depth 0
rkuo-danswer Sep 5, 2024
d6c3bd3
edit values.yaml
rkuo-danswer Sep 5, 2024
b8d9205
try setting ct working directory
rkuo-danswer Sep 5, 2024
f90cfa7
bypass testing only on change for now
rkuo-danswer Sep 5, 2024
af34d19
move files and lint them
rkuo-danswer Sep 6, 2024
5d6fe92
update helm testing
rkuo-danswer Sep 6, 2024
8cf4098
some issues suggest using --config works
rkuo-danswer Sep 6, 2024
bc5cb2d
add vespa repo
rkuo-danswer Sep 6, 2024
ea35879
add postgresql repo
rkuo-danswer Sep 6, 2024
b427439
increase timeout
rkuo-danswer Sep 6, 2024
97d3671
try amd64 runner
rkuo-danswer Sep 6, 2024
3574c23
fix redis password reference
rkuo-danswer Sep 6, 2024
f02a231
add comment to helm chart testing workflow
rkuo-danswer Sep 6, 2024
3323d4c
rename helm testing workflow to disable it
rkuo-danswer Sep 6, 2024
3afa07a
Merge branch 'main' of https://github.com/danswer-ai/danswer into fea…
rkuo-danswer Sep 6, 2024
2af4926
Merge branch 'feature/redis' of https://github.com/danswer-ai/danswer…
rkuo-danswer Sep 6, 2024
7512d68
Merge branch 'feature/background_processing' of https://github.com/da…
rkuo-danswer Sep 6, 2024
29860f8
Merge branch 'main' of https://github.com/danswer-ai/danswer into fea…
rkuo-danswer Sep 6, 2024
f5cbb6d
Merge branch 'feature/background_processing' of https://github.com/da…
rkuo-danswer Sep 6, 2024
89619ce
adding clarifying comments
rkuo-danswer Sep 6, 2024
69983ab
address code review
rkuo-danswer Sep 7, 2024
138866a
missed a file
rkuo-danswer Sep 7, 2024
84ec5f4
remove commented warning ... just not needed
LostVector Sep 7, 2024
ab77692
Merge branch 'main' of https://github.com/danswer-ai/danswer into fea…
rkuo-danswer Sep 9, 2024
ca6858b
Merge branch 'feature/background_processing' of https://github.com/da…
rkuo-danswer Sep 9, 2024
fe9bb06
Merge branch 'main' of https://github.com/danswer-ai/danswer into fea…
rkuo-danswer Sep 9, 2024
ba58e7d
Merge branch 'feature/background_processing' of https://github.com/da…
rkuo-danswer Sep 9, 2024
8aef77a
fix imports
rkuo-danswer Sep 9, 2024
30f6463
refactor to use update_single
rkuo-danswer Sep 9, 2024
092e190
mypy fixes
rkuo-danswer Sep 9, 2024
be61371
add vespa test
rkuo-danswer Sep 10, 2024
026bca4
merge
rkuo-danswer Sep 10, 2024
5f85632
Merge branch 'main' of https://github.com/danswer-ai/danswer into fea…
rkuo-danswer Sep 13, 2024
20ba19b
multiple celery workers
rkuo-danswer Sep 13, 2024
9b99c14
update logs as well and set prefetch multipliers appropriate to the w…
rkuo-danswer Sep 13, 2024
b5528e3
Merge branch 'main' of https://github.com/danswer-ai/danswer into fea…
LostVector Sep 14, 2024
aca5204
add db refresh to connector deletion
LostVector Sep 14, 2024
fc3af78
Merge branch 'main' of https://github.com/danswer-ai/danswer into fea…
LostVector Sep 14, 2024
a4d36b9
Merge branch 'main' of https://github.com/danswer-ai/danswer into fea…
LostVector Sep 15, 2024
9ac8a10
add some preliminary locking
rkuo-danswer Sep 16, 2024
76b0996
Merge branch 'main' of https://github.com/danswer-ai/danswer into fea…
rkuo-danswer Sep 16, 2024
90cc753
organize tasks into separate files
rkuo-danswer Sep 16, 2024
faaccc7
celery auto associates tasks created inside another task, which bloat…
rkuo-danswer Sep 17, 2024
c032861
code review fixes
LostVector Sep 17, 2024
e2f3363
Merge branch 'main' of https://github.com/danswer-ai/danswer into fea…
rkuo-danswer Sep 17, 2024
905f104
move monitor_usergroup_taskset to ee, improve logging
rkuo-danswer Sep 17, 2024
6a8e4ef
Merge branch 'main' of https://github.com/danswer-ai/danswer into fea…
rkuo-danswer Sep 17, 2024
166b405
Merge branch 'feature/background_deletion' of https://github.com/dans…
rkuo-danswer Sep 18, 2024
3a76733
Merge branch 'main' of https://github.com/danswer-ai/danswer into fea…
rkuo-danswer Sep 18, 2024
bffdf94
add multi workers to dev_run_background_jobs.py
rkuo-danswer Sep 18, 2024
2f6b5f4
update supervisord with some recommended settings for celery
rkuo-danswer Sep 18, 2024
bb4ee6b
name celery workers and shorten dev script prefixing
rkuo-danswer Sep 19, 2024
4f7e5cb
add configurable sql alchemy engine settings on startup (needed for v…
rkuo-danswer Sep 19, 2024
6ebc623
fix comments
rkuo-danswer Sep 19, 2024
6433d47
merge main
rkuo-danswer Sep 20, 2024
12219b2
autoscale sqlalchemy pool size to celery concurrency (allow override …
rkuo-danswer Sep 20, 2024
e0829a9
supervisord needs the percent symbols escaped
rkuo-danswer Sep 20, 2024
84e3fbd
use name as primary check, some minor refactoring and type hinting too.
rkuo-danswer Sep 20, 2024
236b1e0
Merge branch 'main' of https://github.com/danswer-ai/danswer into fea…
rkuo-danswer Sep 20, 2024
ece7d64
Merge branch 'main' of https://github.com/danswer-ai/danswer into fea…
rkuo-danswer Sep 24, 2024
1a41cf8
stash merge (may not function yet)
rkuo-danswer Sep 25, 2024
427c86e
Merge branch 'main' of https://github.com/danswer-ai/danswer into fea…
rkuo-danswer Sep 27, 2024
2c61f91
cherry pick
LostVector Sep 30, 2024
@@ -0,0 +1,27 @@
"""add last_pruned to the connector_credential_pair table

Revision ID: ac5eaac849f9
Revises: 46b7a812670f
Create Date: 2024-09-10 15:04:26.437118

"""
from alembic import op
import sqlalchemy as sa

# revision identifiers, used by Alembic.
revision = "ac5eaac849f9"
down_revision = "46b7a812670f"
branch_labels = None
depends_on = None


def upgrade() -> None:
    # last_pruned represents the last time the connector was pruned
    op.add_column(
        "connector_credential_pair",
        sa.Column("last_pruned", sa.DateTime(timezone=True), nullable=True),
    )


def downgrade() -> None:
    op.drop_column("connector_credential_pair", "last_pruned")
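
This migration only adds the column; the due-check that consumes it lives elsewhere in the PR. A minimal sketch of such a check, assuming connector.prune_freq is an interval in seconds and mirroring the never-pruned fallback from the old should_prune_cc_pair code further down this page — the helper name and the cc_pair.last_pruned ORM attribute are assumptions, not code from this diff:

from datetime import datetime
from datetime import timezone

from danswer.db.models import ConnectorCredentialPair


def is_pruning_due(cc_pair: ConnectorCredentialPair) -> bool:
    # a connector with no prune_freq configured is never pruned
    if not cc_pair.connector.prune_freq:
        return False

    # never pruned: compare against connector creation, as the old logic did
    if cc_pair.last_pruned is None:
        elapsed = datetime.now(timezone.utc) - cc_pair.connector.time_created
        return elapsed.total_seconds() >= cc_pair.connector.prune_freq

    elapsed = datetime.now(timezone.utc) - cc_pair.last_pruned
    return elapsed.total_seconds() >= cc_pair.connector.prune_freq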
36 changes: 30 additions & 6 deletions backend/danswer/background/celery/celery_app.py
@@ -19,6 +19,7 @@

from danswer.background.celery.celery_redis import RedisConnectorCredentialPair
from danswer.background.celery.celery_redis import RedisConnectorDeletion
+from danswer.background.celery.celery_redis import RedisConnectorPruning
from danswer.background.celery.celery_redis import RedisDocumentSet
from danswer.background.celery.celery_redis import RedisUserGroup
from danswer.background.celery.celery_utils import celery_is_worker_primary
@@ -108,6 +109,14 @@ def celery_task_postrun(
            r.srem(rcd.taskset_key, task_id)
        return

+    if task_id.startswith(RedisConnectorPruning.SUBTASK_PREFIX):
+        r = redis_pool.get_client()
+        cc_pair_id = RedisConnectorPruning.get_id_from_task_id(task_id)
+        if cc_pair_id is not None:
+            rcp = RedisConnectorPruning(cc_pair_id)
+            r.srem(rcp.taskset_key, task_id)
+        return
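
RedisConnectorPruning.get_id_from_task_id is not shown in this diff; a plausible sketch (written as a standalone function here), assuming the "{prefix}_{id}_{uuid}" custom task id convention documented in celery_redis.py below — uuid4 strings contain no underscores, so a plain split is safe:

def get_id_from_task_id(task_id: str) -> int | None:
    # "connectorpruning+sub_1_dd32ded3-..." -> ["connectorpruning+sub", "1", "dd32ded3-..."]
    parts = task_id.split("_")
    if len(parts) != 3:
        return None
    try:
        return int(parts[1])
    except ValueError:
        return None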


@beat_init.connect
def on_beat_init(sender: Any, **kwargs: Any) -> None:
@@ -240,6 +249,18 @@ def on_worker_init(sender: Any, **kwargs: Any) -> None:
    for key in r.scan_iter(RedisConnectorDeletion.FENCE_PREFIX + "*"):
        r.delete(key)

+    for key in r.scan_iter(RedisConnectorPruning.TASKSET_PREFIX + "*"):
+        r.delete(key)
+
+    for key in r.scan_iter(RedisConnectorPruning.GENERATOR_COMPLETE_PREFIX + "*"):
+        r.delete(key)
+
+    for key in r.scan_iter(RedisConnectorPruning.GENERATOR_PROGRESS_PREFIX + "*"):
+        r.delete(key)
+
+    for key in r.scan_iter(RedisConnectorPruning.FENCE_PREFIX + "*"):
+        r.delete(key)


@worker_ready.connect
def on_worker_ready(sender: Any, **kwargs: Any) -> None:
@@ -334,7 +355,11 @@ def on_setup_logging(

class HubPeriodicTask(bootsteps.StartStopStep):
    """Regularly reacquires the primary worker lock outside of the task queue.
-    Use the task_logger in this class to avoid double logging."""
+    Use the task_logger in this class to avoid double logging.
+
+    This cannot be done inside a regular beat task because it must run on schedule and
+    a queue of existing work would starve the task from running.
+    """

    # it's unclear to me whether using the hub's timer or the bootstep timer is better
    requires = {"celery.worker.components:Hub"}
@@ -368,8 +393,6 @@ def run_periodic_task(self, worker: Any) -> None:

        lock: redis.lock.Lock = worker.primary_worker_lock

-        task_logger.info("Reacquiring primary worker lock.")
-
        if lock.owned():
            task_logger.debug("Reacquiring primary worker lock.")
            lock.reacquire()
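
For context, redis-py locks expire after their timeout unless refreshed, and reacquire() raises if the lock is no longer owned — hence the owned() check before refreshing. A standalone sketch of the pattern (the key name and timeout are invented for illustration; the real lock is created at worker startup):

import redis

r = redis.Redis()
lock = r.lock("da_lock:primary_worker", timeout=120)  # hypothetical key/timeout

if lock.acquire(blocking=False):
    pass  # we are the primary worker

# on each periodic tick:
if lock.owned():
    lock.reacquire()  # push the expiry out so ownership persists
else:
    # the lock expired or was taken by another worker; try to become primary again
    lock.acquire(blocking=False)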
@@ -411,6 +434,7 @@ def stop(self, worker: Any) -> None:
        "danswer.background.celery.tasks.connector_deletion",
        "danswer.background.celery.tasks.periodic",
        "danswer.background.celery.tasks.pruning",
+        "danswer.background.celery.tasks.shared",
        "danswer.background.celery.tasks.vespa",
    ]
)
@@ -431,16 +455,16 @@ def stop(self, worker: Any) -> None:
            "task": "check_for_connector_deletion_task",
            # don't need to check too often, since we kick off a deletion initially
            # during the API call that actually marks the CC pair for deletion
-            "schedule": timedelta(minutes=1),
+            "schedule": timedelta(seconds=60),
            "options": {"priority": DanswerCeleryPriority.HIGH},
        },
    }
)
celery_app.conf.beat_schedule.update(
    {
        "check-for-prune": {
-            "task": "check_for_prune_task",
-            "schedule": timedelta(seconds=5),
+            "task": "check_for_prune_task_2",
+            "schedule": timedelta(seconds=60),
            "options": {"priority": DanswerCeleryPriority.HIGH},
        },
    }
104 changes: 104 additions & 0 deletions backend/danswer/background/celery/celery_redis.py
@@ -343,6 +343,110 @@ def generate_tasks(
        return len(async_results)


class RedisConnectorPruning(RedisObjectHelper):
    """Celery will kick off a long running generator task to crawl the connector and
    find any missing docs, which will each then get a new cleanup task. The progress of
    those tasks will then be monitored to completion.

    Example rough happy path order:
    Check connectorpruning_fence_1
    Send generator task with id connectorpruning+generator_1_{uuid}

    generator runs connector with callbacks that increment connectorpruning_generator_progress_1
    generator creates many subtasks with id connectorpruning+sub_1_{uuid}
        in taskset connectorpruning_taskset_1
    on completion, generator sets connectorpruning_generator_complete_1

    celery postrun removes subtasks from taskset
    monitor beat task cleans up when taskset reaches 0 items
    """

    PREFIX = "connectorpruning"
    FENCE_PREFIX = PREFIX + "_fence"  # a fence for the entire pruning process
    GENERATOR_TASK_PREFIX = PREFIX + "+generator"

    TASKSET_PREFIX = PREFIX + "_taskset"  # stores a list of prune task ids
    SUBTASK_PREFIX = PREFIX + "+sub"

    GENERATOR_PROGRESS_PREFIX = (
        PREFIX + "_generator_progress"
    )  # a signal that contains generator progress
    GENERATOR_COMPLETE_PREFIX = (
        PREFIX + "_generator_complete"
    )  # a signal that the generator has finished

    def __init__(self, id: int) -> None:
        super().__init__(id)
        self.documents_to_prune: set[str] = set()

    @property
    def generator_task_id_prefix(self) -> str:
        return f"{self.GENERATOR_TASK_PREFIX}_{self._id}"

    @property
    def generator_progress_key(self) -> str:
        # example: connectorpruning_generator_progress_1
        return f"{self.GENERATOR_PROGRESS_PREFIX}_{self._id}"

    @property
    def generator_complete_key(self) -> str:
        # example: connectorpruning_generator_complete_1
        return f"{self.GENERATOR_COMPLETE_PREFIX}_{self._id}"

    @property
    def subtask_id_prefix(self) -> str:
        return f"{self.SUBTASK_PREFIX}_{self._id}"

    def generate_tasks(
        self,
        celery_app: Celery,
        db_session: Session,
        redis_client: Redis,
        lock: redis.lock.Lock | None,
    ) -> int | None:
        last_lock_time = time.monotonic()

        async_results = []
        cc_pair = get_connector_credential_pair_from_id(self._id, db_session)
        if not cc_pair:
            return None

        for doc_id in self.documents_to_prune:
            current_time = time.monotonic()
            if lock and current_time - last_lock_time >= (
                CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT / 4
            ):
                lock.reacquire()
                last_lock_time = current_time

            # celery's default task id format is "dd32ded3-00aa-4884-8b21-42f8332e7fac"
            # the actual redis key is "celery-task-meta-dd32ded3-00aa-4884-8b21-42f8332e7fac"
            # we prefix the task id so it's easier to keep track of who created the task
            # aka "connectorpruning+sub_1_dd32ded3-00aa-4884-8b21-42f8332e7fac"
            custom_task_id = f"{self.subtask_id_prefix}_{uuid4()}"

            # add to the tracking taskset in redis BEFORE creating the celery task.
            # note that for the moment we are using a single taskset key, not differentiated by cc_pair id
            redis_client.sadd(self.taskset_key, custom_task_id)

            # Priority on syncs triggered by new indexing should be medium
            result = celery_app.send_task(
                "document_by_cc_pair_cleanup_task",
                kwargs=dict(
                    document_id=doc_id,
                    connector_id=cc_pair.connector_id,
                    credential_id=cc_pair.credential_id,
                ),
                queue=DanswerCeleryQueues.CONNECTOR_DELETION,
                task_id=custom_task_id,
                priority=DanswerCeleryPriority.MEDIUM,
            )

            async_results.append(result)

        return len(async_results)
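
The docstring's final step ("monitor beat task cleans up when taskset reaches 0 items") happens outside this class. A hedged sketch of that monitor, using only the keys defined above — the function name and the exact set of keys cleared are assumptions:

from redis import Redis


def monitor_ccpair_pruning_taskset(cc_pair_id: int, r: Redis) -> None:
    rcp = RedisConnectorPruning(cc_pair_id)

    if not r.exists(rcp.fence_key):
        return  # no pruning in progress for this cc_pair

    if not r.exists(rcp.generator_complete_key):
        return  # the generator is still enumerating docs to prune

    if r.scard(rcp.taskset_key) > 0:
        return  # cleanup subtasks still in flight

    # everything finished: clear the signals so a new prune can start
    r.delete(rcp.taskset_key)
    r.delete(rcp.generator_progress_key)
    r.delete(rcp.generator_complete_key)
    r.delete(rcp.fence_key)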


def celery_get_queue_length(queue: str, r: Redis) -> int:
"""This is a redis specific way to get the length of a celery queue.
It is priority aware and knows how to count across the multiple redis lists
Expand Down
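
For context on "multiple redis lists": celery's redis transport (kombu) implements priorities by splitting a queue into several lists named "{queue}{sep}{priority}", where the separator is kombu's "\x06\x16" and the unsuffixed list holds the lowest priority. A hedged sketch of the counting approach — the range of priority levels probed is an assumption:

from redis import Redis


def celery_get_queue_length_sketch(queue: str, r: Redis) -> int:
    total = 0
    for priority in range(10):
        key = queue if priority == 0 else f"{queue}\x06\x16{priority}"
        total += r.llen(key)  # llen returns 0 for missing keys
    return total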
105 changes: 34 additions & 71 deletions backend/danswer/background/celery/celery_utils.py
@@ -1,12 +1,12 @@
+from collections.abc import Callable
from datetime import datetime
from datetime import timezone
from typing import Any

from sqlalchemy.orm import Session

from danswer.background.celery.celery_redis import RedisConnectorDeletion
-from danswer.background.task_utils import name_cc_prune_task
-from danswer.configs.app_configs import ALLOW_SIMULTANEOUS_PRUNING
+from danswer.background.celery.celery_redis import RedisConnectorPruning
from danswer.configs.app_configs import MAX_PRUNING_DOCUMENT_RETRIEVAL_PER_MINUTE
from danswer.connectors.cross_connector_utils.rate_limit_wrapper import (
    rate_limit_builder,
@@ -17,14 +17,9 @@
from danswer.connectors.interfaces import PollConnector
from danswer.connectors.models import Document
from danswer.db.connector_credential_pair import get_connector_credential_pair
-from danswer.db.engine import get_db_current_time
+from danswer.db.connector_credential_pair import get_connector_credential_pair_from_id
from danswer.db.enums import TaskStatus
-from danswer.db.models import Connector
-from danswer.db.models import Credential
from danswer.db.models import TaskQueueState
-from danswer.db.tasks import check_task_is_live_and_not_timed_out
-from danswer.db.tasks import get_latest_task
-from danswer.db.tasks import get_latest_task_by_type
+from danswer.redis.redis_pool import RedisPool
from danswer.server.documents.models import DeletionAttemptSnapshot
from danswer.utils.logger import setup_logger
@@ -33,6 +28,24 @@
+redis_pool = RedisPool()


+# TODO: make this a member of RedisConnectorPruning
+def cc_pair_is_pruning(cc_pair_id: int, db_session: Session) -> bool:
+    cc_pair = get_connector_credential_pair_from_id(
+        cc_pair_id=cc_pair_id, db_session=db_session
+    )
+    if not cc_pair:
+        raise ValueError(f"cc_pair_id {cc_pair_id} does not exist.")
+
+    rcp = RedisConnectorPruning(cc_pair.id)
+
+    r = redis_pool.get_client()
+    if r.exists(rcp.fence_key):
+        return True
+
+    return False
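
A hedged usage sketch for cc_pair_is_pruning: raising the redis fence before dispatching the pruning generator, so overlapping prunes of the same cc_pair are rejected. The generator task name, queue, and import paths are assumptions — the real kickoff lives in tasks/pruning.py, which this page does not render:

from celery import Celery

from danswer.background.celery.celery_redis import RedisConnectorPruning
from danswer.configs.constants import DanswerCeleryPriority  # assumed import path


def try_kickoff_prune(
    cc_pair_id: int, db_session: Session, celery_app: Celery
) -> bool:
    if cc_pair_is_pruning(cc_pair_id, db_session):
        return False  # a prune is already fenced off in redis

    rcp = RedisConnectorPruning(cc_pair_id)
    r = redis_pool.get_client()
    r.set(rcp.fence_key, 1)  # raise the fence before dispatching

    celery_app.send_task(
        "connector_pruning_generator_task",  # hypothetical task name
        kwargs=dict(cc_pair_id=cc_pair_id),
        queue="connector_pruning",  # hypothetical queue
        priority=DanswerCeleryPriority.MEDIUM,
    )
    return True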


def _get_deletion_status(
connector_id: int, credential_id: int, db_session: Session
) -> TaskQueueState | None:
@@ -70,72 +83,19 @@ def get_deletion_attempt_snapshot(
    )


-def skip_cc_pair_pruning_by_task(
-    pruning_task: TaskQueueState | None, db_session: Session
-) -> bool:
-    """task should be the latest prune task for this cc_pair"""
-    if not ALLOW_SIMULTANEOUS_PRUNING:
-        # if only one prune is allowed at any time, then check to see if any prune
-        # is active
-        pruning_type_task_name = name_cc_prune_task()
-        last_pruning_type_task = get_latest_task_by_type(
-            pruning_type_task_name, db_session
-        )
-
-        if last_pruning_type_task and check_task_is_live_and_not_timed_out(
-            last_pruning_type_task, db_session
-        ):
-            return True
-
-    if pruning_task and check_task_is_live_and_not_timed_out(pruning_task, db_session):
-        # if the last task is live right now, we shouldn't start a new one
-        return True
-
-    return False
-
-
-def should_prune_cc_pair(
-    connector: Connector, credential: Credential, db_session: Session
-) -> bool:
-    if not connector.prune_freq:
-        return False
-
-    pruning_task_name = name_cc_prune_task(
-        connector_id=connector.id, credential_id=credential.id
-    )
-    last_pruning_task = get_latest_task(pruning_task_name, db_session)
-
-    if skip_cc_pair_pruning_by_task(last_pruning_task, db_session):
-        return False
-
-    current_db_time = get_db_current_time(db_session)
-
-    if not last_pruning_task:
-        # If the connector has never been pruned, then compare vs when the connector
-        # was created
-        time_since_initialization = current_db_time - connector.time_created
-        if time_since_initialization.total_seconds() >= connector.prune_freq:
-            return True
-        return False
-
-    if not last_pruning_task.start_time:
-        # if the last prune task hasn't started, we shouldn't start a new one
-        return False
-
-    # if the last prune task has a start time, then compare against it to determine
-    # if we should start
-    time_since_last_pruning = current_db_time - last_pruning_task.start_time
-    return time_since_last_pruning.total_seconds() >= connector.prune_freq


def document_batch_to_ids(doc_batch: list[Document]) -> set[str]:
    return {doc.id for doc in doc_batch}


-def extract_ids_from_runnable_connector(runnable_connector: BaseConnector) -> set[str]:
+def extract_ids_from_runnable_connector(
+    runnable_connector: BaseConnector,
+    progress_callback: Callable[[int], None] | None = None,
+) -> set[str]:
    """
    If the PruneConnector hasn't been implemented for the given connector, just pull
-    all docs using the load_from_state and grab out the IDs
+    all docs using the load_from_state and grab out the IDs.
+
+    Optionally, a callback can be passed to handle the length of each document batch.
    """
    all_connector_doc_ids: set[str] = set()

@@ -158,6 +118,8 @@ def extract_ids_from_runnable_connector(
        max_calls=MAX_PRUNING_DOCUMENT_RETRIEVAL_PER_MINUTE, period=60
    )(document_batch_to_ids)
    for doc_batch in doc_batch_generator:
+        if progress_callback:
+            progress_callback(len(doc_batch))
        all_connector_doc_ids.update(doc_batch_processing_func(doc_batch))

    return all_connector_doc_ids
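
A usage sketch tying the new progress_callback to the redis progress signal described in RedisConnectorPruning's docstring ("callbacks that increment connectorpruning_generator_progress_1"); the surrounding generator-task wiring is an assumption:

from danswer.background.celery.celery_redis import RedisConnectorPruning


def prune_with_progress(cc_pair_id: int, runnable_connector: BaseConnector) -> set[str]:
    rcp = RedisConnectorPruning(cc_pair_id)
    r = redis_pool.get_client()

    def redis_increment_callback(amount: int) -> None:
        # mirror each batch's size into the generator progress signal
        r.incrby(rcp.generator_progress_key, amount)

    return extract_ids_from_runnable_connector(
        runnable_connector, progress_callback=redis_increment_callback
    )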
@@ -177,9 +139,10 @@ def celery_is_listening_to_queue(worker: Any, name: str) -> bool:


def celery_is_worker_primary(worker: Any) -> bool:
-    """There are multiple approaches that could be taken, but the way we do it is to
-    check the hostname set for the celery worker, either in celeryconfig.py or on the
-    command line."""
+    """There are multiple approaches that could be taken to determine if a celery worker
+    is 'primary', as defined by us. But the way we do it is to check the hostname set
+    for the celery worker, which can be done either in celeryconfig.py or on the
+    command line with '--hostname'."""
    hostname = worker.hostname
    if hostname.startswith("light"):
        return False