Add new ArticleStatusManagerTimeslice and specs based on existing ones

WikiEducationFoundation · Jan 4, 2025 · a7179c2 · a7179c2
1 parent f6209d5
commit a7179c2
Show file tree

Hide file tree

Showing 2 changed files with 645 additions and 0 deletions.
diff --git a/lib/article_status_manager_timeslice.rb b/lib/article_status_manager_timeslice.rb
@@ -0,0 +1,238 @@
+# frozen_string_literal: true
+
+require_dependency "#{Rails.root}/lib/articles_courses_cleaner_timeslice"
+require_dependency "#{Rails.root}/lib/assignment_updater"
+
+#= Updates articles to reflect deletion and page moves on Wikipedia
+# This class is responsible for identifying articles that:
+# * Were deleted or untracked: These are articles that were either deleted or
+# moved to a namespace not traceable by the course. Such articles should be
+# excluded from course statistics.
+# * Were restored or re-tracked: These are articles that were either restored
+# or moved to a namespace relevant to the course. Such articles should be
+# included in course statistics.
+
+class ArticleStatusManagerTimeslice
+  def initialize(course, wiki = nil)
+    @course = course
+    @wiki = wiki || Wiki.default_wiki
+  end
+
+  ################
+  # Entry points #
+  ################
+
+  def self.update_article_status_for_course(course)
+    course.wikis.each do |wiki|
+      # Determine articles based on ac timeslices
+      course.articles_from_timeslices(wiki.id)
+            # we should determine articles based on ac timeslices
+            # .where(wiki_id: wiki.id)
+            # Updating only those articles which are updated more than 1 day ago
+            .where('articles.updated_at < ?', 1.day.ago)
+            .in_batches do |article_batch|
+        # Using in_batches so that the update_at of all articles in the batch can be
+        # excuted in a single query, otherwise if we use find_in_batches, query for
+        # each article for updating the same would be required
+        new(course, wiki).update_status(article_batch)
+        # rubocop:disable Rails/SkipsModelValidations
+        article_batch.touch_all(:updated_at)
+        # rubocop:enable Rails/SkipsModelValidations
+      end
+    end
+
+    ArticlesCoursesCleanerTimeslice.clean_articles_courses_for_deleted_or_untracked_articles(course)
+    ArticlesCoursesCleanerTimeslice.clean_articles_courses_for_undeleted_or_retracked_articles(course)
+  end
+
+  ####################
+  # Per-wiki methods #
+  ####################
+
+  def update_status(articles)
+    # This is used for problem cases where articles can't be easily disambiguated
+    # because of duplicate records with the same mw_page_id. It's only used if
+    # `articles` is just one record.
+    @article = articles.first if articles.one?
+
+    identify_deleted_and_synced_page_ids(articles)
+
+    # First we find any pages that just moved, and update title and namespace.
+    update_title_and_namespace @synced_articles
+
+    # Now we check for pages that have changed mw_page_ids.
+    # This happens in situations such as history merges.
+    # If articles move in between title/namespace updates and mw_page_id updates,
+    # then it's possible to end up with inconsistent data.
+    update_article_ids @deleted_page_ids
+
+    # Mark articles as deleted as appropriate
+    update_deleted_articles(articles)
+    move_timeslices_for_deleted_articles(articles)
+  end
+
+  ##################
+  # Helper methods #
+  ##################
+  private
+
+  def identify_deleted_and_synced_page_ids(articles)
+    @synced_articles = article_data_from_replica(articles)
+    @synced_ids = @synced_articles.map { |a| a['page_id'].to_i }
+
+    # If any Replica requests failed, we don't want to assume that missing
+    # articles are deleted.
+    # FIXME: A better approach would be to look for deletion logs, and only mark
+    # an article as deleted if there is a corresponding deletion log.
+    @deleted_page_ids = if @failed_request_count.zero?
+                          articles.map(&:mw_page_id) - @synced_ids
+                        else
+                          []
+                        end
+  end
+
+  def article_data_from_replica(articles)
+    @failed_request_count = 0
+    synced_articles = Utils.chunk_requests(articles, 100) do |block|
+      request_results = Replica.new(@wiki).get_existing_articles_by_id block
+      @failed_request_count += 1 if request_results.nil?
+      request_results
+    end
+    synced_articles
+  end
+
+  def update_title_and_namespace(synced_articles)
+    # Update titles and namespaces based on mw_page_ids
+    synced_articles.each do |article_data|
+      article = @article || find_article_by_mw_page_id(article_data['page_id'])
+      next if data_matches_article?(article_data, article)
+
+      # FIXME: Workaround for four-byte unicode characters in article titles,
+      # until we fix the database to handle them.
+      # https://github.com/WikiEducationFoundation/WikiEduDashboard/issues/1744
+      # These titles are saved as their URL-encoded equivalents.
+      next if article.title[0] == '%'
+
+      begin
+        article.update!(title: article_data['page_title'],
+                        namespace: article_data['page_namespace'],
+                        deleted: false)
+        # If namespace changed or the article gets undeleted, I had to re-reprocess timeslices for
+        # that article
+        # Call timeslice_manager.update_timeslices_that_need_update_from_article_timeslices(
+        #     timeslices
+        #    )
+        # Find corresponding Assignment records and update the titles
+        AssignmentUpdater.update_assignments_for_article(article)
+      rescue ActiveRecord::RecordNotUnique => e
+        # If we reach this point, it's most likely that @article has been set. This only
+        # happens when update_status is invoked with a single article, which probably indicates
+        # it was called from a cleanup script. In this case, we consider @article as
+        # a duplicate article record, so we re-process timeslices for it.
+
+        # if this is a duplicate article record, moving the revisions to the non-deleted
+        # copy should prevent it from being part of a future update.
+        # NOTE: ActiveRecord::RecordNotUnique is a subtype of ActiveRecord::StatementInvalid
+        # so this rescue comes first.
+
+        # I don't know what to do here
+        handle_undeletion(article)
+        Sentry.capture_exception e, level: 'warning'
+      rescue ActiveRecord::StatementInvalid => e # workaround for 4-byte unicode errors
+        Sentry.capture_exception e
+      end
+    end
+  end
+
+  def update_deleted_articles(articles)
+    return unless @failed_request_count.zero?
+    articles.each do |article|
+      next unless @deleted_page_ids.include? article.mw_page_id
+      # Reload to account for articles that have had their mw_page_id changed
+      # because the page was moved rather than deleted.
+      next unless @deleted_page_ids.include? article.reload.mw_page_id
+      article.update(deleted: true)
+    end
+  end
+
+  # If an article sets as deleted has a sync mw page id, then mark the timeslices
+  # as needs_update
+  def move_timeslices_for_deleted_articles(articles)
+    articles.filter(&:deleted).each do |article|
+      next unless @synced_ids.include? article.mw_page_id
+      handle_undeletion(article)
+    end
+  end
+
+  def handle_undeletion(article)
+    # If there's already a non-deleted copy, we need to reprocess the timeslices for article
+    nondeleted_article = Article.find_by(wiki_id: @wiki.id,
+                                         mw_page_id: article.mw_page_id, deleted: false)
+    # If there is only one copy of the article, it was already found and updated
+    # via `update_title_and_namespace`
+    return unless nondeleted_article
+    ArticlesCoursesCleanerTimeslice.clean_articles(@course, [article])
+    # Mark all ac timeslices for article as needs_update
+    # Remove ac timeslices for article
+    # remove ac for article (if it exists)
+  end
+
+  def data_matches_article?(article_data, article)
+    return false unless article.title == article_data['page_title']
+    return false unless article.namespace == article_data['page_namespace'].to_i
+    # If article data is collected from Replica, the article is not currently deleted
+    return false if article.deleted
+    true
+  end
+
+  # Check whether any deleted pages still exist with a different article_id.
+  # If so, update the Article to use the new id.
+  def update_article_ids(deleted_page_ids)
+    maybe_deleted = Article.where(mw_page_id: deleted_page_ids, wiki_id: @wiki.id)
+    return if maybe_deleted.empty?
+    # These pages have titles that match Articles in our DB with deleted ids
+    request_results = Replica.new(@wiki).post_existing_articles_by_title maybe_deleted
+    @failed_request_count += 1 if request_results.nil?
+
+    # Update articles whose IDs have changed (keyed on title and namespace)
+    request_results&.each do |stp|
+      resolve_page_id(stp, deleted_page_ids)
+    end
+  end
+
+  def resolve_page_id(same_title_page, deleted_page_ids)
+    title = same_title_page['page_title']
+    mw_page_id = same_title_page['page_id']
+    namespace = same_title_page['page_namespace']
+
+    article = Article.find_by(wiki_id: @wiki.id, title:, namespace:, deleted: false)
+
+    return unless article_data_matches?(article, title, deleted_page_ids)
+    update_article_page_id(article, mw_page_id)
+  end
+
+  def article_data_matches?(article, title, deleted_page_ids)
+    return false if article.nil?
+    return false unless deleted_page_ids.include?(article.mw_page_id)
+    # This catches false positives when the query for page_title matches
+    # a case variant.
+    return false unless article.title == title
+    true
+  end
+
+  def update_article_page_id(article, mw_page_id)
+    if Article.exists?(mw_page_id:, wiki_id: @wiki.id)
+      # Catches case where update_constantly has
+      # already added this article under a new ID
+      article.update(deleted: true)
+    else
+      article.update(mw_page_id:)
+    end
+  end
+
+  def find_article_by_mw_page_id(mw_page_id)
+    article = Article.find_by(wiki_id: @wiki.id, mw_page_id:, deleted: false)
+    article ||= Article.find_by(wiki_id: @wiki.id, mw_page_id:)
+    article
+  end
+end