Skip to content

Commit

Permalink
Add new ArticleStatusManagerTimeslice and specs based on existing ones
Browse files Browse the repository at this point in the history
  • Loading branch information
gabina committed Jan 4, 2025
1 parent f6209d5 commit a7179c2
Show file tree
Hide file tree
Showing 2 changed files with 645 additions and 0 deletions.
238 changes: 238 additions & 0 deletions lib/article_status_manager_timeslice.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,238 @@
# frozen_string_literal: true

require_dependency "#{Rails.root}/lib/articles_courses_cleaner_timeslice"
require_dependency "#{Rails.root}/lib/assignment_updater"

#= Updates articles to reflect deletion and page moves on Wikipedia
# This class is responsible for identifying articles that:
# * Were deleted or untracked: These are articles that were either deleted or
# moved to a namespace not traceable by the course. Such articles should be
# excluded from course statistics.
# * Were restored or re-tracked: These are articles that were either restored
# or moved to a namespace relevant to the course. Such articles should be
# included in course statistics.

class ArticleStatusManagerTimeslice
def initialize(course, wiki = nil)
@course = course
@wiki = wiki || Wiki.default_wiki
end

################
# Entry points #
################

def self.update_article_status_for_course(course)
course.wikis.each do |wiki|
# Determine articles based on ac timeslices
course.articles_from_timeslices(wiki.id)
# we should determine articles based on ac timeslices
# .where(wiki_id: wiki.id)
# Updating only those articles which are updated more than 1 day ago
.where('articles.updated_at < ?', 1.day.ago)
.in_batches do |article_batch|
# Using in_batches so that the update_at of all articles in the batch can be
# excuted in a single query, otherwise if we use find_in_batches, query for
# each article for updating the same would be required
new(course, wiki).update_status(article_batch)
# rubocop:disable Rails/SkipsModelValidations
article_batch.touch_all(:updated_at)
# rubocop:enable Rails/SkipsModelValidations
end
end

ArticlesCoursesCleanerTimeslice.clean_articles_courses_for_deleted_or_untracked_articles(course)
ArticlesCoursesCleanerTimeslice.clean_articles_courses_for_undeleted_or_retracked_articles(course)
end

####################
# Per-wiki methods #
####################

def update_status(articles)
# This is used for problem cases where articles can't be easily disambiguated
# because of duplicate records with the same mw_page_id. It's only used if
# `articles` is just one record.
@article = articles.first if articles.one?

identify_deleted_and_synced_page_ids(articles)

# First we find any pages that just moved, and update title and namespace.
update_title_and_namespace @synced_articles

# Now we check for pages that have changed mw_page_ids.
# This happens in situations such as history merges.
# If articles move in between title/namespace updates and mw_page_id updates,
# then it's possible to end up with inconsistent data.
update_article_ids @deleted_page_ids

# Mark articles as deleted as appropriate
update_deleted_articles(articles)
move_timeslices_for_deleted_articles(articles)
end

##################
# Helper methods #
##################
private

def identify_deleted_and_synced_page_ids(articles)
@synced_articles = article_data_from_replica(articles)
@synced_ids = @synced_articles.map { |a| a['page_id'].to_i }

# If any Replica requests failed, we don't want to assume that missing
# articles are deleted.
# FIXME: A better approach would be to look for deletion logs, and only mark
# an article as deleted if there is a corresponding deletion log.
@deleted_page_ids = if @failed_request_count.zero?
articles.map(&:mw_page_id) - @synced_ids
else
[]
end
end

def article_data_from_replica(articles)
@failed_request_count = 0
synced_articles = Utils.chunk_requests(articles, 100) do |block|
request_results = Replica.new(@wiki).get_existing_articles_by_id block
@failed_request_count += 1 if request_results.nil?
request_results
end
synced_articles
end

def update_title_and_namespace(synced_articles)
# Update titles and namespaces based on mw_page_ids
synced_articles.each do |article_data|
article = @article || find_article_by_mw_page_id(article_data['page_id'])
next if data_matches_article?(article_data, article)

# FIXME: Workaround for four-byte unicode characters in article titles,
# until we fix the database to handle them.
# https://github.com/WikiEducationFoundation/WikiEduDashboard/issues/1744
# These titles are saved as their URL-encoded equivalents.
next if article.title[0] == '%'

begin
article.update!(title: article_data['page_title'],
namespace: article_data['page_namespace'],
deleted: false)
# If namespace changed or the article gets undeleted, I had to re-reprocess timeslices for
# that article
# Call timeslice_manager.update_timeslices_that_need_update_from_article_timeslices(
# timeslices
# )
# Find corresponding Assignment records and update the titles
AssignmentUpdater.update_assignments_for_article(article)
rescue ActiveRecord::RecordNotUnique => e
# If we reach this point, it's most likely that @article has been set. This only
# happens when update_status is invoked with a single article, which probably indicates
# it was called from a cleanup script. In this case, we consider @article as
# a duplicate article record, so we re-process timeslices for it.

# if this is a duplicate article record, moving the revisions to the non-deleted
# copy should prevent it from being part of a future update.
# NOTE: ActiveRecord::RecordNotUnique is a subtype of ActiveRecord::StatementInvalid
# so this rescue comes first.

# I don't know what to do here
handle_undeletion(article)
Sentry.capture_exception e, level: 'warning'
rescue ActiveRecord::StatementInvalid => e # workaround for 4-byte unicode errors
Sentry.capture_exception e
end
end
end

def update_deleted_articles(articles)
return unless @failed_request_count.zero?
articles.each do |article|
next unless @deleted_page_ids.include? article.mw_page_id
# Reload to account for articles that have had their mw_page_id changed
# because the page was moved rather than deleted.
next unless @deleted_page_ids.include? article.reload.mw_page_id
article.update(deleted: true)
end
end

# If an article sets as deleted has a sync mw page id, then mark the timeslices
# as needs_update
def move_timeslices_for_deleted_articles(articles)
articles.filter(&:deleted).each do |article|
next unless @synced_ids.include? article.mw_page_id
handle_undeletion(article)
end
end

def handle_undeletion(article)
# If there's already a non-deleted copy, we need to reprocess the timeslices for article
nondeleted_article = Article.find_by(wiki_id: @wiki.id,
mw_page_id: article.mw_page_id, deleted: false)
# If there is only one copy of the article, it was already found and updated
# via `update_title_and_namespace`
return unless nondeleted_article
ArticlesCoursesCleanerTimeslice.clean_articles(@course, [article])
# Mark all ac timeslices for article as needs_update
# Remove ac timeslices for article
# remove ac for article (if it exists)
end

def data_matches_article?(article_data, article)
return false unless article.title == article_data['page_title']
return false unless article.namespace == article_data['page_namespace'].to_i
# If article data is collected from Replica, the article is not currently deleted
return false if article.deleted
true
end

# Check whether any deleted pages still exist with a different article_id.
# If so, update the Article to use the new id.
def update_article_ids(deleted_page_ids)
maybe_deleted = Article.where(mw_page_id: deleted_page_ids, wiki_id: @wiki.id)
return if maybe_deleted.empty?
# These pages have titles that match Articles in our DB with deleted ids
request_results = Replica.new(@wiki).post_existing_articles_by_title maybe_deleted
@failed_request_count += 1 if request_results.nil?

# Update articles whose IDs have changed (keyed on title and namespace)
request_results&.each do |stp|
resolve_page_id(stp, deleted_page_ids)
end
end

def resolve_page_id(same_title_page, deleted_page_ids)
title = same_title_page['page_title']
mw_page_id = same_title_page['page_id']
namespace = same_title_page['page_namespace']

article = Article.find_by(wiki_id: @wiki.id, title:, namespace:, deleted: false)

return unless article_data_matches?(article, title, deleted_page_ids)
update_article_page_id(article, mw_page_id)
end

def article_data_matches?(article, title, deleted_page_ids)
return false if article.nil?
return false unless deleted_page_ids.include?(article.mw_page_id)
# This catches false positives when the query for page_title matches
# a case variant.
return false unless article.title == title
true
end

def update_article_page_id(article, mw_page_id)
if Article.exists?(mw_page_id:, wiki_id: @wiki.id)
# Catches case where update_constantly has
# already added this article under a new ID
article.update(deleted: true)
else
article.update(mw_page_id:)
end
end

def find_article_by_mw_page_id(mw_page_id)
article = Article.find_by(wiki_id: @wiki.id, mw_page_id:, deleted: false)
article ||= Article.find_by(wiki_id: @wiki.id, mw_page_id:)
article
end
end
Loading

0 comments on commit a7179c2

Please sign in to comment.