From a7179c2f4b7a0f0a2b7ccd1ee64039241d318576 Mon Sep 17 00:00:00 2001 From: gabina Date: Sat, 4 Jan 2025 02:16:06 -0300 Subject: [PATCH] Add new ArticleStatusManagerTimeslice and specs based on existing ones --- lib/article_status_manager_timeslice.rb | 238 ++++++++++ .../article_status_manager_timeslice_spec.rb | 407 ++++++++++++++++++ 2 files changed, 645 insertions(+) create mode 100644 lib/article_status_manager_timeslice.rb create mode 100644 spec/lib/article_status_manager_timeslice_spec.rb diff --git a/lib/article_status_manager_timeslice.rb b/lib/article_status_manager_timeslice.rb new file mode 100644 index 0000000000..b6c2ad6371 --- /dev/null +++ b/lib/article_status_manager_timeslice.rb @@ -0,0 +1,238 @@ +# frozen_string_literal: true + +require_dependency "#{Rails.root}/lib/articles_courses_cleaner_timeslice" +require_dependency "#{Rails.root}/lib/assignment_updater" + +#= Updates articles to reflect deletion and page moves on Wikipedia +# This class is responsible for identifying articles that: +# * Were deleted or untracked: These are articles that were either deleted or +# moved to a namespace not traceable by the course. Such articles should be +# excluded from course statistics. +# * Were restored or re-tracked: These are articles that were either restored +# or moved to a namespace relevant to the course. Such articles should be +# included in course statistics. + +class ArticleStatusManagerTimeslice + def initialize(course, wiki = nil) + @course = course + @wiki = wiki || Wiki.default_wiki + end + + ################ + # Entry points # + ################ + + def self.update_article_status_for_course(course) + course.wikis.each do |wiki| + # Determine articles based on ac timeslices + course.articles_from_timeslices(wiki.id) + # we should determine articles based on ac timeslices + # .where(wiki_id: wiki.id) + # Updating only those articles which are updated more than 1 day ago + .where('articles.updated_at < ?', 1.day.ago) + .in_batches do |article_batch| + # Using in_batches so that the update_at of all articles in the batch can be + # excuted in a single query, otherwise if we use find_in_batches, query for + # each article for updating the same would be required + new(course, wiki).update_status(article_batch) + # rubocop:disable Rails/SkipsModelValidations + article_batch.touch_all(:updated_at) + # rubocop:enable Rails/SkipsModelValidations + end + end + + ArticlesCoursesCleanerTimeslice.clean_articles_courses_for_deleted_or_untracked_articles(course) + ArticlesCoursesCleanerTimeslice.clean_articles_courses_for_undeleted_or_retracked_articles(course) + end + + #################### + # Per-wiki methods # + #################### + + def update_status(articles) + # This is used for problem cases where articles can't be easily disambiguated + # because of duplicate records with the same mw_page_id. It's only used if + # `articles` is just one record. + @article = articles.first if articles.one? + + identify_deleted_and_synced_page_ids(articles) + + # First we find any pages that just moved, and update title and namespace. + update_title_and_namespace @synced_articles + + # Now we check for pages that have changed mw_page_ids. + # This happens in situations such as history merges. + # If articles move in between title/namespace updates and mw_page_id updates, + # then it's possible to end up with inconsistent data. + update_article_ids @deleted_page_ids + + # Mark articles as deleted as appropriate + update_deleted_articles(articles) + move_timeslices_for_deleted_articles(articles) + end + + ################## + # Helper methods # + ################## + private + + def identify_deleted_and_synced_page_ids(articles) + @synced_articles = article_data_from_replica(articles) + @synced_ids = @synced_articles.map { |a| a['page_id'].to_i } + + # If any Replica requests failed, we don't want to assume that missing + # articles are deleted. + # FIXME: A better approach would be to look for deletion logs, and only mark + # an article as deleted if there is a corresponding deletion log. + @deleted_page_ids = if @failed_request_count.zero? + articles.map(&:mw_page_id) - @synced_ids + else + [] + end + end + + def article_data_from_replica(articles) + @failed_request_count = 0 + synced_articles = Utils.chunk_requests(articles, 100) do |block| + request_results = Replica.new(@wiki).get_existing_articles_by_id block + @failed_request_count += 1 if request_results.nil? + request_results + end + synced_articles + end + + def update_title_and_namespace(synced_articles) + # Update titles and namespaces based on mw_page_ids + synced_articles.each do |article_data| + article = @article || find_article_by_mw_page_id(article_data['page_id']) + next if data_matches_article?(article_data, article) + + # FIXME: Workaround for four-byte unicode characters in article titles, + # until we fix the database to handle them. + # https://github.com/WikiEducationFoundation/WikiEduDashboard/issues/1744 + # These titles are saved as their URL-encoded equivalents. + next if article.title[0] == '%' + + begin + article.update!(title: article_data['page_title'], + namespace: article_data['page_namespace'], + deleted: false) + # If namespace changed or the article gets undeleted, I had to re-reprocess timeslices for + # that article + # Call timeslice_manager.update_timeslices_that_need_update_from_article_timeslices( + # timeslices + # ) + # Find corresponding Assignment records and update the titles + AssignmentUpdater.update_assignments_for_article(article) + rescue ActiveRecord::RecordNotUnique => e + # If we reach this point, it's most likely that @article has been set. This only + # happens when update_status is invoked with a single article, which probably indicates + # it was called from a cleanup script. In this case, we consider @article as + # a duplicate article record, so we re-process timeslices for it. + + # if this is a duplicate article record, moving the revisions to the non-deleted + # copy should prevent it from being part of a future update. + # NOTE: ActiveRecord::RecordNotUnique is a subtype of ActiveRecord::StatementInvalid + # so this rescue comes first. + + # I don't know what to do here + handle_undeletion(article) + Sentry.capture_exception e, level: 'warning' + rescue ActiveRecord::StatementInvalid => e # workaround for 4-byte unicode errors + Sentry.capture_exception e + end + end + end + + def update_deleted_articles(articles) + return unless @failed_request_count.zero? + articles.each do |article| + next unless @deleted_page_ids.include? article.mw_page_id + # Reload to account for articles that have had their mw_page_id changed + # because the page was moved rather than deleted. + next unless @deleted_page_ids.include? article.reload.mw_page_id + article.update(deleted: true) + end + end + + # If an article sets as deleted has a sync mw page id, then mark the timeslices + # as needs_update + def move_timeslices_for_deleted_articles(articles) + articles.filter(&:deleted).each do |article| + next unless @synced_ids.include? article.mw_page_id + handle_undeletion(article) + end + end + + def handle_undeletion(article) + # If there's already a non-deleted copy, we need to reprocess the timeslices for article + nondeleted_article = Article.find_by(wiki_id: @wiki.id, + mw_page_id: article.mw_page_id, deleted: false) + # If there is only one copy of the article, it was already found and updated + # via `update_title_and_namespace` + return unless nondeleted_article + ArticlesCoursesCleanerTimeslice.clean_articles(@course, [article]) + # Mark all ac timeslices for article as needs_update + # Remove ac timeslices for article + # remove ac for article (if it exists) + end + + def data_matches_article?(article_data, article) + return false unless article.title == article_data['page_title'] + return false unless article.namespace == article_data['page_namespace'].to_i + # If article data is collected from Replica, the article is not currently deleted + return false if article.deleted + true + end + + # Check whether any deleted pages still exist with a different article_id. + # If so, update the Article to use the new id. + def update_article_ids(deleted_page_ids) + maybe_deleted = Article.where(mw_page_id: deleted_page_ids, wiki_id: @wiki.id) + return if maybe_deleted.empty? + # These pages have titles that match Articles in our DB with deleted ids + request_results = Replica.new(@wiki).post_existing_articles_by_title maybe_deleted + @failed_request_count += 1 if request_results.nil? + + # Update articles whose IDs have changed (keyed on title and namespace) + request_results&.each do |stp| + resolve_page_id(stp, deleted_page_ids) + end + end + + def resolve_page_id(same_title_page, deleted_page_ids) + title = same_title_page['page_title'] + mw_page_id = same_title_page['page_id'] + namespace = same_title_page['page_namespace'] + + article = Article.find_by(wiki_id: @wiki.id, title:, namespace:, deleted: false) + + return unless article_data_matches?(article, title, deleted_page_ids) + update_article_page_id(article, mw_page_id) + end + + def article_data_matches?(article, title, deleted_page_ids) + return false if article.nil? + return false unless deleted_page_ids.include?(article.mw_page_id) + # This catches false positives when the query for page_title matches + # a case variant. + return false unless article.title == title + true + end + + def update_article_page_id(article, mw_page_id) + if Article.exists?(mw_page_id:, wiki_id: @wiki.id) + # Catches case where update_constantly has + # already added this article under a new ID + article.update(deleted: true) + else + article.update(mw_page_id:) + end + end + + def find_article_by_mw_page_id(mw_page_id) + article = Article.find_by(wiki_id: @wiki.id, mw_page_id:, deleted: false) + article ||= Article.find_by(wiki_id: @wiki.id, mw_page_id:) + article + end +end diff --git a/spec/lib/article_status_manager_timeslice_spec.rb b/spec/lib/article_status_manager_timeslice_spec.rb new file mode 100644 index 0000000000..843f83690f --- /dev/null +++ b/spec/lib/article_status_manager_timeslice_spec.rb @@ -0,0 +1,407 @@ +# frozen_string_literal: true + +require 'rails_helper' +require "#{Rails.root}/lib/article_status_manager_timeslice" + +describe ArticleStatusManagerTimeslice do + before { stub_wiki_validation } + + # CHANGE THIS + # For update_article_status_for_course, updated_at: 2.days.ago is used + # because ArticleStatusManager updates articles updated more than 1 day ago + # For update_status, it is not required, because it does not implement that logic + + let(:course) { create(:course, start: 1.year.ago, end: 1.year.from_now) } + let(:user) { create(:user) } + let(:wiki) { course.home_wiki } + let!(:courses_user) { create(:courses_user, course:, user:) } + + describe '.update_article_status_for_course' do + it 'marks deleted articles as "deleted"' do + VCR.use_cassette 'article_status_manager/main' do + create(:article, + id: 1, + mw_page_id: 1, + title: 'Noarticle', + namespace: 0, + updated_at: 2.days.ago) + create(:articles_course, course:, article_id: 1) + create(:course_wiki_timeslice, course:, wiki:, start: 2.days.ago.beginning_of_day, + end: 1.day.ago.beginning_of_day) + create(:article_course_timeslice, course:, article_id: 1, + start: 2.days.ago.beginning_of_day, end: 1.day.ago.beginning_of_day) + + described_class.update_article_status_for_course(course) + expect(Article.find(1).deleted).to be true + # It also deletes articles courses, timeslices and set them to reprocess + expect(course.articles_courses.count).to eq(0) + expect(course.article_course_timeslices.count).to eq(0) + expect(course.course_wiki_timeslices.first.needs_update).to eq(true) + end + end + + it 'updates the mw_page_ids of articles' do + VCR.use_cassette 'article_status_manager/mw_page_ids' do + # en.wikipedia - article 100 does not exist + create(:article, + id: 100, + mw_page_id: 100, + title: 'Audi', + namespace: 0, + wiki_id: wiki.id, + updated_at: 2.days.ago) + create(:articles_course, course:, article_id: 100) + create(:article_course_timeslice, course:, article_id: 100, + start: 2.days.ago.beginning_of_day, end: 1.day.ago.beginning_of_day) + + # es.wikipedia - article 1 does not exist + course.wikis << create(:wiki, id: 2, language: 'es', project: 'wikipedia') + create(:article, + id: 1, + mw_page_id: 1, + title: 'Audi', + namespace: 0, + wiki_id: 2, + updated_at: 2.days.ago) + create(:articles_course, course:, article_id: 1) + create(:article_course_timeslice, course:, article_id: 1, + start: 2.days.ago.beginning_of_day, end: 1.day.ago.beginning_of_day) + + described_class.update_article_status_for_course(course) + + expect(Article.find_by(title: 'Audi', wiki_id: wiki.id).mw_page_id).to eq(848) + expect(Article.find_by(title: 'Audi', wiki_id: 2).mw_page_id).to eq(4976786) + end + end + + it 'deletes articles when id changed but new one already exists' do + VCR.use_cassette 'article_status_manager/deleted_new_exists' do + create(:article, + id: 100, + mw_page_id: 100, + title: 'Audi', + namespace: 0, + updated_at: 2.days.ago) + create(:articles_course, course:, article_id: 100) + create(:article_course_timeslice, course:, article_id: 100, + start: 2.days.ago.beginning_of_day, end: 1.day.ago.beginning_of_day) + create(:course_wiki_timeslice, course:, wiki:, start: 2.days.ago.beginning_of_day, + end: 1.day.ago.beginning_of_day) + create(:article, + id: 848, + mw_page_id: 848, + title: 'Audi', + namespace: 0) + create(:articles_course, course:, article_id: 848) + create(:article_course_timeslice, course:, article_id: 848, + start: 3.days.ago.beginning_of_day, end: 2.days.ago.beginning_of_day) + create(:course_wiki_timeslice, course:, wiki:, start: 3.days.ago.beginning_of_day, + end: 2.days.ago.beginning_of_day) + + described_class.update_article_status_for_course(course) + expect(Article.find_by(mw_page_id: 100).deleted).to eq(true) + expect(Article.find_by(mw_page_id: 848).deleted).to eq(false) + # It also deletes articles courses, timeslices and set them to reprocess + expect(course.articles_courses.first.article_id).to eq(848) + expect(course.article_course_timeslices.first.article_id).to eq(848) + expect(course.course_wiki_timeslices.first.needs_update).to eq(true) + expect(course.course_wiki_timeslices.second.needs_update).to eq(false) + end + end + + it 'updates the namespace and titles when articles are moved' do + VCR.use_cassette 'article_status_manager/main' do + create(:article, + id: 848, + mw_page_id: 848, + title: 'Audi_Cars', # 'Audi' is the actual title + namespace: 2, + updated_at: 2.days.ago) + create(:article_course_timeslice, course:, article_id: 848, + start: 2.days.ago.beginning_of_day, end: 1.day.ago.beginning_of_day) + + described_class.update_article_status_for_course(course) + expect(Article.find(848).namespace).to eq(0) + expect(Article.find(848).title).to eq('Audi') + end + end + + it 'handles cases with deleted and nondeleted copies of an article' do + create(:article, + id: 53001516, + mw_page_id: 66653200, + title: 'Port_of_Spain_Gazette', + updated_at: 2.days.ago) + create(:article, + id: 53058287, + mw_page_id: 66653200, + title: 'Port_of_Spain_Gazette', + deleted: true, + updated_at: 2.days.ago) + create(:articles_course, course:, article_id: 53001516) + create(:article_course_timeslice, course:, article_id: 53001516, + start: 2.days.ago.beginning_of_day, end: 1.day.ago.beginning_of_day) + create(:articles_course, course:, article_id: 53058287) + create(:article_course_timeslice, course:, article_id: 53058287, + start: 3.days.ago.beginning_of_day, end: 2.days.ago.beginning_of_day) + # Create course wiki timeslices + TimesliceManager.new(course).create_timeslices_for_new_course_wiki_records([course.home_wiki]) + + VCR.use_cassette 'article_status_manager/undeletion_duplicate' do + described_class.update_article_status_for_course(course) + end + expect(course.articles_courses.count).to eq(1) + expect(course.article_course_timeslices.count).to eq(1) + expect(course.course_wiki_timeslices.find_by(start: 3.days.ago.beginning_of_day).needs_update).to eq(true) + end + + it 'handles undeleted articles' do + create(:article, + id: 53058287, + mw_page_id: 66653200, + title: 'Port_of_Spain_Gazette', + deleted: true, + updated_at: 2.days.ago) + create(:article_course_timeslice, course:, article_id: 53058287, + start: 2.days.ago.beginning_of_day, end: 1.day.ago.beginning_of_day) + # Create course wiki timeslices + TimesliceManager.new(course).create_timeslices_for_new_course_wiki_records([course.home_wiki]) + + VCR.use_cassette 'article_status_manager/undeletion' do + described_class.update_article_status_for_course(course) + end + expect(Article.find(53058287).deleted).to eq(false) + expect(course.course_wiki_timeslices.find_by(start: 2.days.ago.beginning_of_day).needs_update).to eq(true) + end + + context 'when a title is a unicode dump' do + let(:zh_wiki) { create(:wiki, language: 'zh', project: 'wikipedia') } + # https://zh.wikipedia.org/wiki/%E9%BB%83%F0%A8%A5%88%E7%91%A9 + let(:title) { CGI.escape('黃𨥈瑩') } + let(:article) { create(:article, wiki: zh_wiki, title:, mw_page_id: 420741) } + + it 'skips updates when the title is a unicode dumps' do + stub_wiki_validation + VCR.use_cassette 'article_status_manager/unicode_dump' do + described_class.new(course, zh_wiki).update_status([article]) + expect(Article.last.title).to eq(title) + end + end + end + + it 'handles SQL errors gracefully' do + expect_any_instance_of(Article).to receive(:update!).and_raise(ActiveRecord::StatementInvalid) + VCR.use_cassette 'article_status_manager/errors' do + article = create(:article, title: 'Selfeeee', mw_page_id: 38956275) + described_class.new(course).update_status([article]) + end + end + + it 'handles cases of space vs. underscore' do + VCR.use_cassette 'article_status_manager/main' do + # This page was first moved from a sandbox to "Yōji Sakate", then + # moved again to "Yōji Sakate (playwright)". It ended up in our database + # like this. + create(:article, + id: 46745170, + mw_page_id: 46745170, + # Currently this is a redirect to the other title. + title: 'Yōji Sakate', + namespace: 0, + updated_at: 2.days.ago) + create(:article_course_timeslice, course:, article_id: 46745170, + start: 3.days.ago.beginning_of_day, end: 2.days.ago.beginning_of_day) + + create(:article, + id: 46364485, + mw_page_id: 46364485, + # Current title is "Yōji Sakate" as of 2016-07-06. + title: 'Yōji_Sakate', + namespace: 0, + updated_at: 2.days.ago) + create(:article_course_timeslice, course:, article_id: 46364485, + start: 3.days.ago.beginning_of_day, end: 2.days.ago.beginning_of_day) + + described_class.update_article_status_for_course(course) + end + end + + it 'handles case-variant titles' do + VCR.use_cassette 'article_status_manager/main' do + article1 = create(:article, + id: 3914927, + mw_page_id: 3914927, + title: 'Cyber-ethnography', + deleted: true, + namespace: 1, + updated_at: 2.days.ago) + create(:article_course_timeslice, course:, article_id: 3914927, + start: 3.days.ago.beginning_of_day, end: 2.days.ago.beginning_of_day) + article2 = create(:article, + id: 46394760, + mw_page_id: 46394760, + title: 'Cyber-Ethnography', + deleted: false, + namespace: 1, + updated_at: 2.days.ago) + create(:article_course_timeslice, course:, article_id: 46394760, + start: 3.days.ago.beginning_of_day, end: 2.days.ago.beginning_of_day) + + described_class.update_article_status_for_course(course) + expect(article1.mw_page_id).to eq(3914927) + expect(article2.mw_page_id).to eq(46394760) + end + end + + it 'does not delete articles by mistake if Replica is down' do + VCR.use_cassette 'article_status_manager/main' do + create(:article, + id: 848, + mw_page_id: 848, + title: 'Audi', + namespace: 0, + updated_at: 2.days.ago) + create(:article_course_timeslice, course:, article_id: 848, + start: 3.days.ago.beginning_of_day, end: 2.days.ago.beginning_of_day) + create(:article, + id: 1, + mw_page_id: 1, + title: 'Noarticle', + namespace: 0, + updated_at: 2.days.ago) + create(:article_course_timeslice, course:, article_id: 1, + start: 3.days.ago.beginning_of_day, end: 2.days.ago.beginning_of_day) + + allow_any_instance_of(Replica).to receive(:get_existing_articles_by_id).and_return(nil) + described_class.update_article_status_for_course(course) + expect(Article.find(848).deleted).to eq(false) + expect(Article.find(1).deleted).to eq(false) + end + end + + it 'does not delete articles by mistake if Replica goes right before trying to fetch titles' do + VCR.use_cassette 'article_status_manager/main' do + create(:article, + id: 848, + mw_page_id: 848, + title: 'Audi', + namespace: 0, + updated_at: 2.days.ago) + create(:article_course_timeslice, course:, article_id: 848, + start: 3.days.ago.beginning_of_day, end: 2.days.ago.beginning_of_day) + create(:article, + id: 1, + mw_page_id: 1, + title: 'Noarticle', + namespace: 0, + updated_at: 2.days.ago) + create(:article_course_timeslice, course:, article_id: 1, + start: 3.days.ago.beginning_of_day, end: 2.days.ago.beginning_of_day) + + allow_any_instance_of(Replica).to receive(:post_existing_articles_by_title).and_return(nil) + described_class.update_article_status_for_course(course) + expect(Article.find(848).deleted).to eq(false) + expect(Article.find(1).deleted).to eq(false) + end + end + + it 'marks an undeleted article as not deleted' do + VCR.use_cassette 'article_status_manager/main' do + create(:article, + id: 50661367, + mw_page_id: 52228477, + title: 'Antiochis_of_Tlos', + namespace: 0, + deleted: true, + updated_at: 2.days.ago) + create(:article_course_timeslice, course:, article_id: 50661367, + start: 3.days.ago.beginning_of_day, end: 2.days.ago.beginning_of_day) + # Create course wiki timeslices + TimesliceManager.new(course).create_timeslices_for_new_course_wiki_records([course.home_wiki]) + described_class.update_article_status_for_course(course) + expect(Article.find(50661367).deleted).to eq(false) + expect(course.course_wiki_timeslices.find_by(start: 3.days.ago.beginning_of_day).needs_update).to eq(true) + end + end + + it 'updates if article updated more than 1 day ago' do + VCR.use_cassette 'article_status_manager/main' do + create(:article, + id: 50661367, + mw_page_id: 52228477, + title: 'Antiochis_of_Tlos', + namespace: 0, + updated_at: 2.days.ago) + create(:article_course_timeslice, course:, article_id: 50661367, + start: 3.days.ago.beginning_of_day, end: 2.days.ago.beginning_of_day) + described_class.update_article_status_for_course(course) + expect(Article.find(50661367).updated_at > 30.seconds.ago).to eq(true) + end + end + + it 'does not update if article updated less than 1 day ago' do + VCR.use_cassette 'article_status_manager/main' do + create(:article, + id: 50661367, + mw_page_id: 52228477, + title: 'Antiochis_of_Tlos', + namespace: 0, + updated_at: 12.hours.ago) + create(:article_course_timeslice, course:, article_id: 50661367, + start: 3.days.ago.beginning_of_day, end: 2.days.ago.beginning_of_day) + described_class.update_article_status_for_course(course) + expect(Article.find(50661367).updated_at <= 12.hours.ago).to eq(true) + end + end + end + + describe '#update_status' do + context 'when passed a single article' do + let!(:first_article) do + create(:article, + title: 'Homosexuality_in_modern_sports', + mw_page_id: 26788997, + namespace: 0) + end + + let!(:article_to_update) do + build(:article, + title: 'Homosexuality_in_Modern_Sports', + mw_page_id: 26788997, + deleted: true, + namespace: 0) + Article.last + end + + it 'updates that article and not another with the same mw_page_id' do + VCR.use_cassette 'article_status_manager/duplicate_mw_page_ids' do + described_class.new(course).update_status([article_to_update]) + end + expect(article_to_update.reload.title).to eq('Homosexuality_in_modern_sports') + end + + it 'moves revisions after mw_page_id collisions with an undeleted article' do + deleted_article = create(:article, mw_page_id: 26788997, deleted: true) + create(:articles_course, course:, article: deleted_article) + VCR.use_cassette 'article_status_manager/duplicate_mw_page_ids' do + described_class.new(course).update_status([deleted_article]) + end + expect(deleted_article.revisions.count).to eq(0) + end + + it 'updates associated Assignment records with the new title' do + VCR.use_cassette 'article_status_manager/assignments' do + article = create(:article, mw_page_id: 848, + title: 'Audi_Cars', # 'Audi' is the actual title + namespace: 2, + updated_at: 2.days.ago) + assignment = create(:assignment, article_title: 'Audi_Cars', + article:, + course:) + described_class.new(course).update_status([article]) + expect(assignment.reload.article_title).to eq('Audi') + end + end + end + end +end