From 15de8a94d1a6df8096c1a334441b6bc6599a0335 Mon Sep 17 00:00:00 2001 From: geoffreychen777 Date: Mon, 26 Feb 2024 22:07:30 +0000 Subject: [PATCH] Enhance: scraping logic for papers from feeds. --- app/renderer/services/command-service.ts | 9 ++++ app/renderer/services/feed-service.ts | 12 ++---- app/renderer/services/paper-service.ts | 52 ++++++++++++++++-------- 3 files changed, 47 insertions(+), 26 deletions(-) diff --git a/app/renderer/services/command-service.ts b/app/renderer/services/command-service.ts index 106fdf80..3608da7e 100644 --- a/app/renderer/services/command-service.ts +++ b/app/renderer/services/command-service.ts @@ -92,6 +92,15 @@ export class CommandService extends Eventable<{}> { }); }, }); + + this.register({ + id: "scrape_preprints", + description: "Scrape metadata for all preprint papers in the library.", + priority: 99996, + handler: () => { + paperService.scrapePreprint(); + }, + }); } @errorcatching("Failed to register command.", true, "CommandService") diff --git a/app/renderer/services/feed-service.ts b/app/renderer/services/feed-service.ts index 717b5eb3..5fc42631 100644 --- a/app/renderer/services/feed-service.ts +++ b/app/renderer/services/feed-service.ts @@ -461,19 +461,15 @@ export class FeedService extends Eventable { "Feed" ); - const paperEntityDrafts = await this._scrapeService.scrape( - feedEntities.map((feedEntityDraft: IFeedEntityObject) => { + const paperEntityDrafts = feedEntities.map( + (feedEntityDraft: IFeedEntityObject) => { const paperEntityDraft = new PaperEntity({}, true).fromFeed( feedEntityDraft ); // NOTE: we don't want to download the PDFs when adding to library. paperEntityDraft.mainURL = ""; - return { - type: "PaperEntity", - value: paperEntityDraft, - }; - }), - ["semanticscholar"] + return paperEntityDraft; + } ); // NOTE: here we decide to not download the PDFs when adding to library. 
diff --git a/app/renderer/services/paper-service.ts b/app/renderer/services/paper-service.ts index ab3beb77..6af08735 100644 --- a/app/renderer/services/paper-service.ts +++ b/app/renderer/services/paper-service.ts @@ -180,7 +180,7 @@ export class PaperService extends Eventable { this._schedulerService.createTask( "paperServiceScrapePreprint", () => { - this.scrapePreprint(); + this._routineScrapePreprint(); }, 7 * 86400, undefined, @@ -601,6 +601,38 @@ export class PaperService extends Eventable { "PaperService" ) async scrapePreprint() { + if (this._databaseCore.getState("dbInitializing")) { + return; + } + this._logService.info( + `Scraping metadata of preprint paper(s)...`, + "", + true, + "PaperService" + ); + const preprintPaperEntities = this._paperEntityRepository.load( + await this._databaseCore.realm(), + '(publication contains[c] "arXiv") OR (publication contains[c] "openreview") OR publication == ""', + "addTime", + "desc" + ); + await this.scrape( + preprintPaperEntities.map((paperEntity) => { + return new PaperEntity(paperEntity); + }) + ); + } + + /** + * Scheduled variant of scrapePreprint(): runs the scrape, then records lastRematchTime. + */ + @processing(ProcessingKey.General) + @errorcatching( + "Failed to scrape metadata of preprints.", + true, + "PaperService" + ) + async _routineScrapePreprint() { if (this._databaseCore.getState("dbInitializing")) { return; } @@ -612,23 +644,7 @@ export class PaperService extends Eventable { ) { return; } - this._logService.info( - `Scraping metadata of preprint paper(s)...`, - "", - true, - "PaperService" - ); - const preprintPaperEntities = this._paperEntityRepository.load( - await this._databaseCore.realm(), - '(publication contains[c] "arXiv") OR (publication contains[c] "openreview") OR publication == ""', - "addTime", - "desc" - ); - await this.scrape( - preprintPaperEntities.map((paperEntity) => { - return new PaperEntity(paperEntity); - }) - ); + await this.scrapePreprint(); this._preferenceService.set({ lastRematchTime: Math.round(Date.now() / 1000), });