From 2627b30858e12ec2ae508d2260afba52b888fef2 Mon Sep 17 00:00:00 2001
From: boreq
Date: Thu, 31 Aug 2023 15:25:01 +0200
Subject: [PATCH] Add causes support to rsslay

Unfortunately this approach is less than ideal and makes us perform ~90
requests to get the articles.
---
 pkg/feed/downloader.go |  38 ++++++++
 pkg/feed/feed.go       | 212 ++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 248 insertions(+), 2 deletions(-)
 create mode 100644 pkg/feed/downloader.go

diff --git a/pkg/feed/downloader.go b/pkg/feed/downloader.go
new file mode 100644
index 0000000..ef3fabb
--- /dev/null
+++ b/pkg/feed/downloader.go
@@ -0,0 +1,38 @@
+package feed
+
+import (
+	"context"
+	"fmt"
+	"io"
+	"net/http"
+	"time"
+)
+
+type Downloader struct {
+}
+
+func NewDownloader() *Downloader {
+	return &Downloader{}
+}
+
+// Download performs a GET request for url and returns the response body; the
+// caller is responsible for closing it. Non-2xx statuses are returned as errors.
+func (*Downloader) Download(url string) (io.ReadCloser, error) {
+	client := http.Client{Timeout: 30 * time.Second}
+
+	req, err := http.NewRequestWithContext(context.Background(), "GET", url, nil)
+	if err != nil {
+		return nil, err
+	}
+	req.Header.Set("User-Agent", "rsslay")
+
+	resp, err := client.Do(req)
+	if err != nil {
+		return nil, err
+	}
+
+	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
+		resp.Body.Close()
+		return nil, fmt.Errorf("http error %d", resp.StatusCode)
+	}
+
+	return resp.Body, nil
+}
diff --git a/pkg/feed/feed.go b/pkg/feed/feed.go
index 87ced8a..a3de372 100644
--- a/pkg/feed/feed.go
+++ b/pkg/feed/feed.go
@@ -1,6 +1,7 @@
 package feed
 
 import (
+	"context"
 	"crypto/hmac"
 	"crypto/sha256"
 	"database/sql"
@@ -10,6 +11,7 @@ import (
 	"fmt"
 	"log"
 	"net/http"
+	"strconv"
 	"strings"
 	"time"
 
@@ -22,6 +24,11 @@ import (
 	"github.com/prometheus/client_golang/prometheus"
 )
 
+const (
+	causesLink       = "https://www.causes.com/api/v2/articles?feed_id=recency"
+	causesNumWorkers = 10
+)
+
 var (
 	fp     = gofeed.NewParser()
 	client = &http.Client{
@@ -51,6 +58,10 @@ var types = []string{
 }
 
 func GetFeedURL(url string) string {
+	if url == causesLink {
+		return causesLink
+	}
+
 	resp, err := client.Get(url)
 	if err != nil || resp.StatusCode >= 300 {
 		return ""
@@ -102,8 +113,9 @@ func ParseFeed(url string) (*gofeed.Feed, error) {
 	}
 
 	metrics.CacheMiss.Inc()
-	fp.RSSTranslator = NewCustomTranslator()
-	feed, err := fp.ParseURL(url)
+
+	parser := getFeedParser(url)
+	feed, err := parser.Parse()
 	if err != nil {
 		return nil, err
 	}
@@ -121,6 +133,17 @@ func ParseFeed(url string) (*gofeed.Feed, error) {
 	return feed, nil
 }
 
+func getFeedParser(feedURL string) FeedParser {
+	downloader := NewDownloader()
+
+	switch feedURL {
+	case causesLink:
+		return NewCausesFeedParser(downloader, feedURL)
+	default:
+		return NewDefaultFeedParser(downloader, feedURL)
+	}
+}
+
 func EntryFeedToSetMetadata(pubkey string, feed *gofeed.Feed, originalUrl string, enableAutoRegistration bool, defaultProfilePictureUrl string, mainDomainName string) nostr.Event {
 	// Handle Nitter special cases (http schema)
 	if strings.Contains(feed.Description, "Twitter feed") {
@@ -193,3 +216,188 @@ func DeleteInvalidFeed(url string, db *sql.DB) {
 		log.Printf("[DEBUG] deleted invalid feed with url %q", url)
 	}
 }
+
+type FeedParser interface {
+	Parse() (*gofeed.Feed, error)
+}
+
+type DefaultFeedParser struct {
+	downloader *Downloader
+	url        string
+}
+
+func NewDefaultFeedParser(downloader *Downloader, url string) *DefaultFeedParser {
+	return &DefaultFeedParser{downloader: downloader, url: url}
+}
+
+func (d *DefaultFeedParser) Parse() (*gofeed.Feed, error) {
+	body, err := d.downloader.Download(d.url)
+	if err != nil {
+		return nil, err
+	}
+	defer body.Close()
+
+	fp.RSSTranslator = NewCustomTranslator()
+	return fp.Parse(body)
+}
+
+// causesResponseOrError carries either one decoded page of articles or the
+// error a worker hit while fetching it, so that results and failures travel
+// over a single channel.
+type causesResponseOrError struct {
+	Response causesResponse
+	Err      error
+}
+
+type CausesFeedParser struct {
+	downloader *Downloader
+	url        string
+}
+
+func NewCausesFeedParser(downloader *Downloader, url string) *CausesFeedParser {
+	return &CausesFeedParser{downloader: downloader, url: url}
+}
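+
+// Parse requests the feed once to learn the total number of pages from the
+// response metadata, then fans the page numbers out to causesNumWorkers
+// workers and collects exactly one result per page. One request per page is
+// where the ~90 requests mentioned in the commit message come from.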
+func (d *CausesFeedParser) Parse() (*gofeed.Feed, error) {
+	resp, err := d.get(d.url)
+	if err != nil {
+		return nil, err
+	}
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	chIn := make(chan int)
+	chOut := make(chan causesResponseOrError)
+
+	d.startWorkers(ctx, chIn, chOut, causesNumWorkers)
+
+	go func() {
+		defer close(chIn)
+
+		for i := 1; i <= resp.Meta.Pagination.TotalPages; i++ {
+			select {
+			case chIn <- i:
+				continue
+			case <-ctx.Done():
+				return
+			}
+		}
+	}()
+
+	feed := &gofeed.Feed{
+		Title:       "causes.com",
+		Description: "Causes - powered by Countable - makes it quick and easy to understand the laws Congress is considering.",
+		Link:        "https://www.causes.com/",
+		FeedLink:    causesLink,
+		Links:       nil,
+		Items:       nil,
+	}
+
+	for i := 1; i <= resp.Meta.Pagination.TotalPages; i++ {
+		select {
+		case result := <-chOut:
+			if err := result.Err; err != nil {
+				return nil, fmt.Errorf("worker error: %w", err)
+			}
+
+			for _, article := range result.Response.Articles {
+				article := article
+				item := d.itemFromArticle(article)
+				feed.Items = append(feed.Items, item)
+			}
+		case <-ctx.Done():
+			return nil, ctx.Err()
+		}
+	}
+
+	return feed, nil
+}
+
+func (d *CausesFeedParser) startWorkers(ctx context.Context, chIn <-chan int, chOut chan<- causesResponseOrError, n int) {
+	for i := 0; i < n; i++ {
+		go d.startWorker(ctx, chIn, chOut)
+	}
+}
+
+func (d *CausesFeedParser) startWorker(ctx context.Context, chIn <-chan int, chOut chan<- causesResponseOrError) {
+	for {
+		select {
+		case in, ok := <-chIn:
+			// Exit once the producer closes chIn; without the ok check the
+			// worker would spin on zero values after the channel is closed.
+			if !ok {
+				return
+			}
+
+			result, err := d.work(in)
+			if err != nil {
+				select {
+				case chOut <- causesResponseOrError{Err: err}:
+					continue
+				case <-ctx.Done():
+					return
+				}
+			}
+
+			select {
+			case chOut <- causesResponseOrError{Response: result}:
+				continue
+			case <-ctx.Done():
+				return
+			}
+		case <-ctx.Done():
+			return
+		}
+	}
+}
+
+func (d *CausesFeedParser) work(page int) (causesResponse, error) {
+	return d.get(fmt.Sprintf("%s&page=%d", d.url, page))
+}
+
+func (d *CausesFeedParser) get(url string) (causesResponse, error) {
+	var resp causesResponse
+
+	body, err := d.downloader.Download(url)
+	if err != nil {
+		return resp, err
+	}
+	defer body.Close()
+
+	if err := json.NewDecoder(body).Decode(&resp); err != nil {
+		return resp, err
+	}
+
+	return resp, nil
+}
+
+func (d *CausesFeedParser) itemFromArticle(article causesResponseArticle) *gofeed.Item {
+	return &gofeed.Item{
+		Title:           article.Title,
+		Content:         article.HtmlContent,
+		Link:            article.Links.Self,
+		Published:       article.CreatedAt.Format(time.RFC3339),
+		PublishedParsed: &article.CreatedAt,
+		GUID:            strconv.Itoa(article.Id),
+	}
+}
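+
+// The structs below mirror only the fields of the causes.com API response
+// that are needed to build feed items; any other fields in the JSON are
+// ignored by the decoder.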
+type causesResponse struct {
+	Articles []causesResponseArticle `json:"articles"`
+	Meta     causesResponseMeta      `json:"meta"`
+}
+
+type causesResponseArticle struct {
+	Id          int                        `json:"id"`
+	Title       string                     `json:"title"`
+	CreatedAt   time.Time                  `json:"created_at"`
+	HtmlContent string                     `json:"html_content"`
+	Links       causesResponseArticleLinks `json:"links"`
+}
+
+type causesResponseArticleLinks struct {
+	Self string `json:"self"`
+}
+
+type causesResponseMeta struct {
+	Pagination causesResponseMetaPagination `json:"pagination"`
+}
+
+type causesResponseMetaPagination struct {
+	CurrentPage int `json:"current_page"`
+	TotalPages  int `json:"total_pages"`
+	TotalCount  int `json:"total_count"`
+}
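-- 
Not part of the patch: a minimal sketch of how the new code path can be
exercised, assuming rsslay's module path (github.com/piraces/rsslay) and the
exported ParseFeed entry point modified above.

	package main

	import (
		"fmt"
		"log"

		"github.com/piraces/rsslay/pkg/feed" // assumed module path
	)

	func main() {
		// causesLink is routed to CausesFeedParser inside ParseFeed via
		// getFeedParser; every other URL still goes through gofeed.
		f, err := feed.ParseFeed("https://www.causes.com/api/v2/articles?feed_id=recency")
		if err != nil {
			log.Fatal(err)
		}
		fmt.Printf("fetched %d items from %q\n", len(f.Items), f.Title)
	}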