diff --git a/luigi.cfg b/luigi.cfg index 74efe2fb..b497b477 100644 --- a/luigi.cfg +++ b/luigi.cfg @@ -25,6 +25,8 @@ password=${SMTP_PASSWORD} [worker] timeout=600 # (600 seconds = 10 minutes) +keep-alive=True +# Required for using the per-task retry policy (retry_count) [core] log_level=INFO diff --git a/src/extended_twitter_collection/collect_tweets.py b/src/extended_twitter_collection/collect_tweets.py index 7505d508..e1caad60 100644 --- a/src/extended_twitter_collection/collect_tweets.py +++ b/src/extended_twitter_collection/collect_tweets.py @@ -116,6 +116,11 @@ class TwitterCollectCandidateTweets(DataPreparationTask): # for a given keyword-interval collection_r_limit = luigi.IntParameter(default=50) + # twint fails sporadically with RefreshTokenException on our VM because + # Twitter blocks IPs that send too many requests. + # See https://github.com/twintproject/twint/issues/957. + retry_count = 3 + def requires(self): return KeywordIntervalsToDB() diff --git a/src/twitter.py b/src/twitter.py index 032b675f..919ab0fa 100644 --- a/src/twitter.py +++ b/src/twitter.py @@ -119,6 +119,11 @@ class FetchTwitter(DataPreparationTask): default=dt.timedelta(weeks=2), description="For how many days tweets should be fetched") + # twint fails sporadically with RefreshTokenException on our VM because + # Twitter blocks IPs that send too many requests. + # See https://github.com/twintproject/twint/issues/957. + retry_count = 3 + def output(self): return luigi.LocalTarget( f'{self.output_dir}/twitter/raw_tweets.csv',