Skip to content

Commit

Permalink
Fix FetchTweets by switching from twitterscraper to twint 🎉
Browse files Browse the repository at this point in the history
See twintproject/twint#604 (comment) for more information.
  • Loading branch information
LinqLover committed Nov 5, 2020
1 parent a1454bd commit 3a8fe16
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 30 deletions.
3 changes: 1 addition & 2 deletions docker/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,7 @@ validators==0.18.1

# Data sources
google-api-python-client==1.12.5
git+https://github.com/Museum-Barberini-gGmbH/twitterscraper.git#egg=twitterscraper
twint==2.1.20
git+https://github.com/twintproject/twint.git#egg=twint

# Analysis tools
git+https://github.com/rwalk/gsdmm.git#egg=gsdmm
Expand Down
54 changes: 26 additions & 28 deletions src/twitter.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
"""Provides tasks for downloading tweets related to the museum."""

import datetime as dt
import dateutil
from pytz import utc
import re

import luigi
from luigi.format import UTF8
import pandas as pd
import twitterscraper as ts
import twint
import tzlocal

from _utils import CsvToDb, DataPreparationTask, MuseumFacts, logger
Expand Down Expand Up @@ -114,7 +115,7 @@ def output(self):


class FetchTwitter(DataPreparationTask):
"""Fetch tweets related to the museum using the twitterscraper."""
"""Fetch tweets related to the museum using twint."""

query = luigi.Parameter(default="museumbarberini")
timespan = luigi.parameter.TimeDeltaParameter(
Expand All @@ -132,12 +133,30 @@ def run(self):
if self.minimal_mode:
timespan = dt.timedelta(days=5)

tweets = ts.query_tweets(
self.query,
begindate=dt.date.today() - timespan,
enddate=dt.date.today() + dt.timedelta(days=1))
tweets: twint.tweet.tweet = []
twint.run.Search(twint.Config(
Search=self.query,
Since=str(dt.date.today() - timespan),
Until=str(dt.date.today() + dt.timedelta(days=1)),
Limit=10000,
Store_object=True,
Store_object_tweets_list=tweets,
Hide_output=True
))
if tweets:
df = pd.DataFrame([tweet.__dict__ for tweet in tweets])
df = pd.DataFrame([
dict(
user_id=tweet.user_id,
tweet_id=tweet.id,
text=tweet.tweet,
parent_tweet_id=None, # TODO: Nuke
timestamp=dateutil.parser.parse(tweet.datetime),
likes=tweet.likes_count,
retweets=tweet.retweets_count,
replies=tweet.replies_count
)
for tweet in tweets
])
else: # no tweets returned, ensure schema
df = pd.DataFrame(columns=[
'user_id',
Expand All @@ -149,29 +168,8 @@ def run(self):
'retweets',
'replies'])

# Filter out false positive matches. This is obviously a workaround,
# but at the moment cheaper than repairing or switching the scraper.
# See #352.
is_false_positive = ~(
df['parent_tweet_id'].apply(bool)
| df['text'].str.contains(self.query, flags=re.IGNORECASE)
| df['screen_name'].str.contains(self.query, flags=re.IGNORECASE))
if is_false_positive.any():
false_positives = df[is_false_positive]
logger.warning(
f"Dropping {len(false_positives)} tweets that are not "
f"related to the query"
)
df = df[~is_false_positive]

df = df.drop_duplicates(subset=['tweet_id'])

# timestamp is utc by default
df['timestamp'] = df['timestamp'].apply(
lambda utc_dt:
utc.localize(utc_dt, is_dst=None).astimezone(
tzlocal.get_localzone()))

with self.output().open('w') as output_file:
df.to_csv(output_file, index=False, header=True)

Expand Down

0 comments on commit 3a8fe16

Please sign in to comment.