Skip to content

Commit

Permalink
Merge pull request #60 from DocNow/fields-fixes
Browse files (browse the repository at this point in the history)
Additional Fields and fixes
  • Loading branch information
igorbrigadir authored Jan 6, 2023
2 parents 667c958 + 97816ed commit 767535b
Show file tree
Hide file tree
Showing 12 changed files with 160 additions and 29 deletions.
4 changes: 3 additions & 1 deletion csv_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,9 @@ def __init__(
self.output_format = output_format
self.batch_size = batch_size
self.hide_progress = hide_progress
self.progress = FileSizeProgressBar(infile, outfile, disable=(hide_progress or not self.infile.seekable()))
self.progress = FileSizeProgressBar(
infile, outfile, disable=(hide_progress or not self.infile.seekable())
)

def _read_lines(self):
"""
Expand Down
137 changes: 117 additions & 20 deletions dataframe_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,23 +15,27 @@
referenced_tweets.replied_to.id
referenced_tweets.retweeted.id
referenced_tweets.quoted.id
edit_history_tweet_ids
edit_controls.edits_remaining
edit_controls.editable_until
edit_controls.is_edit_eligible
author_id
in_reply_to_user_id
in_reply_to_username
retweeted_user_id
retweeted_username
quoted_user_id
quoted_username
created_at
text
lang
source
public_metrics.like_count
public_metrics.quote_count
public_metrics.impression_count
public_metrics.reply_count
public_metrics.retweet_count
public_metrics.quote_count
public_metrics.like_count
reply_settings
edit_history_tweet_ids
edit_controls.edits_remaining
edit_controls.editable_until
edit_controls.is_edit_eligible
possibly_sensitive
withheld.scope
withheld.copyright
Expand Down Expand Up @@ -60,6 +64,7 @@
author.entities.description.mentions
author.entities.description.urls
author.entities.url.urls
author.url
author.location
author.pinned_tweet_id
author.profile_image_url
Expand All @@ -68,8 +73,8 @@
author.public_metrics.following_count
author.public_metrics.listed_count
author.public_metrics.tweet_count
author.url
author.verified
author.verified_type
author.withheld.scope
author.withheld.copyright
author.withheld.country_codes
Expand All @@ -87,8 +92,7 @@
matching_rules
__twarc.retrieved_at
__twarc.url
__twarc.version
""".split(
__twarc.version""".split(
"\n"
)

Expand All @@ -112,22 +116,21 @@
public_metrics.tweet_count
url
verified
verified_type
withheld.scope
withheld.copyright
withheld.country_codes
__twarc.retrieved_at
__twarc.url
__twarc.version
""".split(
__twarc.version""".split(
"\n"
)

DEFAULT_COMPLIANCE_COLUMNS = """id
action
created_at
redacted_at
reason
""".split(
reason""".split(
"\n"
)

Expand All @@ -136,8 +139,7 @@
tweet_count
__twarc.retrieved_at
__twarc.url
__twarc.version
""".split(
__twarc.version""".split(
"\n"
)

Expand All @@ -152,8 +154,7 @@
private
__twarc.retrieved_at
__twarc.url
__twarc.version
""".split(
__twarc.version""".split(
"\n"
)

Expand Down Expand Up @@ -181,6 +182,7 @@ def __init__(
inline_referenced_tweets=False,
merge_retweets=True,
allow_duplicates=False,
process_entities=True,
extra_input_columns="",
output_columns=None,
dataset_ids=None,
Expand All @@ -191,6 +193,7 @@ def __init__(
self.json_encode_lists = json_encode_lists
self.inline_referenced_tweets = inline_referenced_tweets
self.merge_retweets = merge_retweets
self.process_entities = process_entities
self.allow_duplicates = allow_duplicates
self.input_data_type = input_data_type
self.columns = list()
Expand Down Expand Up @@ -269,6 +272,32 @@ def _inline_referenced_tweets(self, tweet):
self.counts["unavailable"] += 1
yield self._format_tweet(tweet)

def _process_entities(self, entities):
    """
    Flatten the entity objects of a tweet (or user) into plain strings.

    For whichever of the "cashtags", "hashtags", "mentions" and "urls" keys
    are present, the list of entity objects is replaced in place by a list
    of strings. The same `entities` mapping is returned.
    """
    # Prefixed entities: (key, field read from each item, prefix prepended).
    prefixed_entities = (
        ("cashtags", "tag", "$"),
        ("hashtags", "tag", "#"),
        ("mentions", "username", "@"),
    )
    for key, field, prefix in prefixed_entities:
        if key in entities:
            entities[key] = [prefix + item[field] for item in entities[key]]

    # URLs:
    if "urls" in entities:
        flattened = []
        for link in entities["urls"]:
            if "media_key" in link:
                # Entries carrying a media_key are represented by display_url.
                flattened.append(link["display_url"])
            elif "expanded_url" in link:
                flattened.append(link["expanded_url"])
            else:
                # Fall back to the url field when no expanded_url is present.
                flattened.append(link["url"])
        entities["urls"] = flattened

    return entities

def _format_tweet(self, tweet):
"""
Make the tweet objects easier to deal with, removing extra info and changing the structure.
Expand All @@ -282,28 +311,45 @@ def _format_tweet(self, tweet):
tweet.pop("in_reply_to_user", None)

if "referenced_tweets" in tweet:

# Count Replies:
replies = [
t for t in tweet["referenced_tweets"] if t["type"] == "replied_to"
]
reply_tweet = replies[-1] if replies else None
if "in_reply_to_user_id" in tweet or reply_tweet:
self.counts["replies"] += 1
if (
reply_tweet
and "author" in reply_tweet
and "username" in reply_tweet["author"]
):
tweet["in_reply_to_username"] = reply_tweet["author"]["username"]

# Extract Retweet only
rts = [t for t in tweet["referenced_tweets"] if t["type"] == "retweeted"]
retweeted_tweet = rts[-1] if rts else None
if retweeted_tweet and "author_id" in retweeted_tweet:
self.counts["retweets"] += 1
tweet["retweeted_user_id"] = retweeted_tweet["author_id"]
if (
retweeted_tweet
and "author_id" in retweeted_tweet
and "username" in retweeted_tweet["author"]
):
tweet["retweeted_username"] = retweeted_tweet["author"]["username"]

# Extract Quoted tweet
qts = [t for t in tweet["referenced_tweets"] if t["type"] == "quoted"]
quoted_tweet = qts[-1] if qts else None
if quoted_tweet and "author_id" in quoted_tweet:
self.counts["quotes"] += 1
tweet["quoted_user_id"] = quoted_tweet["author_id"]
if (
quoted_tweet
and "author" in quoted_tweet
and "username" in quoted_tweet["author"]
):
tweet["quoted_username"] = quoted_tweet["author"]["username"]

# Process Retweets:
# If it's a native retweet, replace the "RT @user Text" with the original text, metrics, and entities, but keep the Author.
Expand Down Expand Up @@ -333,6 +379,57 @@ def _format_tweet(self, tweet):
else:
tweet["referenced_tweets"] = {}

# Process entities in the tweets:
if self.process_entities and "entities" in tweet:
tweet["entities"] = self._process_entities(tweet["entities"])

# Process entities in the tweet authors of tweets:
if (
self.process_entities
and "author" in tweet
and "entities" in tweet["author"]
):
if "url" in tweet["author"]["entities"]:
urls = [
url["expanded_url"] if "expanded_url" in url else url["url"]
for url in tweet["author"]["entities"]["url"].pop("urls", [])
]
tweet["author"]["entities"]["url"]["urls"] = urls
# There is only 1 url for the profile.
tweet["author"]["url"] = urls[-1]

if "description" in tweet["author"]["entities"]:
tweet["author"]["entities"]["description"] = self._process_entities(
tweet["author"]["entities"]["description"]
)

# For older tweet data, make sure the new impressions are missing, not zero:
if (
self.input_data_type == "tweets"
and "public_metrics" in tweet
and "impression_count" not in tweet["public_metrics"]
):
tweet["public_metrics"]["impression_count"] = None

# Process entities for users: `tweet` here is a user
if self.input_data_type == "users":
# Make sure pinned_tweet_id is missing, not zero:
tweet["pinned_tweet_id"] = (
tweet["pinned_tweet_id"] if "pinned_tweet_id" in tweet else None
)
# Process entities
if self.process_entities and "entities" in tweet:
if "description" in tweet["entities"]:
tweet["entities"]["description"] = self._process_entities(
tweet["entities"]["description"]
)
if "url" in tweet["entities"]:
tweet["entities"]["url"] = self._process_entities(
tweet["entities"]["url"]
)
# User url:
tweet["url"] = tweet["entities"]["url"]["urls"][-1]

# Remove `type` left over from referenced tweets
tweet.pop("type", None)
# Remove empty objects
Expand Down Expand Up @@ -419,13 +516,13 @@ def process(self, objects):
f"💔 ERROR: {len(diff)} Unexpected items in data! \n"
"Are you sure you specified the correct --input-data-type?\n"
"If the object type is correct, add extra columns with:"
f"\n--extra-input-columns \"{','.join(diff)}\"\nSkipping entire batch of {len(_df)} tweets!",
f"\n--extra-input-columns \"{','.join(diff)}\"\nSkipping entire batch of {len(_df)} {self.input_data_type}!",
fg="red",
),
err=True,
)
log.error(
f"CSV Unexpected Data: \"{','.join(diff)}\". Expected {len(self.columns)} columns, got {len(_df.columns)}. Skipping entire batch of {len(_df)} tweets!"
f"CSV Unexpected Data: \"{','.join(diff)}\". Expected {len(self.columns)} columns, got {len(_df.columns)}. Skipping entire batch of {len(_df)} {self.input_data_type}!"
)
self.counts["parse_errors"] += len(_df)
return pd.DataFrame(columns=self.columns)
Expand Down
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[build-system]
# Minimum requirements for the build system to execute.
requires = ["setuptools", "wheel"] # PEP 508 specifications.
8 changes: 4 additions & 4 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,18 @@

setuptools.setup(
name="twarc-csv",
version="0.6.0",
version="0.7.0",
url="https://github.com/docnow/twarc-csv",
author="Igor Brigadir",
author_email="[email protected]",
py_modules=["twarc_csv","csv_writer","dataframe_converter"],
py_modules=["twarc_csv", "csv_writer", "dataframe_converter"],
description="A twarc plugin to output Twitter data as CSV",
long_description=long_description,
long_description_content_type="text/markdown",
python_requires=">=3.3",
install_requires=[
"twarc>=2.12.0",
"pandas>=1.2.5",
"twarc>=2.13.0",
"pandas>=1.3.5",
"more-itertools>=8.7.0",
"tqdm>=4.59.0",
],
Expand Down
1 change: 1 addition & 0 deletions test-data/cashtags.jsonl

Large diffs are not rendered by default.

2 changes: 0 additions & 2 deletions test-data/compliance.json

This file was deleted.

Loading

0 comments on commit 767535b

Please sign in to comment.