Merge pull request #60 from DocNow/fields-fixes

Additional Fields and fixes
DocNow · Jan 6, 2023 · 767535b · 767535b
2 parents 667c958 + 97816ed
commit 767535b
Show file tree

Hide file tree

Showing 12 changed files with 160 additions and 29 deletions.
diff --git a/csv_writer.py b/csv_writer.py
@@ -28,7 +28,9 @@ def __init__(
         self.output_format = output_format
         self.batch_size = batch_size
         self.hide_progress = hide_progress
-        self.progress = FileSizeProgressBar(infile, outfile, disable=(hide_progress or not self.infile.seekable()))
+        self.progress = FileSizeProgressBar(
+            infile, outfile, disable=(hide_progress or not self.infile.seekable())
+        )
 
     def _read_lines(self):
         """

diff --git a/dataframe_converter.py b/dataframe_converter.py
@@ -15,23 +15,27 @@
 referenced_tweets.replied_to.id
 referenced_tweets.retweeted.id
 referenced_tweets.quoted.id
-edit_history_tweet_ids
-edit_controls.edits_remaining
-edit_controls.editable_until
-edit_controls.is_edit_eligible
 author_id
 in_reply_to_user_id
+in_reply_to_username
 retweeted_user_id
+retweeted_username
 quoted_user_id
+quoted_username
 created_at
 text
 lang
 source
-public_metrics.like_count
-public_metrics.quote_count
+public_metrics.impression_count
 public_metrics.reply_count
 public_metrics.retweet_count
+public_metrics.quote_count
+public_metrics.like_count
 reply_settings
+edit_history_tweet_ids
+edit_controls.edits_remaining
+edit_controls.editable_until
+edit_controls.is_edit_eligible
 possibly_sensitive
 withheld.scope
 withheld.copyright
@@ -60,6 +64,7 @@
 author.entities.description.mentions
 author.entities.description.urls
 author.entities.url.urls
+author.url
 author.location
 author.pinned_tweet_id
 author.profile_image_url
@@ -68,8 +73,8 @@
 author.public_metrics.following_count
 author.public_metrics.listed_count
 author.public_metrics.tweet_count
-author.url
 author.verified
+author.verified_type
 author.withheld.scope
 author.withheld.copyright
 author.withheld.country_codes
@@ -87,8 +92,7 @@
 matching_rules
 __twarc.retrieved_at
 __twarc.url
-__twarc.version
-""".split(
+__twarc.version""".split(
     "\n"
 )
 
@@ -112,22 +116,21 @@
 public_metrics.tweet_count
 url
 verified
+verified_type
 withheld.scope
 withheld.copyright
 withheld.country_codes
 __twarc.retrieved_at
 __twarc.url
-__twarc.version
-""".split(
+__twarc.version""".split(
     "\n"
 )
 
 DEFAULT_COMPLIANCE_COLUMNS = """id
 action
 created_at
 redacted_at
-reason
-""".split(
+reason""".split(
     "\n"
 )
 
@@ -136,8 +139,7 @@
 tweet_count
 __twarc.retrieved_at
 __twarc.url
-__twarc.version
-""".split(
+__twarc.version""".split(
     "\n"
 )
 
@@ -152,8 +154,7 @@
 private
 __twarc.retrieved_at
 __twarc.url
-__twarc.version
-""".split(
+__twarc.version""".split(
     "\n"
 )
 
@@ -181,6 +182,7 @@ def __init__(
         inline_referenced_tweets=False,
         merge_retweets=True,
         allow_duplicates=False,
+        process_entities=True,
         extra_input_columns="",
         output_columns=None,
         dataset_ids=None,
@@ -191,6 +193,7 @@ def __init__(
         self.json_encode_lists = json_encode_lists
         self.inline_referenced_tweets = inline_referenced_tweets
         self.merge_retweets = merge_retweets
+        self.process_entities = process_entities
         self.allow_duplicates = allow_duplicates
         self.input_data_type = input_data_type
         self.columns = list()
@@ -269,6 +272,32 @@ def _inline_referenced_tweets(self, tweet):
                     self.counts["unavailable"] += 1
         yield self._format_tweet(tweet)
 
+    def _process_entities(self, entities):
+        """
+        Flatten entity objects in place into lists of plain strings.
+        Returns the same (mutated) `entities` dict.
+        """
+        # Tag-like entities: (key, prefix symbol, field that holds the text).
+        for key, symbol, field in (
+            ("cashtags", "$", "tag"),
+            ("hashtags", "#", "tag"),
+            ("mentions", "@", "username"),
+        ):
+            if key in entities:
+                entities[key] = [symbol + item[field] for item in entities[key]]
+        # URLs: entries with a media_key use display_url; others prefer expanded_url.
+        if "urls" in entities:
+            links = []
+            for link in entities["urls"]:
+                if "media_key" in link:
+                    links.append(link["display_url"])
+                elif "expanded_url" in link:
+                    links.append(link["expanded_url"])
+                else:
+                    links.append(link["url"])
+            entities["urls"] = links
+        return entities
+
     def _format_tweet(self, tweet):
         """
         Make the tweet objects easier to deal with, removing extra info and changing the structure.
@@ -282,28 +311,45 @@ def _format_tweet(self, tweet):
         tweet.pop("in_reply_to_user", None)
 
         if "referenced_tweets" in tweet:
-
             # Count Replies:
             replies = [
                 t for t in tweet["referenced_tweets"] if t["type"] == "replied_to"
             ]
             reply_tweet = replies[-1] if replies else None
             if "in_reply_to_user_id" in tweet or reply_tweet:
                 self.counts["replies"] += 1
+            if (
+                reply_tweet
+                and "author" in reply_tweet
+                and "username" in reply_tweet["author"]
+            ):
+                tweet["in_reply_to_username"] = reply_tweet["author"]["username"]
 
             # Extract Retweet only
             rts = [t for t in tweet["referenced_tweets"] if t["type"] == "retweeted"]
             retweeted_tweet = rts[-1] if rts else None
             if retweeted_tweet and "author_id" in retweeted_tweet:
                 self.counts["retweets"] += 1
                 tweet["retweeted_user_id"] = retweeted_tweet["author_id"]
+            if (
+                retweeted_tweet
+                and "author" in retweeted_tweet  # guard the key that is read below
+                and "username" in retweeted_tweet["author"]
+            ):
+                tweet["retweeted_username"] = retweeted_tweet["author"]["username"]
 
             # Extract Quoted tweet
             qts = [t for t in tweet["referenced_tweets"] if t["type"] == "quoted"]
             quoted_tweet = qts[-1] if qts else None
             if quoted_tweet and "author_id" in quoted_tweet:
                 self.counts["quotes"] += 1
                 tweet["quoted_user_id"] = quoted_tweet["author_id"]
+            if (
+                quoted_tweet
+                and "author" in quoted_tweet
+                and "username" in quoted_tweet["author"]
+            ):
+                tweet["quoted_username"] = quoted_tweet["author"]["username"]
 
             # Process Retweets:
             # If it's a native retweet, replace the "RT @user Text" with the original text, metrics, and entities, but keep the Author.
@@ -333,6 +379,57 @@ def _format_tweet(self, tweet):
         else:
             tweet["referenced_tweets"] = {}
 
+        # Process entities in the tweets:
+        if self.process_entities and "entities" in tweet:
+            tweet["entities"] = self._process_entities(tweet["entities"])
+
+        # Process entities in the authors of tweets:
+        if (
+            self.process_entities
+            and "author" in tweet
+            and "entities" in tweet["author"]
+        ):
+            if "url" in tweet["author"]["entities"]:
+                urls = [
+                    url["expanded_url"] if "expanded_url" in url else url["url"]
+                    for url in tweet["author"]["entities"]["url"].pop("urls", [])
+                ]
+                tweet["author"]["entities"]["url"]["urls"] = urls
+                # There is only 1 url for the profile.
+                tweet["author"]["url"] = urls[-1]
+
+            if "description" in tweet["author"]["entities"]:
+                tweet["author"]["entities"]["description"] = self._process_entities(
+                    tweet["author"]["entities"]["description"]
+                )
+
+        # For older tweet data, make sure the new impressions are missing, not zero:
+        if (
+            self.input_data_type == "tweets"
+            and "public_metrics" in tweet
+            and "impression_count" not in tweet["public_metrics"]
+        ):
+            tweet["public_metrics"]["impression_count"] = None
+
+        # Process entities for users: `tweet` here is a user
+        if self.input_data_type == "users":
+            # Make sure pinned_tweet_id is missing, not zero:
+            tweet["pinned_tweet_id"] = (
+                tweet["pinned_tweet_id"] if "pinned_tweet_id" in tweet else None
+            )
+            # Process entities
+            if self.process_entities and "entities" in tweet:
+                if "description" in tweet["entities"]:
+                    tweet["entities"]["description"] = self._process_entities(
+                        tweet["entities"]["description"]
+                    )
+                if "url" in tweet["entities"]:
+                    tweet["entities"]["url"] = self._process_entities(
+                        tweet["entities"]["url"]
+                    )
+                    # User url:
+                    tweet["url"] = tweet["entities"]["url"]["urls"][-1]
+
         # Remove `type` left over from referenced tweets
         tweet.pop("type", None)
         # Remove empty objects
@@ -419,13 +516,13 @@ def process(self, objects):
                     f"💔 ERROR: {len(diff)} Unexpected items in data! \n"
                     "Are you sure you specified the correct --input-data-type?\n"
                     "If the object type is correct, add extra columns with:"
-                    f"\n--extra-input-columns \"{','.join(diff)}\"\nSkipping entire batch of {len(_df)} tweets!",
+                    f"\n--extra-input-columns \"{','.join(diff)}\"\nSkipping entire batch of {len(_df)} {self.input_data_type}!",
                     fg="red",
                 ),
                 err=True,
             )
             log.error(
-                f"CSV Unexpected Data: \"{','.join(diff)}\". Expected {len(self.columns)} columns, got {len(_df.columns)}. Skipping entire batch of {len(_df)} tweets!"
+                f"CSV Unexpected Data: \"{','.join(diff)}\". Expected {len(self.columns)} columns, got {len(_df.columns)}. Skipping entire batch of {len(_df)} {self.input_data_type}!"
             )
             self.counts["parse_errors"] += len(_df)
             return pd.DataFrame(columns=self.columns)

diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,3 @@
+[build-system]
+# Minimum requirements for the build system to execute.
+requires = ["setuptools", "wheel"]  # PEP 508 specifications.
diff --git a/setup.py b/setup.py
@@ -5,18 +5,18 @@
 
 setuptools.setup(
     name="twarc-csv",
-    version="0.6.0",
+    version="0.7.0",
     url="https://github.com/docnow/twarc-csv",
     author="Igor Brigadir",
     author_email="[email protected]",
-    py_modules=["twarc_csv","csv_writer","dataframe_converter"],
+    py_modules=["twarc_csv", "csv_writer", "dataframe_converter"],
     description="A twarc plugin to output Twitter data as CSV",
     long_description=long_description,
     long_description_content_type="text/markdown",
     python_requires=">=3.3",
     install_requires=[
-        "twarc>=2.12.0",
-        "pandas>=1.2.5",
+        "twarc>=2.13.0",
+        "pandas>=1.3.5",
         "more-itertools>=8.7.0",
         "tqdm>=4.59.0",
     ],

diff --git a/test-data/cashtags.jsonl b/test-data/cashtags.jsonl
diff --git a/test-data/compliance.json b/test-data/compliance.json