Merge pull request #63 from DocNow/missing-entities-check

Missing entities check
DocNow · Jan 8, 2023 · cb47453 · cb47453
2 parents 767535b + 3a16273
commit cb47453
Show file tree

Hide file tree

Showing 5 changed files with 17 additions and 6 deletions.
diff --git a/README.md b/README.md
@@ -39,10 +39,10 @@ Usage: twarc2 csv [OPTIONS] [INFILE] [OUTFILE]
   Convert tweets to CSV.
 
 Options:
-  --input-data-type [tweets|users|counts|compliance]
+  --input-data-type [tweets|users|counts|compliance|lists]
                                   Input data type - you can turn "tweets",
-                                  "users", "counts" or "compliance" data into
-                                  CSV.
+                                  "users", "counts", "compliance" or "lists"
+                                  data into CSV.
   --inline-referenced-tweets / --no-inline-referenced-tweets
                                   Output referenced tweets inline as separate
                                   rows. Default: no.
@@ -51,6 +51,11 @@ Options:
                                   The Retweet Text, metrics and entities are
                                   merged from the original tweet. Default:
                                   Yes.
+  --process-entities / --no-process-entities
+                                  Preprocess entities like URLs, mentions and
+                                  hashtags, providing expanded urls and lists
+                                  only instead of full json objects. Default:
+                                  Yes.
   --json-encode-all / --no-json-encode-all
                                   JSON encode / escape all fields. Default: no
   --json-encode-text / --no-json-encode-text

diff --git a/dataframe_converter.py b/dataframe_converter.py
@@ -380,14 +380,15 @@ def _format_tweet(self, tweet):
             tweet["referenced_tweets"] = {}
 
         # Process entities in the tweets:
-        if self.process_entities and "entities" in tweet:
+        if self.process_entities and "entities" in tweet and tweet["entities"]:
             tweet["entities"] = self._process_entities(tweet["entities"])
 
         # Process entities in the tweet authors of tweets:
         if (
             self.process_entities
             and "author" in tweet
             and "entities" in tweet["author"]
+            and tweet["author"]["entities"]
         ):
             if "url" in tweet["author"]["entities"]:
                 urls = [
@@ -418,7 +419,7 @@ def _format_tweet(self, tweet):
                 tweet["pinned_tweet_id"] if "pinned_tweet_id" in tweet else None
             )
             # Process entities
-            if self.process_entities and "entities" in tweet:
+            if self.process_entities and "entities" in tweet and tweet["entities"]:
                 if "description" in tweet["entities"]:
                     tweet["entities"]["description"] = self._process_entities(
                         tweet["entities"]["description"]

diff --git a/setup.py b/setup.py
@@ -5,7 +5,7 @@
 
 setuptools.setup(
     name="twarc-csv",
-    version="0.7.0",
+    version="0.7.1",
     url="https://github.com/docnow/twarc-csv",
     author="Igor Brigadir",
     author_email="[email protected]",

diff --git a/test-data/entities_test.jsonl b/test-data/entities_test.jsonl
diff --git a/test_twarc_csv.py b/test_twarc_csv.py
@@ -139,3 +139,7 @@ def test_many_urls():
 
 def test_verified_type():
     _process_file("verified_type")
+
+
+def test_missing_entities():
+    _process_file("entities_test")