diff --git a/csv_writer.py b/csv_writer.py index 4e48418..28a48f1 100644 --- a/csv_writer.py +++ b/csv_writer.py @@ -28,7 +28,9 @@ def __init__( self.output_format = output_format self.batch_size = batch_size self.hide_progress = hide_progress - self.progress = FileSizeProgressBar(infile, outfile, disable=(hide_progress or not self.infile.seekable())) + self.progress = FileSizeProgressBar( + infile, outfile, disable=(hide_progress or not self.infile.seekable()) + ) def _read_lines(self): """ diff --git a/dataframe_converter.py b/dataframe_converter.py index 2544fe2..6af860f 100644 --- a/dataframe_converter.py +++ b/dataframe_converter.py @@ -15,23 +15,27 @@ referenced_tweets.replied_to.id referenced_tweets.retweeted.id referenced_tweets.quoted.id -edit_history_tweet_ids -edit_controls.edits_remaining -edit_controls.editable_until -edit_controls.is_edit_eligible author_id in_reply_to_user_id +in_reply_to_username retweeted_user_id +retweeted_username quoted_user_id +quoted_username created_at text lang source -public_metrics.like_count -public_metrics.quote_count +public_metrics.impression_count public_metrics.reply_count public_metrics.retweet_count +public_metrics.quote_count +public_metrics.like_count reply_settings +edit_history_tweet_ids +edit_controls.edits_remaining +edit_controls.editable_until +edit_controls.is_edit_eligible possibly_sensitive withheld.scope withheld.copyright @@ -60,6 +64,7 @@ author.entities.description.mentions author.entities.description.urls author.entities.url.urls +author.url author.location author.pinned_tweet_id author.profile_image_url @@ -68,8 +73,8 @@ author.public_metrics.following_count author.public_metrics.listed_count author.public_metrics.tweet_count -author.url author.verified +author.verified_type author.withheld.scope author.withheld.copyright author.withheld.country_codes @@ -87,8 +92,7 @@ matching_rules __twarc.retrieved_at __twarc.url -__twarc.version -""".split( +__twarc.version""".split( "\n" ) @@ -112,13 +116,13 @@ public_metrics.tweet_count url verified +verified_type withheld.scope withheld.copyright withheld.country_codes __twarc.retrieved_at __twarc.url -__twarc.version -""".split( +__twarc.version""".split( "\n" ) @@ -126,8 +130,7 @@ action created_at redacted_at -reason -""".split( +reason""".split( "\n" ) @@ -136,8 +139,7 @@ tweet_count __twarc.retrieved_at __twarc.url -__twarc.version -""".split( +__twarc.version""".split( "\n" ) @@ -152,8 +154,7 @@ private __twarc.retrieved_at __twarc.url -__twarc.version -""".split( +__twarc.version""".split( "\n" ) @@ -181,6 +182,7 @@ def __init__( inline_referenced_tweets=False, merge_retweets=True, allow_duplicates=False, + process_entities=True, extra_input_columns="", output_columns=None, dataset_ids=None, @@ -191,6 +193,7 @@ def __init__( self.json_encode_lists = json_encode_lists self.inline_referenced_tweets = inline_referenced_tweets self.merge_retweets = merge_retweets + self.process_entities = process_entities self.allow_duplicates = allow_duplicates self.input_data_type = input_data_type self.columns = list() @@ -269,6 +272,32 @@ def _inline_referenced_tweets(self, tweet): self.counts["unavailable"] += 1 yield self._format_tweet(tweet) + def _process_entities(self, entities): + # Process Entities in the tweet (or user): + if "cashtags" in entities: + entities["cashtags"] = [ + "$" + hashtag["tag"] for hashtag in entities["cashtags"] + ] + if "hashtags" in entities: + entities["hashtags"] = [ + "#" + hashtag["tag"] for hashtag in entities["hashtags"] + ] + if "mentions" in entities: + entities["mentions"] = [ + "@" + mention["username"] for mention in entities["mentions"] + ] + # URLs: + if "urls" in entities: + entities["urls"] = [ + url["display_url"] + if "media_key" in url + else url["expanded_url"] + if "expanded_url" in url + else url["url"] + for url in entities["urls"] + ] + return entities + def _format_tweet(self, tweet): """ Make the tweet objects easier to deal with, removing extra info and changing the structure. @@ -282,7 +311,6 @@ def _format_tweet(self, tweet): tweet.pop("in_reply_to_user", None) if "referenced_tweets" in tweet: - # Count Replies: replies = [ t for t in tweet["referenced_tweets"] if t["type"] == "replied_to" @@ -290,6 +318,12 @@ def _format_tweet(self, tweet): reply_tweet = replies[-1] if replies else None if "in_reply_to_user_id" in tweet or reply_tweet: self.counts["replies"] += 1 + if ( + reply_tweet + and "author" in reply_tweet + and "username" in reply_tweet["author"] + ): + tweet["in_reply_to_username"] = reply_tweet["author"]["username"] # Extract Retweet only rts = [t for t in tweet["referenced_tweets"] if t["type"] == "retweeted"] @@ -297,6 +331,12 @@ def _format_tweet(self, tweet): if retweeted_tweet and "author_id" in retweeted_tweet: self.counts["retweets"] += 1 tweet["retweeted_user_id"] = retweeted_tweet["author_id"] + if ( + retweeted_tweet + and "author_id" in retweeted_tweet + and "username" in retweeted_tweet["author"] + ): + tweet["retweeted_username"] = retweeted_tweet["author"]["username"] # Extract Quoted tweet qts = [t for t in tweet["referenced_tweets"] if t["type"] == "quoted"] @@ -304,6 +344,12 @@ def _format_tweet(self, tweet): if quoted_tweet and "author_id" in quoted_tweet: self.counts["quotes"] += 1 tweet["quoted_user_id"] = quoted_tweet["author_id"] + if ( + quoted_tweet + and "author" in quoted_tweet + and "username" in quoted_tweet["author"] + ): + tweet["quoted_username"] = quoted_tweet["author"]["username"] # Process Retweets: # If it's a native retweet, replace the "RT @user Text" with the original text, metrics, and entities, but keep the Author. @@ -333,6 +379,57 @@ def _format_tweet(self, tweet): else: tweet["referenced_tweets"] = {} + # Process entities in the tweets: + if self.process_entities and "entities" in tweet: + tweet["entities"] = self._process_entities(tweet["entities"]) + + # Process entities in the tweet authors of tweets: + if ( + self.process_entities + and "author" in tweet + and "entities" in tweet["author"] + ): + if "url" in tweet["author"]["entities"]: + urls = [ + url["expanded_url"] if "expanded_url" in url else url["url"] + for url in tweet["author"]["entities"]["url"].pop("urls", []) + ] + tweet["author"]["entities"]["url"]["urls"] = urls + # There is only 1 url for the profile. + tweet["author"]["url"] = urls[-1] + + if "description" in tweet["author"]["entities"]: + tweet["author"]["entities"]["description"] = self._process_entities( + tweet["author"]["entities"]["description"] + ) + + # For older tweet data, make sure the new impressions are missing, not zero: + if ( + self.input_data_type == "tweets" + and "public_metrics" in tweet + and "impression_count" not in tweet["public_metrics"] + ): + tweet["public_metrics"]["impression_count"] = None + + # Process entities for users: `tweet` here is a user + if self.input_data_type == "users": + # Make sure pinned_tweet_id is missing, not zero: + tweet["pinned_tweet_id"] = ( + tweet["pinned_tweet_id"] if "pinned_tweet_id" in tweet else None + ) + # Process entities + if self.process_entities and "entities" in tweet: + if "description" in tweet["entities"]: + tweet["entities"]["description"] = self._process_entities( + tweet["entities"]["description"] + ) + if "url" in tweet["entities"]: + tweet["entities"]["url"] = self._process_entities( + tweet["entities"]["url"] + ) + # User url: + tweet["url"] = tweet["entities"]["url"]["urls"][-1] + # Remove `type` left over from referenced tweets tweet.pop("type", None) # Remove empty objects @@ -419,13 +516,13 @@ def process(self, objects): f"💔 ERROR: {len(diff)} Unexpected items in data! \n" "Are you sure you specified the correct --input-data-type?\n" "If the object type is correct, add extra columns with:" - f"\n--extra-input-columns \"{','.join(diff)}\"\nSkipping entire batch of {len(_df)} tweets!", + f"\n--extra-input-columns \"{','.join(diff)}\"\nSkipping entire batch of {len(_df)} {self.input_data_type}!", fg="red", ), err=True, ) log.error( - f"CSV Unexpected Data: \"{','.join(diff)}\". Expected {len(self.columns)} columns, got {len(_df.columns)}. Skipping entire batch of {len(_df)} tweets!" + f"CSV Unexpected Data: \"{','.join(diff)}\". Expected {len(self.columns)} columns, got {len(_df.columns)}. Skipping entire batch of {len(_df)} {self.input_data_type}!" ) self.counts["parse_errors"] += len(_df) return pd.DataFrame(columns=self.columns) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..019b0d8 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,3 @@ +[build-system] +# Minimum requirements for the build system to execute. +requires = ["setuptools", "wheel"] # PEP 508 specifications. diff --git a/setup.py b/setup.py index 9a1b796..4650f54 100644 --- a/setup.py +++ b/setup.py @@ -5,18 +5,18 @@ setuptools.setup( name="twarc-csv", - version="0.6.0", + version="0.7.0", url="https://github.com/docnow/twarc-csv", author="Igor Brigadir", author_email="igor.brigadir@gmail.com", - py_modules=["twarc_csv","csv_writer","dataframe_converter"], + py_modules=["twarc_csv", "csv_writer", "dataframe_converter"], description="A twarc plugin to output Twitter data as CSV", long_description=long_description, long_description_content_type="text/markdown", python_requires=">=3.3", install_requires=[ - "twarc>=2.12.0", - "pandas>=1.2.5", + "twarc>=2.13.0", + "pandas>=1.3.5", "more-itertools>=8.7.0", "tqdm>=4.59.0", ], diff --git a/test-data/cashtags.jsonl b/test-data/cashtags.jsonl new file mode 100644 index 0000000..2c614b0 --- /dev/null +++ b/test-data/cashtags.jsonl @@ -0,0 +1 @@ +{"data": [{"possibly_sensitive": false, "entities": {"urls": [{"start": 119, "end": 142, "url": "https://t.co/H8r2bJrh4i", "expanded_url": "http://go.trade-ideas.com/SHHb", "display_url": "go.trade-ideas.com/SHHb", "unwound_url": "http://go.trade-ideas.com/SHHb"}, {"start": 250, "end": 273, "url": "https://t.co/XfSUInZb0t", "expanded_url": "https://twitter.com/Trading22971312/status/1611077073803841538/photo/1", "display_url": "pic.twitter.com/XfSUInZb0t", "media_key": "3_1611076914248286208"}], "cashtags": [{"start": 176, "end": 180, "tag": "AMD"}, {"start": 191, "end": 195, "tag": "BTC"}, {"start": 197, "end": 202, "tag": "TWTR"}, {"start": 204, "end": 207, "tag": "BA"}, {"start": 209, "end": 214, "tag": "doge"}, {"start": 216, "end": 221, "tag": "baba"}, {"start": 223, "end": 227, "tag": "spy"}, {"start": 229, "end": 233, "tag": "SPX"}, {"start": 234, "end": 238, "tag": "NIO"}, {"start": 239, "end": 243, "tag": "ETC"}, {"start": 244, "end": 249, "tag": "META"}], "hashtags": [{"start": 145, "end": 154, "tag": "business"}, {"start": 156, "end": 160, "tag": "NFT"}, {"start": 162, "end": 174, "tag": "StockMarket"}, {"start": 182, "end": 189, "tag": "option"}], "annotations": [{"start": 157, "end": 159, "probability": 0.5266, "type": "Other", "normalized_text": "NFT"}, {"start": 177, "end": 179, "probability": 0.9137, "type": "Organization", "normalized_text": "AMD"}, {"start": 192, "end": 194, "probability": 0.511, "type": "Other", "normalized_text": "BTC"}, {"start": 198, "end": 201, "probability": 0.5086, "type": "Other", "normalized_text": "TWTR"}, {"start": 205, "end": 206, "probability": 0.3909, "type": "Other", "normalized_text": "BA"}, {"start": 210, "end": 213, "probability": 0.4415, "type": "Organization", "normalized_text": "doge"}, {"start": 217, "end": 220, "probability": 0.4157, "type": "Organization", "normalized_text": "baba"}, {"start": 224, "end": 226, "probability": 0.4884, "type": "Organization", "normalized_text": "spy"}, {"start": 230, "end": 232, "probability": 0.4548, "type": "Other", "normalized_text": "SPX"}, {"start": 235, "end": 237, "probability": 0.517, "type": "Organization", "normalized_text": "NIO"}, {"start": 240, "end": 242, "probability": 0.4181, "type": "Other", "normalized_text": "ETC"}, {"start": 245, "end": 248, "probability": 0.499, "type": "Organization", "normalized_text": "META"}]}, "conversation_id": "1611077073803841538", "edit_history_tweet_ids": ["1611077073803841538"], "author_id": "1291963157271863297", "text": "Now available with every Trade Ideas subscription. Gain confidence and learn how to take action in the markets. \nVia:- https://t.co/H8r2bJrh4i \n\n#business \n#NFT \n#StockMarket \n$AMD \n#option \n$BTC \n$TWTR \n$BA \n$doge \n$baba \n$spy \n$SPX $NIO $ETC $META https://t.co/XfSUInZb0t", "public_metrics": {"retweet_count": 0, "reply_count": 0, "like_count": 1, "quote_count": 0, "impression_count": 52}, "reply_settings": "everyone", "edit_controls": {"edits_remaining": 5, "is_edit_eligible": true, "editable_until": "2023-01-05T19:38:01.000Z"}, "lang": "en", "context_annotations": [{"domain": {"id": "29", "name": "Events [Entity Service]", "description": "Real world events. "}, "entity": {"id": "984848492387483648", "name": "TWTR Earnings", "description": "TWTR Earnings"}}, {"domain": {"id": "46", "name": "Business Taxonomy", "description": "Categories within Brand Verticals that narrow down the scope of Brands"}, "entity": {"id": "1557696848252391426", "name": "Financial Services Business", "description": "Brands, companies, advertisers and every non-person handle with the profit intent related to Banks, Credit cards, Insurance, Investments, Stocks "}}, {"domain": {"id": "46", "name": "Business Taxonomy", "description": "Categories within Brand Verticals that narrow down the scope of Brands"}, "entity": {"id": "1557696940178935808", "name": "Gaming Business", "description": "Brands, companies, advertisers and every non-person handle with the profit intent related to offline and online games such as gaming consoles, tabletop games, video game publishers"}}, {"domain": {"id": "47", "name": "Brand", "description": "Brands and Companies"}, "entity": {"id": "10045225402", "name": "Twitter"}}, {"domain": {"id": "30", "name": "Entities [Entity Service]", "description": "Entity Service top level domain, every item that is in Entity Service should be in this domain"}, "entity": {"id": "1007360414114435072", "name": "Bitcoin cryptocurrency", "description": "Bitcoin Cryptocurrency"}}, {"domain": {"id": "30", "name": "Entities [Entity Service]", "description": "Entity Service top level domain, every item that is in Entity Service should be in this domain"}, "entity": {"id": "1139229372198469633", "name": "Dogecoin cryptocurrency"}}, {"domain": {"id": "47", "name": "Brand", "description": "Brands and Companies"}, "entity": {"id": "1480944077596086275", "name": "Boeing"}}, {"domain": {"id": "47", "name": "Brand", "description": "Brands and Companies"}, "entity": {"id": "1480944078397132804", "name": "Advanced Micro Devices"}}, {"domain": {"id": "65", "name": "Interests and Hobbies Vertical", "description": "Top level interests and hobbies groupings, like Food or Travel"}, "entity": {"id": "781974596148793345", "name": "Business & finance"}}, {"domain": {"id": "66", "name": "Interests and Hobbies Category", "description": "A grouping of interests and hobbies entities, like Novelty Food or Destinations"}, "entity": {"id": "847888632711061504", "name": "Personal finance", "description": "Personal finance"}}, {"domain": {"id": "66", "name": "Interests and Hobbies Category", "description": "A grouping of interests and hobbies entities, like Novelty Food or Destinations"}, "entity": {"id": "913142676819648512", "name": "Cryptocurrencies", "description": "Cryptocurrency"}}, {"domain": {"id": "66", "name": "Interests and Hobbies Category", "description": "A grouping of interests and hobbies entities, like Novelty Food or Destinations"}, "entity": {"id": "1369311988040355840", "name": "NFTs", "description": "Non-fungible tokens"}}, {"domain": {"id": "67", "name": "Interests and Hobbies", "description": "Interests, opinions, and behaviors of individuals, groups, or cultures; like Speciality Cooking or Theme Parks"}, "entity": {"id": "847894630787973120", "name": "Stock options", "description": "Options"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "10045225402", "name": "Twitter"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "781974596148793345", "name": "Business & finance"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "847894353708068864", "name": "Investing", "description": "Investing"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "847894630787973120", "name": "Stock options", "description": "Options"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "847894852779900928", "name": "Stocks & indices", "description": "Stocks"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "864154902926196737", "name": "S&P 500", "description": "S&P 500"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "913142676819648512", "name": "Cryptocurrencies", "description": "Cryptocurrency"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "1007360414114435072", "name": "Bitcoin cryptocurrency", "description": "Bitcoin Cryptocurrency"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "1139229372198469633", "name": "Dogecoin cryptocurrency"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "1301195966125494272", "name": "$BTC"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "1303013359306993665", "name": "$SPY"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "1303024103272509441", "name": "$SPX"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "1303026843365171206", "name": "$DOGE"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "1303032576773255168", "name": "$BA"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "1303034843911995392", "name": "$TWTR"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "1306568690641428481", "name": "$AMD"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "1306659423109046272", "name": "$BABA"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "1313525755347787776", "name": "$NIO"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "1369311988040355840", "name": "NFTs", "description": "Non-fungible tokens"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "1480944077596086275", "name": "Boeing"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "1480944078397132804", "name": "Advanced Micro Devices"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "1484181943616884743", "name": "Cryptocoins"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "1491481998862348291", "name": "Digital asset industry"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "1492162686204854274", "name": "Digital assets & cryptocurrency", "description": "Cryptocurrency"}}, {"domain": {"id": "166", "name": "Stocks", "description": "for individual and types of stocks, e.g., $TRX, $QQQ"}, "entity": {"id": "1301195966125494272", "name": "$BTC"}}, {"domain": {"id": "166", "name": "Stocks", "description": "for individual and types of stocks, e.g., $TRX, $QQQ"}, "entity": {"id": "1303013359306993665", "name": "$SPY"}}, {"domain": {"id": "166", "name": "Stocks", "description": "for individual and types of stocks, e.g., $TRX, $QQQ"}, "entity": {"id": "1303024103272509441", "name": "$SPX"}}, {"domain": {"id": "166", "name": "Stocks", "description": "for individual and types of stocks, e.g., $TRX, $QQQ"}, "entity": {"id": "1303026843365171206", "name": "$DOGE"}}, {"domain": {"id": "166", "name": "Stocks", "description": "for individual and types of stocks, e.g., $TRX, $QQQ"}, "entity": {"id": "1303032576773255168", "name": "$BA"}}, {"domain": {"id": "166", "name": "Stocks", "description": "for individual and types of stocks, e.g., $TRX, $QQQ"}, "entity": {"id": "1303034843911995392", "name": "$TWTR"}}, {"domain": {"id": "166", "name": "Stocks", "description": "for individual and types of stocks, e.g., $TRX, $QQQ"}, "entity": {"id": "1306568690641428481", "name": "$AMD"}}, {"domain": {"id": "166", "name": "Stocks", "description": "for individual and types of stocks, e.g., $TRX, $QQQ"}, "entity": {"id": "1306659423109046272", "name": "$BABA"}}, {"domain": {"id": "166", "name": "Stocks", "description": "for individual and types of stocks, e.g., $TRX, $QQQ"}, "entity": {"id": "1313525755347787776", "name": "$NIO"}}, {"domain": {"id": "174", "name": "Digital Assets & Crypto", "description": "For cryptocurrency entities"}, "entity": {"id": "1139229372198469633", "name": "Dogecoin cryptocurrency"}}, {"domain": {"id": "47", "name": "Brand", "description": "Brands and Companies"}, "entity": {"id": "10045225402", "name": "Twitter"}}, {"domain": {"id": "65", "name": "Interests and Hobbies Vertical", "description": "Top level interests and hobbies groupings, like Food or Travel"}, "entity": {"id": "781974596148793345", "name": "Business & finance"}}, {"domain": {"id": "65", "name": "Interests and Hobbies Vertical", "description": "Top level interests and hobbies groupings, like Food or Travel"}, "entity": {"id": "781974596148793345", "name": "Business & finance"}}, {"domain": {"id": "66", "name": "Interests and Hobbies Category", "description": "A grouping of interests and hobbies entities, like Novelty Food or Destinations"}, "entity": {"id": "847888632711061504", "name": "Personal finance", "description": "Personal finance"}}, {"domain": {"id": "67", "name": "Interests and Hobbies", "description": "Interests, opinions, and behaviors of individuals, groups, or cultures; like Speciality Cooking or Theme Parks"}, "entity": {"id": "847894852779900928", "name": "Stocks & indices", "description": "Stocks"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "847894353708068864", "name": "Investing", "description": "Investing"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "847894852779900928", "name": "Stocks & indices", "description": "Stocks"}}, {"domain": {"id": "29", "name": "Events [Entity Service]", "description": "Real world events. "}, "entity": {"id": "984848492387483648", "name": "TWTR Earnings", "description": "TWTR Earnings"}}, {"domain": {"id": "29", "name": "Events [Entity Service]", "description": "Real world events. "}, "entity": {"id": "788929372342263808", "name": "Cheddar", "description": "Cheddar"}}], "created_at": "2023-01-05T19:08:01.000Z", "attachments": {"media_keys": ["3_1611076914248286208"]}, "id": "1611077073803841538"}], "includes": {"media": [{"type": "photo", "media_key": "3_1611076914248286208", "height": 350, "width": 636, "url": "https://pbs.twimg.com/media/Flux6uuaAAAq9Ap.png"}], "users": [{"profile_image_url": "https://pbs.twimg.com/profile_images/1594440891434934282/KgFqB6dZ_normal.jpg", "created_at": "2020-08-08T05:12:08.000Z", "verified": false, "entities": {"url": {"urls": [{"start": 0, "end": 23, "url": "https://t.co/pDKYJld2vS", "expanded_url": "http://tradealert.pw", "display_url": "tradealert.pw"}]}}, "name": "Trading", "id": "1291963157271863297", "protected": false, "url": "https://t.co/pDKYJld2vS", "location": "USA", "description": "Realtime Alerting, Scanning, Analysis, & Artificially Intelligent Idea Generation Tools for Active Traders, Investors, & Money Managers. Innovating since 2003.", "username": "Trading22971312", "public_metrics": {"followers_count": 809, "following_count": 202, "tweet_count": 2271, "listed_count": 0}}], "tweets": [{"possibly_sensitive": false, "entities": {"urls": [{"start": 119, "end": 142, "url": "https://t.co/H8r2bJrh4i", "expanded_url": "http://go.trade-ideas.com/SHHb", "display_url": "go.trade-ideas.com/SHHb", "unwound_url": "http://go.trade-ideas.com/SHHb"}, {"start": 250, "end": 273, "url": "https://t.co/XfSUInZb0t", "expanded_url": "https://twitter.com/Trading22971312/status/1611077073803841538/photo/1", "display_url": "pic.twitter.com/XfSUInZb0t", "media_key": "3_1611076914248286208"}], "cashtags": [{"start": 176, "end": 180, "tag": "AMD"}, {"start": 191, "end": 195, "tag": "BTC"}, {"start": 197, "end": 202, "tag": "TWTR"}, {"start": 204, "end": 207, "tag": "BA"}, {"start": 209, "end": 214, "tag": "doge"}, {"start": 216, "end": 221, "tag": "baba"}, {"start": 223, "end": 227, "tag": "spy"}, {"start": 229, "end": 233, "tag": "SPX"}, {"start": 234, "end": 238, "tag": "NIO"}, {"start": 239, "end": 243, "tag": "ETC"}, {"start": 244, "end": 249, "tag": "META"}], "hashtags": [{"start": 145, "end": 154, "tag": "business"}, {"start": 156, "end": 160, "tag": "NFT"}, {"start": 162, "end": 174, "tag": "StockMarket"}, {"start": 182, "end": 189, "tag": "option"}], "annotations": [{"start": 157, "end": 159, "probability": 0.5266, "type": "Other", "normalized_text": "NFT"}, {"start": 177, "end": 179, "probability": 0.9137, "type": "Organization", "normalized_text": "AMD"}, {"start": 192, "end": 194, "probability": 0.511, "type": "Other", "normalized_text": "BTC"}, {"start": 198, "end": 201, "probability": 0.5086, "type": "Other", "normalized_text": "TWTR"}, {"start": 205, "end": 206, "probability": 0.3909, "type": "Other", "normalized_text": "BA"}, {"start": 210, "end": 213, "probability": 0.4415, "type": "Organization", "normalized_text": "doge"}, {"start": 217, "end": 220, "probability": 0.4157, "type": "Organization", "normalized_text": "baba"}, {"start": 224, "end": 226, "probability": 0.4884, "type": "Organization", "normalized_text": "spy"}, {"start": 230, "end": 232, "probability": 0.4548, "type": "Other", "normalized_text": "SPX"}, {"start": 235, "end": 237, "probability": 0.517, "type": "Organization", "normalized_text": "NIO"}, {"start": 240, "end": 242, "probability": 0.4181, "type": "Other", "normalized_text": "ETC"}, {"start": 245, "end": 248, "probability": 0.499, "type": "Organization", "normalized_text": "META"}]}, "conversation_id": "1611077073803841538", "edit_history_tweet_ids": ["1611077073803841538"], "author_id": "1291963157271863297", "text": "Now available with every Trade Ideas subscription. Gain confidence and learn how to take action in the markets. \nVia:- https://t.co/H8r2bJrh4i \n\n#business \n#NFT \n#StockMarket \n$AMD \n#option \n$BTC \n$TWTR \n$BA \n$doge \n$baba \n$spy \n$SPX $NIO $ETC $META https://t.co/XfSUInZb0t", "public_metrics": {"retweet_count": 0, "reply_count": 0, "like_count": 1, "quote_count": 0, "impression_count": 52}, "reply_settings": "everyone", "edit_controls": {"edits_remaining": 5, "is_edit_eligible": true, "editable_until": "2023-01-05T19:38:01.000Z"}, "lang": "en", "context_annotations": [{"domain": {"id": "29", "name": "Events [Entity Service]", "description": "Real world events. "}, "entity": {"id": "984848492387483648", "name": "TWTR Earnings", "description": "TWTR Earnings"}}, {"domain": {"id": "46", "name": "Business Taxonomy", "description": "Categories within Brand Verticals that narrow down the scope of Brands"}, "entity": {"id": "1557696848252391426", "name": "Financial Services Business", "description": "Brands, companies, advertisers and every non-person handle with the profit intent related to Banks, Credit cards, Insurance, Investments, Stocks "}}, {"domain": {"id": "46", "name": "Business Taxonomy", "description": "Categories within Brand Verticals that narrow down the scope of Brands"}, "entity": {"id": "1557696940178935808", "name": "Gaming Business", "description": "Brands, companies, advertisers and every non-person handle with the profit intent related to offline and online games such as gaming consoles, tabletop games, video game publishers"}}, {"domain": {"id": "47", "name": "Brand", "description": "Brands and Companies"}, "entity": {"id": "10045225402", "name": "Twitter"}}, {"domain": {"id": "30", "name": "Entities [Entity Service]", "description": "Entity Service top level domain, every item that is in Entity Service should be in this domain"}, "entity": {"id": "1007360414114435072", "name": "Bitcoin cryptocurrency", "description": "Bitcoin Cryptocurrency"}}, {"domain": {"id": "30", "name": "Entities [Entity Service]", "description": "Entity Service top level domain, every item that is in Entity Service should be in this domain"}, "entity": {"id": "1139229372198469633", "name": "Dogecoin cryptocurrency"}}, {"domain": {"id": "47", "name": "Brand", "description": "Brands and Companies"}, "entity": {"id": "1480944077596086275", "name": "Boeing"}}, {"domain": {"id": "47", "name": "Brand", "description": "Brands and Companies"}, "entity": {"id": "1480944078397132804", "name": "Advanced Micro Devices"}}, {"domain": {"id": "65", "name": "Interests and Hobbies Vertical", "description": "Top level interests and hobbies groupings, like Food or Travel"}, "entity": {"id": "781974596148793345", "name": "Business & finance"}}, {"domain": {"id": "66", "name": "Interests and Hobbies Category", "description": "A grouping of interests and hobbies entities, like Novelty Food or Destinations"}, "entity": {"id": "847888632711061504", "name": "Personal finance", "description": "Personal finance"}}, {"domain": {"id": "66", "name": "Interests and Hobbies Category", "description": "A grouping of interests and hobbies entities, like Novelty Food or Destinations"}, "entity": {"id": "913142676819648512", "name": "Cryptocurrencies", "description": "Cryptocurrency"}}, {"domain": {"id": "66", "name": "Interests and Hobbies Category", "description": "A grouping of interests and hobbies entities, like Novelty Food or Destinations"}, "entity": {"id": "1369311988040355840", "name": "NFTs", "description": "Non-fungible tokens"}}, {"domain": {"id": "67", "name": "Interests and Hobbies", "description": "Interests, opinions, and behaviors of individuals, groups, or cultures; like Speciality Cooking or Theme Parks"}, "entity": {"id": "847894630787973120", "name": "Stock options", "description": "Options"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "10045225402", "name": "Twitter"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "781974596148793345", "name": "Business & finance"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "847894353708068864", "name": "Investing", "description": "Investing"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "847894630787973120", "name": "Stock options", "description": "Options"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "847894852779900928", "name": "Stocks & indices", "description": "Stocks"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "864154902926196737", "name": "S&P 500", "description": "S&P 500"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "913142676819648512", "name": "Cryptocurrencies", "description": "Cryptocurrency"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "1007360414114435072", "name": "Bitcoin cryptocurrency", "description": "Bitcoin Cryptocurrency"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "1139229372198469633", "name": "Dogecoin cryptocurrency"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "1301195966125494272", "name": "$BTC"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "1303013359306993665", "name": "$SPY"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "1303024103272509441", "name": "$SPX"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "1303026843365171206", "name": "$DOGE"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "1303032576773255168", "name": "$BA"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "1303034843911995392", "name": "$TWTR"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "1306568690641428481", "name": "$AMD"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "1306659423109046272", "name": "$BABA"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "1313525755347787776", "name": "$NIO"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "1369311988040355840", "name": "NFTs", "description": "Non-fungible tokens"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "1480944077596086275", "name": "Boeing"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "1480944078397132804", "name": "Advanced Micro Devices"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "1484181943616884743", "name": "Cryptocoins"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "1491481998862348291", "name": "Digital asset industry"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "1492162686204854274", "name": "Digital assets & cryptocurrency", "description": "Cryptocurrency"}}, {"domain": {"id": "166", "name": "Stocks", "description": "for individual and types of stocks, e.g., $TRX, $QQQ"}, "entity": {"id": "1301195966125494272", "name": "$BTC"}}, {"domain": {"id": "166", "name": "Stocks", "description": "for individual and types of stocks, e.g., $TRX, $QQQ"}, "entity": {"id": "1303013359306993665", "name": "$SPY"}}, {"domain": {"id": "166", "name": "Stocks", "description": "for individual and types of stocks, e.g., $TRX, $QQQ"}, "entity": {"id": "1303024103272509441", "name": "$SPX"}}, {"domain": {"id": "166", "name": "Stocks", "description": "for individual and types of stocks, e.g., $TRX, $QQQ"}, "entity": {"id": "1303026843365171206", "name": "$DOGE"}}, {"domain": {"id": "166", "name": "Stocks", "description": "for individual and types of stocks, e.g., $TRX, $QQQ"}, "entity": {"id": "1303032576773255168", "name": "$BA"}}, {"domain": {"id": "166", "name": "Stocks", "description": "for individual and types of stocks, e.g., $TRX, $QQQ"}, "entity": {"id": "1303034843911995392", "name": "$TWTR"}}, {"domain": {"id": "166", "name": "Stocks", "description": "for individual and types of stocks, e.g., $TRX, $QQQ"}, "entity": {"id": "1306568690641428481", "name": "$AMD"}}, {"domain": {"id": "166", "name": "Stocks", "description": "for individual and types of stocks, e.g., $TRX, $QQQ"}, "entity": {"id": "1306659423109046272", "name": "$BABA"}}, {"domain": {"id": "166", "name": "Stocks", "description": "for individual and types of stocks, e.g., $TRX, $QQQ"}, "entity": {"id": "1313525755347787776", "name": "$NIO"}}, {"domain": {"id": "174", "name": "Digital Assets & Crypto", "description": "For cryptocurrency entities"}, "entity": {"id": "1139229372198469633", "name": "Dogecoin cryptocurrency"}}, {"domain": {"id": "47", "name": "Brand", "description": "Brands and Companies"}, "entity": {"id": "10045225402", "name": "Twitter"}}, {"domain": {"id": "65", "name": "Interests and Hobbies Vertical", "description": "Top level interests and hobbies groupings, like Food or Travel"}, "entity": {"id": "781974596148793345", "name": "Business & finance"}}, {"domain": {"id": "65", "name": "Interests and Hobbies Vertical", "description": "Top level interests and hobbies groupings, like Food or Travel"}, "entity": {"id": "781974596148793345", "name": "Business & finance"}}, {"domain": {"id": "66", "name": "Interests and Hobbies Category", "description": "A grouping of interests and hobbies entities, like Novelty Food or Destinations"}, "entity": {"id": "847888632711061504", "name": "Personal finance", "description": "Personal finance"}}, {"domain": {"id": "67", "name": "Interests and Hobbies", "description": "Interests, opinions, and behaviors of individuals, groups, or cultures; like Speciality Cooking or Theme Parks"}, "entity": {"id": "847894852779900928", "name": "Stocks & indices", "description": "Stocks"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "847894353708068864", "name": "Investing", "description": "Investing"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "847894852779900928", "name": "Stocks & indices", "description": "Stocks"}}, {"domain": {"id": "29", "name": "Events [Entity Service]", "description": "Real world events. "}, "entity": {"id": "984848492387483648", "name": "TWTR Earnings", "description": "TWTR Earnings"}}, {"domain": {"id": "29", "name": "Events [Entity Service]", "description": "Real world events. "}, "entity": {"id": "788929372342263808", "name": "Cheddar", "description": "Cheddar"}}], "created_at": "2023-01-05T19:08:01.000Z", "attachments": {"media_keys": ["3_1611076914248286208"]}, "id": "1611077073803841538"}]}, "__twarc": {"url": "https://api.twitter.com/2/tweets?expansions=author_id%2Cin_reply_to_user_id%2Creferenced_tweets.id%2Creferenced_tweets.id.author_id%2Centities.mentions.username%2Cattachments.poll_ids%2Cattachments.media_keys%2Cgeo.place_id%2Cedit_history_tweet_ids&tweet.fields=attachments%2Cauthor_id%2Ccontext_annotations%2Cconversation_id%2Ccreated_at%2Centities%2Cgeo%2Cid%2Cin_reply_to_user_id%2Clang%2Cpublic_metrics%2Ctext%2Cpossibly_sensitive%2Creferenced_tweets%2Creply_settings%2Csource%2Cwithheld%2Cedit_controls%2Cedit_history_tweet_ids&user.fields=created_at%2Cdescription%2Centities%2Cid%2Clocation%2Cname%2Cpinned_tweet_id%2Cprofile_image_url%2Cprotected%2Cpublic_metrics%2Curl%2Cusername%2Cverified%2Cwithheld&media.fields=alt_text%2Cduration_ms%2Cheight%2Cmedia_key%2Cpreview_image_url%2Ctype%2Curl%2Cwidth%2Cvariants%2Cpublic_metrics&poll.fields=duration_minutes%2Cend_datetime%2Cid%2Coptions%2Cvoting_status&place.fields=contained_within%2Ccountry%2Ccountry_code%2Cfull_name%2Cgeo%2Cid%2Cname%2Cplace_type&ids=1611077073803841538", "version": "2.13.0", "retrieved_at": "2023-01-05T19:10:27+00:00"}} diff --git a/test-data/compliance.json b/test-data/compliance.json deleted file mode 100644 index ddcb354..0000000 --- a/test-data/compliance.json +++ /dev/null @@ -1,2 +0,0 @@ -{"id":"1170025343920402432","action":"delete","created_at":"2019-09-06T17:25:54.667Z","reason":"protected"} -{"id":"1170147183095664640","action":"delete","created_at":"2019-09-07T01:30:03.390Z","redacted_at":"2021-08-17T17:26:15.788Z","reason":"deactivated"} diff --git a/test-data/many_urls.jsonl b/test-data/many_urls.jsonl new file mode 100644 index 0000000..2295fac --- /dev/null +++ b/test-data/many_urls.jsonl @@ -0,0 +1 @@ +{"data": [{"lang": "en", "entities": {"urls": [{"start": 217, "end": 240, "url": "https://t.co/C73mC2F5nY", "expanded_url": "https://uiadrian.gumroad.com/l/web-design-guide/?offer_code=twk0uc9f7", "display_url": "uiadrian.gumroad.com/l/web-design-g\u2026", "images": [{"url": "https://pbs.twimg.com/news_img/1610692634191826963/RsfNDpnW?format=jpg&name=orig", "width": 1005, "height": 565}, {"url": "https://pbs.twimg.com/news_img/1610692634191826963/RsfNDpnW?format=jpg&name=150x150", "width": 150, "height": 150}], "status": 200, "title": "The Ultimate Guide to Web Design (Landing Page UI Kit + Free Bonuses)", "description": "\ud83c\udf84 Christmas Promotion - $39 $29 \ud83c\udf84Learn how to become an ultimate web designer - from running a freelance business to learning design theory and web design processes to mastering design handoff \ud83d\udcddThe Ultimate Guide to Web Design is perfect for...\u2705 Web Designers \u2014 Learn new things about web design, practice with the Figma files, and take your skills to the next level\u2705 Freelance Designers \u2014 Simplify your workflow, get access to freelance documents like questionnaires, briefs, and design contracts\u2705 Web Developers \u2014 Learn the fundamentals of UI design, typography, color, and principles of visual perception. Don\u2019t rely on your designers as much!\u2705 Every creative person \u2014 Wanting to master the craft of web design!What will you find inside? \ud83d\udce6This e-book is divided into 6 parts and 33 chapters, counting 340 pages. Here\u2019s what\u2019s included:\ud83e\uddf3 Part 1: The Freelance GuideThe first part of this e-book will show you behind the scenes of running a freelance business. I go over my personal story, go i", "unwound_url": "https://uiadrian.gumroad.com/l/web-design-guide/?offer_code=twk0uc9f7"}]}, "author_id": "1328271312628936705", "edit_history_tweet_ids": ["1610692575752601600"], "possibly_sensitive": false, "created_at": "2023-01-04T17:40:09.000Z", "reply_settings": "everyone", "referenced_tweets": [{"type": "replied_to", "id": "1610623087061471232"}], "id": "1610692575752601600", "in_reply_to_user_id": "1328271312628936705", "conversation_id": "1610623087061471232", "edit_controls": {"edits_remaining": 5, "is_edit_eligible": false, "editable_until": "2023-01-04T18:10:09.000Z"}, "public_metrics": {"retweet_count": 2, "reply_count": 1, "like_count": 13, "quote_count": 0}, "text": "P.S. If you're interested in learning web design - check out my latest web design ebook\n\n\ud83c\udf81 340+ pages, UI kits, Practice files, freelance documents, design framework, and much more! \ud83c\udf81\n\nGrab it 35% OFF this week only\ud83d\udc47\nhttps://t.co/C73mC2F5nY", "context_annotations": [{"domain": {"id": "46", "name": "Business Taxonomy", "description": "Categories within Brand Verticals that narrow down the scope of Brands"}, "entity": {"id": "1557697121477832705", "name": "Publisher & News Business", "description": "Brands, companies, advertisers and every non-person handle with the profit intent related to marketing and advertiser agencies, publishers of magazines, newspapers, blogs, books"}}, {"domain": {"id": "46", "name": "Business Taxonomy", "description": "Categories within Brand Verticals that narrow down the scope of Brands"}, "entity": {"id": "1557697333571112960", "name": "Technology Business", "description": "Brands, companies, advertisers and every non-person handle with the profit intent related to softwares, apps, communication equipments, hardwares"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "1462809883242205187", "name": "E-books"}}, {"domain": {"id": "173", "name": "Product Taxonomy", "description": "A product taxonomy for labeling categories of products. "}, "entity": {"id": "1462809883242205187", "name": "E-books"}}, {"domain": {"id": "30", "name": "Entities [Entity Service]", "description": "Entity Service top level domain, every item that is in Entity Service should be in this domain"}, "entity": {"id": "848920371311001600", "name": "Technology", "description": "Technology and computing"}}, {"domain": {"id": "66", "name": "Interests and Hobbies Category", "description": "A grouping of interests and hobbies entities, like Novelty Food or Destinations"}, "entity": {"id": "848921413196984320", "name": "Computer programming", "description": "Computer programming"}}, {"domain": {"id": "66", "name": "Interests and Hobbies Category", "description": "A grouping of interests and hobbies entities, like Novelty Food or Destinations"}, "entity": {"id": "849075668352499712", "name": "Web design", "description": "Web design"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "848921413196984320", "name": "Computer programming", "description": "Computer programming"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "849075668352499712", "name": "Web design", "description": "Web design"}}]}], "includes": {"users": [{"protected": false, "created_at": "2020-11-16T09:39:00.000Z", "description": "Design Lead \u2726 Community of 165k+ designers on IG \u2726 Teaching others how to design better with my design eBooks @ https://t.co/8Hp0dTkOvm", "url": "https://t.co/WJlGcHICnX", "public_metrics": {"followers_count": 42160, "following_count": 119, "tweet_count": 1319, "listed_count": 985}, "profile_image_url": "https://pbs.twimg.com/profile_images/1601946751170142209/1Qy54Ow6_normal.jpg", "name": "UI Adrian", "username": "uiuxadrian", "verified": false, "entities": {"url": {"urls": [{"start": 0, "end": 23, "url": "https://t.co/WJlGcHICnX", "expanded_url": "https://uiadrian.gumroad.com/l/design-manual/?offer_code=100k-special", "display_url": "uiadrian.gumroad.com/l/design-manua\u2026"}]}, "description": {"urls": [{"start": 112, "end": 135, "url": "https://t.co/8Hp0dTkOvm", "expanded_url": "http://uiadrian.gumroad.com", "display_url": "uiadrian.gumroad.com"}]}}, "location": "Design Manual 50% OFF \ud83d\udc49", "pinned_tweet_id": "1593236563672047616", "id": "1328271312628936705"}], "tweets": [{"lang": "en", "entities": {"urls": [{"start": 217, "end": 240, "url": "https://t.co/C73mC2F5nY", "expanded_url": "https://uiadrian.gumroad.com/l/web-design-guide/?offer_code=twk0uc9f7", "display_url": "uiadrian.gumroad.com/l/web-design-g\u2026", "images": [{"url": "https://pbs.twimg.com/news_img/1610692634191826963/RsfNDpnW?format=jpg&name=orig", "width": 1005, "height": 565}, {"url": "https://pbs.twimg.com/news_img/1610692634191826963/RsfNDpnW?format=jpg&name=150x150", "width": 150, "height": 150}], "status": 200, "title": "The Ultimate Guide to Web Design (Landing Page UI Kit + Free Bonuses)", "description": "\ud83c\udf84 Christmas Promotion - $39 $29 \ud83c\udf84Learn how to become an ultimate web designer - from running a freelance business to learning design theory and web design processes to mastering design handoff \ud83d\udcddThe Ultimate Guide to Web Design is perfect for...\u2705 Web Designers \u2014 Learn new things about web design, practice with the Figma files, and take your skills to the next level\u2705 Freelance Designers \u2014 Simplify your workflow, get access to freelance documents like questionnaires, briefs, and design contracts\u2705 Web Developers \u2014 Learn the fundamentals of UI design, typography, color, and principles of visual perception. Don\u2019t rely on your designers as much!\u2705 Every creative person \u2014 Wanting to master the craft of web design!What will you find inside? \ud83d\udce6This e-book is divided into 6 parts and 33 chapters, counting 340 pages. Here\u2019s what\u2019s included:\ud83e\uddf3 Part 1: The Freelance GuideThe first part of this e-book will show you behind the scenes of running a freelance business. I go over my personal story, go i", "unwound_url": "https://uiadrian.gumroad.com/l/web-design-guide/?offer_code=twk0uc9f7"}]}, "author_id": "1328271312628936705", "edit_history_tweet_ids": ["1610692575752601600"], "possibly_sensitive": false, "created_at": "2023-01-04T17:40:09.000Z", "reply_settings": "everyone", "referenced_tweets": [{"type": "replied_to", "id": "1610623087061471232"}], "id": "1610692575752601600", "in_reply_to_user_id": "1328271312628936705", "conversation_id": "1610623087061471232", "edit_controls": {"edits_remaining": 5, "is_edit_eligible": false, "editable_until": "2023-01-04T18:10:09.000Z"}, "public_metrics": {"retweet_count": 2, "reply_count": 1, "like_count": 13, "quote_count": 0}, "text": "P.S. If you're interested in learning web design - check out my latest web design ebook\n\n\ud83c\udf81 340+ pages, UI kits, Practice files, freelance documents, design framework, and much more! \ud83c\udf81\n\nGrab it 35% OFF this week only\ud83d\udc47\nhttps://t.co/C73mC2F5nY", "context_annotations": [{"domain": {"id": "46", "name": "Business Taxonomy", "description": "Categories within Brand Verticals that narrow down the scope of Brands"}, "entity": {"id": "1557697121477832705", "name": "Publisher & News Business", "description": "Brands, companies, advertisers and every non-person handle with the profit intent related to marketing and advertiser agencies, publishers of magazines, newspapers, blogs, books"}}, {"domain": {"id": "46", "name": "Business Taxonomy", "description": "Categories within Brand Verticals that narrow down the scope of Brands"}, "entity": {"id": "1557697333571112960", "name": "Technology Business", "description": "Brands, companies, advertisers and every non-person handle with the profit intent related to softwares, apps, communication equipments, hardwares"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "1462809883242205187", "name": "E-books"}}, {"domain": {"id": "173", "name": "Product Taxonomy", "description": "A product taxonomy for labeling categories of products. "}, "entity": {"id": "1462809883242205187", "name": "E-books"}}, {"domain": {"id": "30", "name": "Entities [Entity Service]", "description": "Entity Service top level domain, every item that is in Entity Service should be in this domain"}, "entity": {"id": "848920371311001600", "name": "Technology", "description": "Technology and computing"}}, {"domain": {"id": "66", "name": "Interests and Hobbies Category", "description": "A grouping of interests and hobbies entities, like Novelty Food or Destinations"}, "entity": {"id": "848921413196984320", "name": "Computer programming", "description": "Computer programming"}}, {"domain": {"id": "66", "name": "Interests and Hobbies Category", "description": "A grouping of interests and hobbies entities, like Novelty Food or Destinations"}, "entity": {"id": "849075668352499712", "name": "Web design", "description": "Web design"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "848921413196984320", "name": "Computer programming", "description": "Computer programming"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "849075668352499712", "name": "Web design", "description": "Web design"}}]}, {"lang": "en", "author_id": "1328271312628936705", "edit_history_tweet_ids": ["1610623087061471232"], "entities": {"annotations": [{"start": 6, "end": 7, "probability": 0.3821, "type": "Other", "normalized_text": "UX"}, {"start": 9, "end": 10, "probability": 0.3739, "type": "Organization", "normalized_text": "UI"}, {"start": 54, "end": 60, "probability": 0.474, "type": "Other", "normalized_text": "pixabay"}]}, "possibly_sensitive": false, "created_at": "2023-01-04T13:04:02.000Z", "reply_settings": "everyone", "id": "1610623087061471232", "conversation_id": "1610623087061471232", "edit_controls": {"edits_remaining": 5, "is_edit_eligible": false, "editable_until": "2023-01-04T13:34:02.000Z"}, "public_metrics": {"retweet_count": 195, "reply_count": 18, "like_count": 611, "quote_count": 0}, "text": "Every UX/UI designer should know these websites!\n\n1\ufe0f\u20e3 pixabay .com - huge collection of free images and videos\n2\ufe0f\u20e3 uidesigndaily .com - lots of free figma resources \n3\ufe0f\u20e3 designsystems .com - a great resource for building design systems\n\nShare it with your friends! Thanks \ud83d\udc99", "context_annotations": [{"domain": {"id": "46", "name": "Business Taxonomy", "description": "Categories within Brand Verticals that narrow down the scope of Brands"}, "entity": {"id": "1557697333571112960", "name": "Technology Business", "description": "Brands, companies, advertisers and every non-person handle with the profit intent related to softwares, apps, communication equipments, hardwares"}}, {"domain": {"id": "30", "name": "Entities [Entity Service]", "description": "Entity Service top level domain, every item that is in Entity Service should be in this domain"}, "entity": {"id": "848920371311001600", "name": "Technology", "description": "Technology and computing"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "848921413196984320", "name": "Computer programming", "description": "Computer programming"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "1357430100367605760", "name": "UX design"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "1566812284659200000", "name": "UI design"}}, {"domain": {"id": "165", "name": "Technology", "description": "for individual and types of technology, e.g., food technology, 3D printing"}, "entity": {"id": "848920371311001600", "name": "Technology", "description": "Technology and computing"}}]}]}, "__twarc": {"url": "https://api.twitter.com/2/tweets?expansions=author_id%2Cin_reply_to_user_id%2Creferenced_tweets.id%2Creferenced_tweets.id.author_id%2Centities.mentions.username%2Cattachments.poll_ids%2Cattachments.media_keys%2Cgeo.place_id%2Cedit_history_tweet_ids&tweet.fields=attachments%2Cauthor_id%2Ccontext_annotations%2Cconversation_id%2Ccreated_at%2Centities%2Cgeo%2Cid%2Cin_reply_to_user_id%2Clang%2Cpublic_metrics%2Ctext%2Cpossibly_sensitive%2Creferenced_tweets%2Creply_settings%2Csource%2Cwithheld%2Cedit_controls%2Cedit_history_tweet_ids&user.fields=created_at%2Cdescription%2Centities%2Cid%2Clocation%2Cname%2Cpinned_tweet_id%2Cprofile_image_url%2Cprotected%2Cpublic_metrics%2Curl%2Cusername%2Cverified%2Cwithheld&media.fields=alt_text%2Cduration_ms%2Cheight%2Cmedia_key%2Cpreview_image_url%2Ctype%2Curl%2Cwidth%2Cvariants%2Cpublic_metrics&poll.fields=duration_minutes%2Cend_datetime%2Cid%2Coptions%2Cvoting_status&place.fields=contained_within%2Ccountry%2Ccountry_code%2Cfull_name%2Cgeo%2Cid%2Cname%2Cplace_type&ids=1610692575752601600", "version": "2.12.1", "retrieved_at": "2023-01-05T16:25:09+00:00"}} diff --git a/test-data/media_tweet.jsonl b/test-data/media_tweet.jsonl new file mode 100644 index 0000000..63243f7 --- /dev/null +++ b/test-data/media_tweet.jsonl @@ -0,0 +1 @@ +{"data": [{"text": "She's GOING with it! \ud83d\udc0d https://t.co/uXqSAGWkWJ", "id": "1558553189766631429", "attachments": {"media_keys": ["7_1558552952549277696"]}, "edit_controls": {"edits_remaining": 5, "is_edit_eligible": false, "editable_until": "2022-08-13T21:06:51.000Z"}, "public_metrics": {"retweet_count": 13, "reply_count": 3, "like_count": 93, "quote_count": 7}, "author_id": "495430242", "possibly_sensitive": false, "lang": "en", "conversation_id": "1558553189766631429", "created_at": "2022-08-13T20:36:51.000Z", "entities": {"urls": [{"start": 23, "end": 46, "url": "https://t.co/uXqSAGWkWJ", "expanded_url": "https://twitter.com/IgorBrigadir/status/1558553189766631429/video/1", "display_url": "pic.twitter.com/uXqSAGWkWJ", "media_key": "7_1558552952549277696"}]}, "reply_settings": "everyone", "edit_history_tweet_ids": ["1558553189766631429"]}], "includes": {"media": [{"media_key": "7_1558552952549277696", "variants": [{"bit_rate": 256000, "content_type": "video/mp4", "url": "https://video.twimg.com/ext_tw_video/1558552952549277696/pu/vid/480x270/BWnQG9k42jhjykdl.mp4?tag=12"}, {"content_type": "application/x-mpegURL", "url": "https://video.twimg.com/ext_tw_video/1558552952549277696/pu/pl/lFur_lI8dUa4WhQh.m3u8?tag=12&container=fmp4"}, {"bit_rate": 2176000, "content_type": "video/mp4", "url": "https://video.twimg.com/ext_tw_video/1558552952549277696/pu/vid/1280x720/WiZrNZKT2lMzK5zA.mp4?tag=12"}, {"bit_rate": 832000, "content_type": "video/mp4", "url": "https://video.twimg.com/ext_tw_video/1558552952549277696/pu/vid/640x360/kvgUlpiZ5LnjJytM.mp4?tag=12"}], "public_metrics": {"view_count": 19059}, "type": "video", "duration_ms": 71292, "preview_image_url": "https://pbs.twimg.com/ext_tw_video_thumb/1558552952549277696/pu/img/7VaQu-AvReI8RHkv.jpg", "width": 1280, "height": 720}], "users": [{"created_at": "2012-02-17T23:34:50.000Z", "name": "Igor Brigadir \ud83c\uddfa\ud83c\udde6 @igor@mastodon.social", "profile_image_url": "https://pbs.twimg.com/profile_images/2538946114/xiveugt78rc97y1dasxf_normal.jpeg", "url": "https://t.co/rT5vLfqHOC", "location": "Ireland", "id": "495430242", "entities": {"url": {"urls": [{"start": 0, "end": 23, "url": "https://t.co/rT5vLfqHOC", "expanded_url": "https://ukrainewar.carrd.co/", "display_url": "ukrainewar.carrd.co"}]}, "description": {"mentions": [{"start": 4, "end": 15, "username": "recsyslabs"}, {"start": 41, "end": 51, "username": "ucddublin"}, {"start": 57, "end": 72, "username": "insight_centre"}]}}, "protected": false, "verified": false, "description": "CTO @recsyslabs. Adjunct Research Fellow @ucddublin. PhD @insight_centre. Recommender Systems, Information Retrieval, Data Science, NLProc, ML, AI. he/him \udb40\udc00\udb40\udc00", "pinned_tweet_id": "1496861088519897090", "public_metrics": {"followers_count": 3560, "following_count": 5000, "tweet_count": 35099, "listed_count": 183}, "username": "IgorBrigadir"}], "tweets": [{"text": "She's GOING with it! \ud83d\udc0d https://t.co/uXqSAGWkWJ", "id": "1558553189766631429", "attachments": {"media_keys": ["7_1558552952549277696"]}, "edit_controls": {"edits_remaining": 5, "is_edit_eligible": false, "editable_until": "2022-08-13T21:06:51.000Z"}, "public_metrics": {"retweet_count": 13, "reply_count": 3, "like_count": 93, "quote_count": 7}, "author_id": "495430242", "possibly_sensitive": false, "lang": "en", "conversation_id": "1558553189766631429", "created_at": "2022-08-13T20:36:51.000Z", "entities": {"urls": [{"start": 23, "end": 46, "url": "https://t.co/uXqSAGWkWJ", "expanded_url": "https://twitter.com/IgorBrigadir/status/1558553189766631429/video/1", "display_url": "pic.twitter.com/uXqSAGWkWJ", "media_key": "7_1558552952549277696"}]}, "reply_settings": "everyone", "edit_history_tweet_ids": ["1558553189766631429"]}]}, "__twarc": {"url": "https://api.twitter.com/2/tweets?expansions=author_id%2Cin_reply_to_user_id%2Creferenced_tweets.id%2Creferenced_tweets.id.author_id%2Centities.mentions.username%2Cattachments.poll_ids%2Cattachments.media_keys%2Cgeo.place_id%2Cedit_history_tweet_ids&tweet.fields=attachments%2Cauthor_id%2Ccontext_annotations%2Cconversation_id%2Ccreated_at%2Centities%2Cgeo%2Cid%2Cin_reply_to_user_id%2Clang%2Cpublic_metrics%2Ctext%2Cpossibly_sensitive%2Creferenced_tweets%2Creply_settings%2Csource%2Cwithheld%2Cedit_controls%2Cedit_history_tweet_ids&user.fields=created_at%2Cdescription%2Centities%2Cid%2Clocation%2Cname%2Cpinned_tweet_id%2Cprofile_image_url%2Cprotected%2Cpublic_metrics%2Curl%2Cusername%2Cverified%2Cwithheld&media.fields=alt_text%2Cduration_ms%2Cheight%2Cmedia_key%2Cpreview_image_url%2Ctype%2Curl%2Cwidth%2Cvariants%2Cpublic_metrics&poll.fields=duration_minutes%2Cend_datetime%2Cid%2Coptions%2Cvoting_status&place.fields=contained_within%2Ccountry%2Ccountry_code%2Cfull_name%2Cgeo%2Cid%2Cname%2Cplace_type&ids=1558553189766631429", "version": "2.12.1", "retrieved_at": "2023-01-05T13:55:17+00:00"}} diff --git a/test-data/mentions_tweet.jsonl b/test-data/mentions_tweet.jsonl new file mode 100644 index 0000000..7572d0e --- /dev/null +++ b/test-data/mentions_tweet.jsonl @@ -0,0 +1 @@ +{"data": [{"entities": {"mentions": [{"start": 3, "end": 14, "username": "S3NT1N3L17", "id": "1221317221714157569"}]}, "public_metrics": {"retweet_count": 1, "reply_count": 0, "like_count": 0, "quote_count": 0}, "text": "RT @S3NT1N3L17: Looking for gaming mutuals (and I usually follow back unless you\u2019ve got a few dozen tweets and your account is a couple of\u2026", "reply_settings": "everyone", "created_at": "2022-11-07T18:11:04.000Z", "referenced_tweets": [{"type": "retweeted", "id": "1589671906718134272"}], "conversation_id": "1589681856941002753", "context_annotations": [{"domain": {"id": "46", "name": "Business Taxonomy", "description": "Categories within Brand Verticals that narrow down the scope of Brands"}, "entity": {"id": "1557696940178935808", "name": "Gaming Business", "description": "Brands, companies, advertisers and every non-person handle with the profit intent related to offline and online games such as gaming consoles, tabletop games, video game publishers"}}, {"domain": {"id": "30", "name": "Entities [Entity Service]", "description": "Entity Service top level domain, every item that is in Entity Service should be in this domain"}, "entity": {"id": "781974596752842752", "name": "Services"}}, {"domain": {"id": "47", "name": "Brand", "description": "Brands and Companies"}, "entity": {"id": "10045225402", "name": "Twitter"}}], "lang": "en", "edit_controls": {"edits_remaining": 5, "is_edit_eligible": false, "editable_until": "2022-11-07T18:41:04.000Z"}, "author_id": "68401342", "id": "1589681856941002753", "edit_history_tweet_ids": ["1589681856941002753"], "possibly_sensitive": false}], "includes": {"users": [{"public_metrics": {"followers_count": 569, "following_count": 457, "tweet_count": 12978, "listed_count": 6}, "profile_image_url": "https://pbs.twimg.com/profile_images/1609369908797661185/1IxrH_K2_normal.jpg", "verified": false, "location": "North West, England", "description": "Xbox & PlayStation gamer, podcast panel member on @XBLpartypodcast & host of PSNpartypodcast\u2026member of @StasisChat GT:BaldManGamer PSN: BaldManGamer86", "entities": {"description": {"mentions": [{"start": 50, "end": 66, "username": "XBLpartypodcast"}, {"start": 103, "end": 114, "username": "StasisChat"}]}}, "created_at": "2009-08-24T13:00:08.000Z", "name": "BmG", "username": "BaldManGamer", "protected": false, "id": "68401342"}, {"public_metrics": {"followers_count": 1686, "following_count": 1550, "tweet_count": 64138, "listed_count": 11}, "profile_image_url": "https://pbs.twimg.com/profile_images/1578874883588726790/XvClYZ29_normal.jpg", "verified": false, "description": "Xbox/PS Gamer, YouTuber, Indie Author, Army Vet. Married w/ 3 teenagers. Co-host of PSN Party Podcast & panel member of @StasisChat, also on @xblpartypodcast", "entities": {"url": {"urls": [{"start": 0, "end": 23, "url": "https://t.co/iinuPK5VSw", "expanded_url": "https://linktr.ee/s3nt1n3l17", "display_url": "linktr.ee/s3nt1n3l17"}]}, "description": {"mentions": [{"start": 120, "end": 131, "username": "StasisChat"}, {"start": 141, "end": 157, "username": "xblpartypodcast"}]}}, "created_at": "2020-01-26T06:21:42.000Z", "pinned_tweet_id": "1588226833199366144", "name": "Sentinel 17 \ud83c\udfae\ud83d\udcd6\ud83c\uddfa\ud83c\uddf8\ud83e\ude96", "username": "S3NT1N3L17", "url": "https://t.co/iinuPK5VSw", "protected": false, "id": "1221317221714157569"}], "tweets": [{"entities": {"mentions": [{"start": 3, "end": 14, "username": "S3NT1N3L17", "id": "1221317221714157569"}]}, "public_metrics": {"retweet_count": 1, "reply_count": 0, "like_count": 0, "quote_count": 0}, "text": "RT @S3NT1N3L17: Looking for gaming mutuals (and I usually follow back unless you\u2019ve got a few dozen tweets and your account is a couple of\u2026", "reply_settings": "everyone", "created_at": "2022-11-07T18:11:04.000Z", "referenced_tweets": [{"type": "retweeted", "id": "1589671906718134272"}], "conversation_id": "1589681856941002753", "context_annotations": [{"domain": {"id": "46", "name": "Business Taxonomy", "description": "Categories within Brand Verticals that narrow down the scope of Brands"}, "entity": {"id": "1557696940178935808", "name": "Gaming Business", "description": "Brands, companies, advertisers and every non-person handle with the profit intent related to offline and online games such as gaming consoles, tabletop games, video game publishers"}}, {"domain": {"id": "30", "name": "Entities [Entity Service]", "description": "Entity Service top level domain, every item that is in Entity Service should be in this domain"}, "entity": {"id": "781974596752842752", "name": "Services"}}, {"domain": {"id": "47", "name": "Brand", "description": "Brands and Companies"}, "entity": {"id": "10045225402", "name": "Twitter"}}], "lang": "en", "edit_controls": {"edits_remaining": 5, "is_edit_eligible": false, "editable_until": "2022-11-07T18:41:04.000Z"}, "author_id": "68401342", "id": "1589681856941002753", "edit_history_tweet_ids": ["1589681856941002753"], "possibly_sensitive": false}, {"entities": {"mentions": [{"start": 148, "end": 153, "username": "Xbox", "id": "24742040"}, {"start": 154, "end": 165, "username": "masseffect", "id": "20747847"}, {"start": 166, "end": 174, "username": "Ubisoft", "id": "14922225"}, {"start": 175, "end": 189, "username": "RockstarGames", "id": "29758446"}, {"start": 190, "end": 201, "username": "tombraider", "id": "98616286"}, {"start": 202, "end": 213, "username": "CallofDuty", "id": "290097288"}, {"start": 214, "end": 230, "username": "SonySantaMonica", "id": "382823424"}, {"start": 231, "end": 243, "username": "Naughty_Dog", "id": "15222083"}], "annotations": [{"start": 246, "end": 249, "probability": 0.8662, "type": "Product", "normalized_text": "Xbox"}, {"start": 252, "end": 262, "probability": 0.9271, "type": "Product", "normalized_text": "PlayStation"}], "hashtags": [{"start": 245, "end": 250, "tag": "Xbox"}, {"start": 251, "end": 263, "tag": "PlayStation"}, {"start": 264, "end": 271, "tag": "gaming"}], "urls": [{"start": 272, "end": 295, "url": "https://t.co/X8D4rbk47i", "expanded_url": "https://twitter.com/i/web/status/1589671906718134272", "display_url": "twitter.com/i/web/status/1\u2026"}, {"start": 296, "end": 319, "url": "https://t.co/nyzCbH5jW2", "expanded_url": "https://twitter.com/S3NT1N3L17/status/1589671906718134272/photo/1", "display_url": "pic.twitter.com/nyzCbH5jW2", "media_key": "16_1589671894021816321"}]}, "public_metrics": {"retweet_count": 1, "reply_count": 1, "like_count": 6, "quote_count": 0}, "text": "Looking for gaming mutuals (and I usually follow back unless you\u2019ve got a few dozen tweets and your account is a couple of minutes old \ud83d\ude09)\n\nFan of: \n@Xbox @masseffect @Ubisoft @RockstarGames @tombraider @CallofDuty @SonySantaMonica @Naughty_Dog \n#Xbox #PlayStation #gaming https://t.co/X8D4rbk47i https://t.co/nyzCbH5jW2", "reply_settings": "everyone", "created_at": "2022-11-07T17:31:31.000Z", "conversation_id": "1589671906718134272", "context_annotations": [{"domain": {"id": "45", "name": "Brand Vertical", "description": "Top level entities that describe a Brands industry"}, "entity": {"id": "781974597310615553", "name": "Entertainment"}}, {"domain": {"id": "30", "name": "Entities [Entity Service]", "description": "Entity Service top level domain, every item that is in Entity Service should be in this domain"}, "entity": {"id": "781974597218340864", "name": "Video Games - Entertainment"}}, {"domain": {"id": "46", "name": "Business Taxonomy", "description": "Categories within Brand Verticals that narrow down the scope of Brands"}, "entity": {"id": "1557696802391859201", "name": "Entertainment & Leisure Business", "description": "Brands, companies, advertisers and every non-person handle with the profit intent related to movies, music, television, franchises, venues, theme parks, toys, tourism, hotels"}}, {"domain": {"id": "46", "name": "Business Taxonomy", "description": "Categories within Brand Verticals that narrow down the scope of Brands"}, "entity": {"id": "1557696940178935808", "name": "Gaming Business", "description": "Brands, companies, advertisers and every non-person handle with the profit intent related to offline and online games such as gaming consoles, tabletop games, video game publishers"}}, {"domain": {"id": "47", "name": "Brand", "description": "Brands and Companies"}, "entity": {"id": "10026653538", "name": "Ubisoft"}}, {"domain": {"id": "71", "name": "Video Game", "description": "A video game like Overwatch"}, "entity": {"id": "10027933222", "name": "Call of Duty", "description": "This entity includes all conversation about the franchise, as well as any individual installments in the series, if applicable.\t\t\t"}}, {"domain": {"id": "71", "name": "Video Game", "description": "A video game like Overwatch"}, "entity": {"id": "10041743512", "name": "Tomb Raider", "description": "This entity includes all conversation about the franchise, as well as any individual installments in the series, if applicable. "}}, {"domain": {"id": "71", "name": "Video Game", "description": "A video game like Overwatch"}, "entity": {"id": "10045544167", "name": "Mass Effect", "description": "This entity includes all conversation about the franchise, as well as any individual installments in the series, if applicable.\t\t\t"}}, {"domain": {"id": "71", "name": "Video Game", "description": "A video game like Overwatch"}, "entity": {"id": "997093174341337088", "name": "Call of Duty: Black Ops 4", "description": "Call of Duty: Black Ops 4"}}, {"domain": {"id": "78", "name": "Video Game Publisher", "description": "A Video Game Publisher like 'Activision'"}, "entity": {"id": "10026653538", "name": "Ubisoft"}}, {"domain": {"id": "78", "name": "Video Game Publisher", "description": "A Video Game Publisher like 'Activision'"}, "entity": {"id": "991743180566478848", "name": "Rockstar Games", "description": "Rockstar Games"}}, {"domain": {"id": "130", "name": "Multimedia Franchise", "description": "Franchises which span multiple forms of media like 'Harry Potter'"}, "entity": {"id": "10041743512", "name": "Tomb Raider", "description": "This entity includes all conversation about the franchise, as well as any individual installments in the series, if applicable. "}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "10026653538", "name": "Ubisoft"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "10027933222", "name": "Call of Duty", "description": "This entity includes all conversation about the franchise, as well as any individual installments in the series, if applicable.\t\t\t"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "10041743512", "name": "Tomb Raider", "description": "This entity includes all conversation about the franchise, as well as any individual installments in the series, if applicable. "}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "10045544167", "name": "Mass Effect", "description": "This entity includes all conversation about the franchise, as well as any individual installments in the series, if applicable.\t\t\t"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "847527650667094017", "name": "Gaming", "description": "Gaming"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "991743180566478848", "name": "Rockstar Games", "description": "Rockstar Games"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "1070028159964262400", "name": "Video games"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "1070028312276127746", "name": "Game development", "description": "Video Game Publisher"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "1146103171086819328", "name": "Gaming influencers", "description": "Gaming Personalities & Esports Players"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "1197906382529294337", "name": "Competitive games"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "1287879055191052288", "name": "Gaming content creators"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "1403427158953922560", "name": "Shooting games"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "1491898787358408746", "name": "Sci-fi games"}}, {"domain": {"id": "45", "name": "Brand Vertical", "description": "Top level entities that describe a Brands industry"}, "entity": {"id": "781974597310615553", "name": "Entertainment"}}, {"domain": {"id": "30", "name": "Entities [Entity Service]", "description": "Entity Service top level domain, every item that is in Entity Service should be in this domain"}, "entity": {"id": "781974596752842752", "name": "Services"}}, {"domain": {"id": "30", "name": "Entities [Entity Service]", "description": "Entity Service top level domain, every item that is in Entity Service should be in this domain"}, "entity": {"id": "781974597218340864", "name": "Video Games - Entertainment"}}, {"domain": {"id": "47", "name": "Brand", "description": "Brands and Companies"}, "entity": {"id": "10027048851", "name": "Xbox"}}, {"domain": {"id": "47", "name": "Brand", "description": "Brands and Companies"}, "entity": {"id": "10027048853", "name": "PlayStation"}}, {"domain": {"id": "47", "name": "Brand", "description": "Brands and Companies"}, "entity": {"id": "10045225402", "name": "Twitter"}}, {"domain": {"id": "79", "name": "Video Game Hardware", "description": "Video Game Hardware"}, "entity": {"id": "10027048851", "name": "Xbox"}}, {"domain": {"id": "79", "name": "Video Game Hardware", "description": "Video Game Hardware"}, "entity": {"id": "10027048853", "name": "PlayStation"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "10027048851", "name": "Xbox"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "10027048853", "name": "PlayStation"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "847527650667094017", "name": "Gaming", "description": "Gaming"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "1070028159964262400", "name": "Video games"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "1070028376662888448", "name": "Console gaming", "description": "Video Game Hardware"}}], "lang": "en", "edit_controls": {"edits_remaining": 5, "is_edit_eligible": false, "editable_until": "2022-11-07T18:01:31.000Z"}, "author_id": "1221317221714157569", "id": "1589671906718134272", "edit_history_tweet_ids": ["1589671906718134272"], "attachments": {"media_keys": ["16_1589671894021816321", "16_1589671894013321222", "16_1589671894009126913", "16_1589671894223126529"]}, "possibly_sensitive": false}]}, "__twarc": {"url": "https://api.twitter.com/2/tweets?expansions=author_id%2Cin_reply_to_user_id%2Creferenced_tweets.id%2Creferenced_tweets.id.author_id%2Centities.mentions.username%2Cattachments.poll_ids%2Cattachments.media_keys%2Cgeo.place_id%2Cedit_history_tweet_ids&tweet.fields=attachments%2Cauthor_id%2Ccontext_annotations%2Cconversation_id%2Ccreated_at%2Centities%2Cgeo%2Cid%2Cin_reply_to_user_id%2Clang%2Cpublic_metrics%2Ctext%2Cpossibly_sensitive%2Creferenced_tweets%2Creply_settings%2Csource%2Cwithheld%2Cedit_controls%2Cedit_history_tweet_ids&user.fields=created_at%2Cdescription%2Centities%2Cid%2Clocation%2Cname%2Cpinned_tweet_id%2Cprofile_image_url%2Cprotected%2Cpublic_metrics%2Curl%2Cusername%2Cverified%2Cwithheld&media.fields=alt_text%2Cduration_ms%2Cheight%2Cmedia_key%2Cpreview_image_url%2Ctype%2Curl%2Cwidth%2Cvariants%2Cpublic_metrics&poll.fields=duration_minutes%2Cend_datetime%2Cid%2Coptions%2Cvoting_status&place.fields=contained_within%2Ccountry%2Ccountry_code%2Cfull_name%2Cgeo%2Cid%2Cname%2Cplace_type&ids=1589681856941002753", "version": "2.13.0", "retrieved_at": "2023-01-05T17:34:04+00:00"}} diff --git a/test-data/verified_type.jsonl b/test-data/verified_type.jsonl new file mode 100644 index 0000000..5f49cee --- /dev/null +++ b/test-data/verified_type.jsonl @@ -0,0 +1 @@ +{"data": [{"public_metrics": {"retweet_count": 104, "reply_count": 37, "like_count": 745, "quote_count": 24, "impression_count": 139075}, "id": "1603823063690199040", "entities": {"annotations": [{"start": 18, "end": 24, "probability": 0.9421, "type": "Other", "normalized_text": "Twitter"}, {"start": 85, "end": 91, "probability": 0.9323, "type": "Other", "normalized_text": "Twitter"}]}, "conversation_id": "1603823063690199040", "reply_settings": "everyone", "context_annotations": [{"domain": {"id": "46", "name": "Business Taxonomy", "description": "Categories within Brand Verticals that narrow down the scope of Brands"}, "entity": {"id": "1557697333571112960", "name": "Technology Business", "description": "Brands, companies, advertisers and every non-person handle with the profit intent related to softwares, apps, communication equipments, hardwares"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "848920371311001600", "name": "Technology", "description": "Technology and computing"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "1196446161223028736", "name": "Social media"}}, {"domain": {"id": "165", "name": "Technology", "description": "for individual and types of technology, e.g., food technology, 3D printing"}, "entity": {"id": "848920371311001600", "name": "Technology", "description": "Technology and computing"}}, {"domain": {"id": "165", "name": "Technology", "description": "for individual and types of technology, e.g., food technology, 3D printing"}, "entity": {"id": "1196446161223028736", "name": "Social media"}}, {"domain": {"id": "30", "name": "Entities [Entity Service]", "description": "Entity Service top level domain, every item that is in Entity Service should be in this domain"}, "entity": {"id": "781974596752842752", "name": "Services"}}, {"domain": {"id": "47", "name": "Brand", "description": "Brands and Companies"}, "entity": {"id": "10045225402", "name": "Twitter"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "10045225402", "name": "Twitter"}}, {"domain": {"id": "165", "name": "Technology", "description": "for individual and types of technology, e.g., food technology, 3D printing"}, "entity": {"id": "1196446161223028736", "name": "Social media"}}], "author_id": "2244994945", "edit_history_tweet_ids": ["1603823063690199040"], "possibly_sensitive": false, "lang": "en", "text": "Testing\u2026 Testing\u2026 Twitter Dev is back online! \ud83d\udce1\n\nIt\u2019s been a time for change here at Twitter Dev but we have an important message to share with you. \ud83e\uddf5", "edit_controls": {"edits_remaining": 5, "is_edit_eligible": false, "editable_until": "2022-12-16T19:13:10.000Z"}, "created_at": "2022-12-16T18:43:10.000Z"}], "includes": {"users": [{"username": "TwitterDev", "verified_type": "business", "profile_image_url": "https://pbs.twimg.com/profile_images/1445764922474827784/W2zEPN7U_normal.jpg", "verified": true, "created_at": "2013-12-14T04:35:55.000Z", "entities": {"url": {"urls": [{"start": 0, "end": 23, "url": "https://t.co/9wI31m3ELF", "expanded_url": "https://developer.twitter.com/en/community", "display_url": "developer.twitter.com/en/community"}]}, "description": {"hashtags": [{"start": 17, "end": 28, "tag": "TwitterDev"}, {"start": 105, "end": 116, "tag": "TwitterAPI"}]}}, "public_metrics": {"followers_count": 556601, "following_count": 1961, "tweet_count": 4053, "listed_count": 2108}, "url": "https://t.co/9wI31m3ELF", "protected": false, "id": "2244994945", "description": "The voice of the #TwitterDev team and your official source for updates, news, and events, related to the #TwitterAPI.", "name": "Twitter Dev", "location": "127.0.0.1"}], "tweets": [{"public_metrics": {"retweet_count": 104, "reply_count": 37, "like_count": 745, "quote_count": 24, "impression_count": 139075}, "id": "1603823063690199040", "entities": {"annotations": [{"start": 18, "end": 24, "probability": 0.9421, "type": "Other", "normalized_text": "Twitter"}, {"start": 85, "end": 91, "probability": 0.9323, "type": "Other", "normalized_text": "Twitter"}]}, "conversation_id": "1603823063690199040", "reply_settings": "everyone", "context_annotations": [{"domain": {"id": "46", "name": "Business Taxonomy", "description": "Categories within Brand Verticals that narrow down the scope of Brands"}, "entity": {"id": "1557697333571112960", "name": "Technology Business", "description": "Brands, companies, advertisers and every non-person handle with the profit intent related to softwares, apps, communication equipments, hardwares"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "848920371311001600", "name": "Technology", "description": "Technology and computing"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "1196446161223028736", "name": "Social media"}}, {"domain": {"id": "165", "name": "Technology", "description": "for individual and types of technology, e.g., food technology, 3D printing"}, "entity": {"id": "848920371311001600", "name": "Technology", "description": "Technology and computing"}}, {"domain": {"id": "165", "name": "Technology", "description": "for individual and types of technology, e.g., food technology, 3D printing"}, "entity": {"id": "1196446161223028736", "name": "Social media"}}, {"domain": {"id": "30", "name": "Entities [Entity Service]", "description": "Entity Service top level domain, every item that is in Entity Service should be in this domain"}, "entity": {"id": "781974596752842752", "name": "Services"}}, {"domain": {"id": "47", "name": "Brand", "description": "Brands and Companies"}, "entity": {"id": "10045225402", "name": "Twitter"}}, {"domain": {"id": "131", "name": "Unified Twitter Taxonomy", "description": "A taxonomy of user interests. "}, "entity": {"id": "10045225402", "name": "Twitter"}}, {"domain": {"id": "165", "name": "Technology", "description": "for individual and types of technology, e.g., food technology, 3D printing"}, "entity": {"id": "1196446161223028736", "name": "Social media"}}], "author_id": "2244994945", "edit_history_tweet_ids": ["1603823063690199040"], "possibly_sensitive": false, "lang": "en", "text": "Testing\u2026 Testing\u2026 Twitter Dev is back online! \ud83d\udce1\n\nIt\u2019s been a time for change here at Twitter Dev but we have an important message to share with you. \ud83e\uddf5", "edit_controls": {"edits_remaining": 5, "is_edit_eligible": false, "editable_until": "2022-12-16T19:13:10.000Z"}, "created_at": "2022-12-16T18:43:10.000Z"}]}, "__twarc": {"url": "https://api.twitter.com/2/tweets?expansions=author_id%2Cin_reply_to_user_id%2Creferenced_tweets.id%2Creferenced_tweets.id.author_id%2Centities.mentions.username%2Cattachments.poll_ids%2Cattachments.media_keys%2Cgeo.place_id%2Cedit_history_tweet_ids&tweet.fields=attachments%2Cauthor_id%2Ccontext_annotations%2Cconversation_id%2Ccreated_at%2Centities%2Cgeo%2Cid%2Cin_reply_to_user_id%2Clang%2Cpublic_metrics%2Ctext%2Cpossibly_sensitive%2Creferenced_tweets%2Creply_settings%2Csource%2Cwithheld%2Cedit_controls%2Cedit_history_tweet_ids&user.fields=created_at%2Cdescription%2Centities%2Cid%2Clocation%2Cname%2Cpinned_tweet_id%2Cprofile_image_url%2Cprotected%2Cpublic_metrics%2Curl%2Cusername%2Cverified%2Cverified_type%2Cwithheld&media.fields=alt_text%2Cduration_ms%2Cheight%2Cmedia_key%2Cpreview_image_url%2Ctype%2Curl%2Cwidth%2Cvariants%2Cpublic_metrics&poll.fields=duration_minutes%2Cend_datetime%2Cid%2Coptions%2Cvoting_status&place.fields=contained_within%2Ccountry%2Ccountry_code%2Cfull_name%2Cgeo%2Cid%2Cname%2Cplace_type&ids=1603823063690199040", "version": "2.13.0", "retrieved_at": "2023-01-06T02:30:36+00:00"}} diff --git a/test_twarc_csv.py b/test_twarc_csv.py index 013632f..0af665f 100644 --- a/test_twarc_csv.py +++ b/test_twarc_csv.py @@ -54,7 +54,7 @@ def test_2sets(): def test_brexit(): - _process_file("2sets") + _process_file("brexit") def test_kpop(): @@ -119,3 +119,23 @@ def test_edited_before(): def test_quoted_edit(): _process_file("quoted_edit") + + +def test_cashtags(): + _process_file("cashtags") + + +def test_media_tweet(): + _process_file("media_tweet") + + +def test_mentions(): + _process_file("mentions_tweet") + + +def test_many_urls(): + _process_file("many_urls") + + +def test_verified_type(): + _process_file("verified_type") diff --git a/twarc_csv.py b/twarc_csv.py index bf17fb2..b81c755 100644 --- a/twarc_csv.py +++ b/twarc_csv.py @@ -60,6 +60,11 @@ def _validate_output_columns(context, parameter, value): default=True, help="Merge original tweet metadata into retweets. The Retweet Text, metrics and entities are merged from the original tweet. Default: Yes.", ) +@click.option( + "--process-entities/--no-process-entities", + default=True, + help="Preprocess entities like URLs, mentions and hashtags, providing expanded urls and lists only instead of full json objects. Default: Yes.", +) @click.option( "--json-encode-all/--no-json-encode-all", default=False, @@ -119,6 +124,7 @@ def csv( json_encode_lists, inline_referenced_tweets, merge_retweets, + process_entities, allow_duplicates, extra_input_columns, output_columns, @@ -160,6 +166,7 @@ def csv( json_encode_lists=json_encode_lists, inline_referenced_tweets=inline_referenced_tweets, merge_retweets=merge_retweets, + process_entities=process_entities, allow_duplicates=allow_duplicates, extra_input_columns=extra_input_columns, output_columns=output_columns, @@ -176,7 +183,6 @@ def csv( writer.process() if not hide_stats and outfile.name != "": - errors = ( click.style( f"{converter.counts['parse_errors']} failed to parse. See twarc.log for details.\n",