datasources.twitter-import.search_twitter
Import scraped X/Twitter data
It's prohibitively difficult to scrape data from Twitter within 4CAT itself due to its aggressive rate limiting. Instead, import data collected elsewhere.
"""
Import scraped X/Twitter data

It's prohibitively difficult to scrape data from Twitter within 4CAT itself due
to its aggressive rate limiting. Instead, import data collected elsewhere.
"""
from datetime import datetime

from backend.lib.search import Search
from common.lib.helpers import strip_tags
from common.lib.item_mapping import MappedItem


class SearchTwitterViaZeeschuimer(Search):
    """
    Import scraped X/Twitter data

    Maps tweet objects collected with an external tool to flat items. Two
    object layouts are recognised; `map_item` picks the matching mapper.
    """
    type = "twitter-import"  # job ID
    category = "Search"  # category
    title = "Import scraped X/Twitter data"  # title displayed in UI
    description = "Import X/Twitter data collected with an external tool such as Zeeschuimer."  # description displayed in UI
    extension = "ndjson"  # extension of result file, used internally and in UI
    is_from_zeeschuimer = True

    # not available as a processor for existing datasets
    accepts = []
    references = [
        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)"
    ]

    def get_items(self, query):
        """
        Run custom search

        Not available for X/Twitter; data can only be imported.

        :param query:  Ignored
        :raises NotImplementedError:  Always
        """
        raise NotImplementedError("Twitter datasets can only be created by importing data from elsewhere")

    @staticmethod
    def map_item(item):
        """
        Map an imported tweet object to a 4CAT item

        Objects with a `rest_id` are handled by `map_item_modern`; objects
        whose `type` is "adaptive" are handled by `map_item_legacy`.

        :param dict item:  Tweet object, as imported
        :return MappedItem:  Mapped item
        :raises NotImplementedError:  If the object matches neither layout
        """
        if item.get("rest_id"):
            return MappedItem(SearchTwitterViaZeeschuimer.map_item_modern(item))
        elif item.get("type") == "adaptive":
            return MappedItem(SearchTwitterViaZeeschuimer.map_item_legacy(item))
        else:
            raise NotImplementedError

    @staticmethod
    def map_item_modern(tweet):
        """
        Map a tweet object that has a top-level `rest_id`

        The input object is left unmodified.

        :param dict tweet:  Tweet object, as imported
        :return dict:  Mapped fields
        """
        legacy = tweet["legacy"]
        entities = legacy["entities"]
        author = tweet["core"]["user_results"]["result"]["legacy"]
        timestamp = datetime.strptime(legacy["created_at"], "%a %b %d %H:%M:%S %z %Y")
        withheld = False

        retweet = legacy.get("retweeted_status_result")
        retweet_result = None
        if retweet:
            # make sure the full RT is included, by default this is shortened
            retweet_result = retweet["result"]
            if "tweet" in retweet_result:
                # sometimes this is one level deeper, sometimes not...
                retweet_result = retweet_result["tweet"]

            if retweet_result.get("legacy", {}).get("withheld_scope"):
                withheld = True
                body = retweet_result["legacy"]["full_text"]
            else:
                body = "RT @" + retweet_result["core"]["user_results"]["result"]["legacy"]["screen_name"] + \
                       ": " + retweet_result["legacy"]["full_text"]
        else:
            body = legacy["full_text"]

        quote_tweet = tweet.get("quoted_status_result")
        # "result" can be absent, so never index it directly
        quote_result = quote_tweet.get("result", {}) if quote_tweet else {}
        if "tweet" in quote_result:
            # sometimes this is one level deeper, sometimes not...
            quote_result = quote_result["tweet"]

        if quote_result and "tombstone" not in quote_result:
            quoted_user = quote_result["core"]["user_results"]["result"].get("legacy", {}).get("screen_name", "")
        else:
            quoted_user = ""

        place = legacy.get("place")

        return {
            "id": tweet["rest_id"],
            "thread_id": legacy["conversation_id_str"],
            "timestamp": timestamp.strftime("%Y-%m-%d %H:%M:%S"),
            "unix_timestamp": int(timestamp.timestamp()),
            # use rest_id here too, so the link always matches the "id" column
            "link": f"https://x.com/{author['screen_name']}/status/{tweet['rest_id']}",
            "body": body,
            "author": author["screen_name"],
            "author_fullname": author["name"],
            "author_id": legacy["user_id_str"],
            "author_avatar_url": author["profile_image_url_https"],
            "author_banner_url": author.get("profile_banner_url", ""),  # key does not exist when author does not have a banner
            "source": strip_tags(tweet["source"]),
            "language_guess": legacy.get("lang"),
            # NOTE(review): the legacy mapper reads this flag from "legacy";
            # accept either location - confirm against captured data
            "possibly_sensitive": "yes" if (tweet.get("possibly_sensitive") or legacy.get("possibly_sensitive")) else "no",
            "retweet_count": legacy["retweet_count"],
            "reply_count": legacy["reply_count"],
            "like_count": legacy["favorite_count"],
            "quote_count": legacy["quote_count"],
            "impression_count": tweet.get("views", {}).get("count", ""),
            "is_retweet": "yes" if retweet else "no",
            "retweeted_user": retweet_result["core"]["user_results"]["result"].get("legacy", {}).get("screen_name", "") if retweet else "",
            "is_quote_tweet": "yes" if quote_tweet else "no",
            "quoted_user": quoted_user,
            "is_quote_withheld": "yes" if "tombstone" in quote_result else "no",
            "is_reply": "yes" if str(legacy["conversation_id_str"]) != str(tweet["rest_id"]) else "no",
            "replied_user": legacy.get("in_reply_to_screen_name") or "",
            "is_withheld": "yes" if withheld else "no",
            "hashtags": ",".join([hashtag["text"] for hashtag in entities["hashtags"]]),
            # only fall back to display_url when expanded_url is really absent
            "urls": ",".join([url["expanded_url"] if "expanded_url" in url else url["display_url"]
                              for url in entities["urls"]]),
            "images": ",".join([media["media_url_https"] for media in entities.get("media", []) if
                                media["type"] == "photo"]),
            "videos": ",".join([media["media_url_https"] for media in entities.get("media", []) if
                                media["type"] == "video"]),
            "mentions": ",".join([mention["screen_name"] for mention in entities["user_mentions"]]),
            "long_lat": SearchTwitterViaZeeschuimer.get_centroid(
                place["bounding_box"]["coordinates"]) if place else "",
            "place_name": place.get("full_name", "") if place else "",
        }

    @staticmethod
    def map_item_legacy(tweet):
        """
        Map a tweet object of type "adaptive"

        The input object is left unmodified.

        :param dict tweet:  Tweet object, as imported
        :return dict:  Mapped fields
        """
        legacy = tweet["legacy"]
        entities = legacy["entities"]
        user = tweet["user"]
        timestamp = datetime.strptime(legacy["created_at"], "%a %b %d %H:%M:%S %z %Y")
        tweet_id = legacy["id_str"]
        withheld = False

        retweet = legacy.get("retweeted_status_result")
        if retweet:
            # make sure the full RT is included, by default this is shortened
            retweet_legacy = retweet["result"].get("legacy", {})
            # NOTE(review): the modern mapper checks "withheld_scope"; accept
            # both keys here - confirm which one legacy data actually uses
            if retweet_legacy.get("withheld_status") or retweet_legacy.get("withheld_scope"):
                withheld = True
                body = retweet["result"]["legacy"]["full_text"]
            else:
                # same "RT @name: text" form as the modern mapper
                body = "RT @" + retweet["result"]["core"]["user_results"]["result"]["legacy"]["screen_name"] + \
                       ": " + retweet["result"]["legacy"]["full_text"]
        else:
            body = legacy["full_text"]

        quote_tweet = tweet.get("quoted_status_result")
        # "result" can be absent, so never index it directly
        quote_result = quote_tweet.get("result", {}) if quote_tweet else {}
        if "tweet" in quote_result:
            # sometimes this is one level deeper, sometimes not...
            quote_result = quote_result["tweet"]

        # tombstoned quotes carry no author information
        if quote_result and "tombstone" not in quote_result:
            quoted_user = quote_result["core"]["user_results"]["result"].get("legacy", {}).get("screen_name", "")
        else:
            quoted_user = ""

        media_items = legacy.get("extended_entities", {}).get("media", [])
        place = legacy.get("place")

        return {
            "id": tweet_id,
            "thread_id": legacy["conversation_id_str"],
            "timestamp": timestamp.strftime("%Y-%m-%d %H:%M:%S"),
            "unix_timestamp": int(timestamp.timestamp()),
            "link": f"https://x.com/{user['screen_name']}/status/{tweet_id}",
            "body": body,
            "author": user["screen_name"],
            "author_fullname": user["name"],
            "author_id": user["id_str"],
            "source": strip_tags(legacy["source"]),
            "language_guess": legacy.get("lang"),
            "possibly_sensitive": "yes" if legacy.get("possibly_sensitive") else "no",
            "retweet_count": legacy["retweet_count"],
            "reply_count": legacy["reply_count"],
            "like_count": legacy["favorite_count"],
            "quote_count": legacy["quote_count"],
            "impression_count": tweet.get("ext_views", {}).get("count", ""),
            "is_retweet": "yes" if retweet else "no",
            "retweeted_user": retweet["result"]["core"]["user_results"]["result"].get("legacy", {}).get("screen_name", "") if retweet else "",
            "is_quote_tweet": "yes" if quote_tweet else "no",
            "quoted_user": quoted_user,
            "is_reply": "yes" if str(legacy["conversation_id_str"]) != tweet_id else "no",
            "replied_user": legacy.get("in_reply_to_screen_name") or "",
            "is_withheld": "yes" if withheld else "no",
            "hashtags": ",".join([hashtag["text"] for hashtag in entities["hashtags"]]),
            # only fall back to display_url when expanded_url is really absent
            "urls": ",".join([url["expanded_url"] if "expanded_url" in url else url["display_url"]
                              for url in entities["urls"]]),
            "images": ",".join([media["media_url_https"] for media in media_items if media["type"] == "photo"]),
            "videos": ",".join([media["video_info"]["variants"][0]["url"] for media in media_items if
                                media["type"] == "video"]),
            "mentions": ",".join([mention["screen_name"] for mention in entities["user_mentions"]]),
            "long_lat": SearchTwitterViaZeeschuimer.get_centroid(
                place["bounding_box"]["coordinates"]) if place else "",
            "place_name": place.get("full_name", "") if place else "",
        }

    @staticmethod
    def get_centroid(box):
        """
        Get centre of a rectangular box

        Convenience function for converting X/Twitter's bounding box coordinates
        to a singular coordinate - simply the centre of the box - because that
        is what is expected for mapped output.

        The centre is derived from the extremes of all corner points, so it
        does not depend on the order in which corners are listed.

        :param list box:  The box as part of X/Twitter's response
        :return str:  Coordinate, as longitude,latitude. Empty if the box has
        no points.
        """
        if not box or not box[0]:
            return ""

        corners = box[0]
        longitudes = [corner[0] for corner in corners]
        latitudes = [corner[1] for corner in corners]
        return ",".join((
            str(round((min(longitudes) + max(longitudes)) / 2, 6)),
            str(round((min(latitudes) + max(latitudes)) / 2, 6)),
        ))
class SearchTwitterViaZeeschuimer(Search):
    """
    Import scraped X/Twitter data

    Maps tweet objects collected with an external tool to flat items. Two
    object layouts are recognised; `map_item` picks the matching mapper.
    """
    type = "twitter-import"  # job ID
    category = "Search"  # category
    title = "Import scraped X/Twitter data"  # title displayed in UI
    description = "Import X/Twitter data collected with an external tool such as Zeeschuimer."  # description displayed in UI
    extension = "ndjson"  # extension of result file, used internally and in UI
    is_from_zeeschuimer = True

    # not available as a processor for existing datasets
    accepts = []
    references = [
        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)"
    ]

    def get_items(self, query):
        """
        Run custom search

        Not available for X/Twitter; data can only be imported.

        :param query:  Ignored
        :raises NotImplementedError:  Always
        """
        raise NotImplementedError("Twitter datasets can only be created by importing data from elsewhere")

    @staticmethod
    def map_item(item):
        """
        Map an imported tweet object to a 4CAT item

        Objects with a `rest_id` are handled by `map_item_modern`; objects
        whose `type` is "adaptive" are handled by `map_item_legacy`.

        :param dict item:  Tweet object, as imported
        :return MappedItem:  Mapped item
        :raises NotImplementedError:  If the object matches neither layout
        """
        if item.get("rest_id"):
            return MappedItem(SearchTwitterViaZeeschuimer.map_item_modern(item))
        elif item.get("type") == "adaptive":
            return MappedItem(SearchTwitterViaZeeschuimer.map_item_legacy(item))
        else:
            raise NotImplementedError

    @staticmethod
    def map_item_modern(tweet):
        """
        Map a tweet object that has a top-level `rest_id`

        The input object is left unmodified.

        :param dict tweet:  Tweet object, as imported
        :return dict:  Mapped fields
        """
        legacy = tweet["legacy"]
        entities = legacy["entities"]
        author = tweet["core"]["user_results"]["result"]["legacy"]
        timestamp = datetime.strptime(legacy["created_at"], "%a %b %d %H:%M:%S %z %Y")
        withheld = False

        retweet = legacy.get("retweeted_status_result")
        retweet_result = None
        if retweet:
            # make sure the full RT is included, by default this is shortened
            retweet_result = retweet["result"]
            if "tweet" in retweet_result:
                # sometimes this is one level deeper, sometimes not...
                retweet_result = retweet_result["tweet"]

            if retweet_result.get("legacy", {}).get("withheld_scope"):
                withheld = True
                body = retweet_result["legacy"]["full_text"]
            else:
                body = "RT @" + retweet_result["core"]["user_results"]["result"]["legacy"]["screen_name"] + \
                       ": " + retweet_result["legacy"]["full_text"]
        else:
            body = legacy["full_text"]

        quote_tweet = tweet.get("quoted_status_result")
        # "result" can be absent, so never index it directly
        quote_result = quote_tweet.get("result", {}) if quote_tweet else {}
        if "tweet" in quote_result:
            # sometimes this is one level deeper, sometimes not...
            quote_result = quote_result["tweet"]

        if quote_result and "tombstone" not in quote_result:
            quoted_user = quote_result["core"]["user_results"]["result"].get("legacy", {}).get("screen_name", "")
        else:
            quoted_user = ""

        place = legacy.get("place")

        return {
            "id": tweet["rest_id"],
            "thread_id": legacy["conversation_id_str"],
            "timestamp": timestamp.strftime("%Y-%m-%d %H:%M:%S"),
            "unix_timestamp": int(timestamp.timestamp()),
            # use rest_id here too, so the link always matches the "id" column
            "link": f"https://x.com/{author['screen_name']}/status/{tweet['rest_id']}",
            "body": body,
            "author": author["screen_name"],
            "author_fullname": author["name"],
            "author_id": legacy["user_id_str"],
            "author_avatar_url": author["profile_image_url_https"],
            "author_banner_url": author.get("profile_banner_url", ""),  # key does not exist when author does not have a banner
            "source": strip_tags(tweet["source"]),
            "language_guess": legacy.get("lang"),
            # NOTE(review): the legacy mapper reads this flag from "legacy";
            # accept either location - confirm against captured data
            "possibly_sensitive": "yes" if (tweet.get("possibly_sensitive") or legacy.get("possibly_sensitive")) else "no",
            "retweet_count": legacy["retweet_count"],
            "reply_count": legacy["reply_count"],
            "like_count": legacy["favorite_count"],
            "quote_count": legacy["quote_count"],
            "impression_count": tweet.get("views", {}).get("count", ""),
            "is_retweet": "yes" if retweet else "no",
            "retweeted_user": retweet_result["core"]["user_results"]["result"].get("legacy", {}).get("screen_name", "") if retweet else "",
            "is_quote_tweet": "yes" if quote_tweet else "no",
            "quoted_user": quoted_user,
            "is_quote_withheld": "yes" if "tombstone" in quote_result else "no",
            "is_reply": "yes" if str(legacy["conversation_id_str"]) != str(tweet["rest_id"]) else "no",
            "replied_user": legacy.get("in_reply_to_screen_name") or "",
            "is_withheld": "yes" if withheld else "no",
            "hashtags": ",".join([hashtag["text"] for hashtag in entities["hashtags"]]),
            # only fall back to display_url when expanded_url is really absent
            "urls": ",".join([url["expanded_url"] if "expanded_url" in url else url["display_url"]
                              for url in entities["urls"]]),
            "images": ",".join([media["media_url_https"] for media in entities.get("media", []) if
                                media["type"] == "photo"]),
            "videos": ",".join([media["media_url_https"] for media in entities.get("media", []) if
                                media["type"] == "video"]),
            "mentions": ",".join([mention["screen_name"] for mention in entities["user_mentions"]]),
            "long_lat": SearchTwitterViaZeeschuimer.get_centroid(
                place["bounding_box"]["coordinates"]) if place else "",
            "place_name": place.get("full_name", "") if place else "",
        }

    @staticmethod
    def map_item_legacy(tweet):
        """
        Map a tweet object of type "adaptive"

        The input object is left unmodified.

        :param dict tweet:  Tweet object, as imported
        :return dict:  Mapped fields
        """
        legacy = tweet["legacy"]
        entities = legacy["entities"]
        user = tweet["user"]
        timestamp = datetime.strptime(legacy["created_at"], "%a %b %d %H:%M:%S %z %Y")
        tweet_id = legacy["id_str"]
        withheld = False

        retweet = legacy.get("retweeted_status_result")
        if retweet:
            # make sure the full RT is included, by default this is shortened
            retweet_legacy = retweet["result"].get("legacy", {})
            # NOTE(review): the modern mapper checks "withheld_scope"; accept
            # both keys here - confirm which one legacy data actually uses
            if retweet_legacy.get("withheld_status") or retweet_legacy.get("withheld_scope"):
                withheld = True
                body = retweet["result"]["legacy"]["full_text"]
            else:
                # same "RT @name: text" form as the modern mapper
                body = "RT @" + retweet["result"]["core"]["user_results"]["result"]["legacy"]["screen_name"] + \
                       ": " + retweet["result"]["legacy"]["full_text"]
        else:
            body = legacy["full_text"]

        quote_tweet = tweet.get("quoted_status_result")
        # "result" can be absent, so never index it directly
        quote_result = quote_tweet.get("result", {}) if quote_tweet else {}
        if "tweet" in quote_result:
            # sometimes this is one level deeper, sometimes not...
            quote_result = quote_result["tweet"]

        # tombstoned quotes carry no author information
        if quote_result and "tombstone" not in quote_result:
            quoted_user = quote_result["core"]["user_results"]["result"].get("legacy", {}).get("screen_name", "")
        else:
            quoted_user = ""

        media_items = legacy.get("extended_entities", {}).get("media", [])
        place = legacy.get("place")

        return {
            "id": tweet_id,
            "thread_id": legacy["conversation_id_str"],
            "timestamp": timestamp.strftime("%Y-%m-%d %H:%M:%S"),
            "unix_timestamp": int(timestamp.timestamp()),
            "link": f"https://x.com/{user['screen_name']}/status/{tweet_id}",
            "body": body,
            "author": user["screen_name"],
            "author_fullname": user["name"],
            "author_id": user["id_str"],
            "source": strip_tags(legacy["source"]),
            "language_guess": legacy.get("lang"),
            "possibly_sensitive": "yes" if legacy.get("possibly_sensitive") else "no",
            "retweet_count": legacy["retweet_count"],
            "reply_count": legacy["reply_count"],
            "like_count": legacy["favorite_count"],
            "quote_count": legacy["quote_count"],
            "impression_count": tweet.get("ext_views", {}).get("count", ""),
            "is_retweet": "yes" if retweet else "no",
            "retweeted_user": retweet["result"]["core"]["user_results"]["result"].get("legacy", {}).get("screen_name", "") if retweet else "",
            "is_quote_tweet": "yes" if quote_tweet else "no",
            "quoted_user": quoted_user,
            "is_reply": "yes" if str(legacy["conversation_id_str"]) != tweet_id else "no",
            "replied_user": legacy.get("in_reply_to_screen_name") or "",
            "is_withheld": "yes" if withheld else "no",
            "hashtags": ",".join([hashtag["text"] for hashtag in entities["hashtags"]]),
            # only fall back to display_url when expanded_url is really absent
            "urls": ",".join([url["expanded_url"] if "expanded_url" in url else url["display_url"]
                              for url in entities["urls"]]),
            "images": ",".join([media["media_url_https"] for media in media_items if media["type"] == "photo"]),
            "videos": ",".join([media["video_info"]["variants"][0]["url"] for media in media_items if
                                media["type"] == "video"]),
            "mentions": ",".join([mention["screen_name"] for mention in entities["user_mentions"]]),
            "long_lat": SearchTwitterViaZeeschuimer.get_centroid(
                place["bounding_box"]["coordinates"]) if place else "",
            "place_name": place.get("full_name", "") if place else "",
        }

    @staticmethod
    def get_centroid(box):
        """
        Get centre of a rectangular box

        Convenience function for converting X/Twitter's bounding box coordinates
        to a singular coordinate - simply the centre of the box - because that
        is what is expected for mapped output.

        The centre is derived from the extremes of all corner points, so it
        does not depend on the order in which corners are listed.

        :param list box:  The box as part of X/Twitter's response
        :return str:  Coordinate, as longitude,latitude. Empty if the box has
        no points.
        """
        if not box or not box[0]:
            return ""

        corners = box[0]
        longitudes = [corner[0] for corner in corners]
        latitudes = [corner[1] for corner in corners]
        return ",".join((
            str(round((min(longitudes) + max(longitudes)) / 2, 6)),
            str(round((min(latitudes) + max(latitudes)) / 2, 6)),
        ))
Import scraped X/Twitter data
references =
['[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)', '[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)']
def
get_items(self, query):
def get_items(self, query):
    """
    Run custom search

    Searching is not supported for this data source, so this method always
    raises.

    :param query:  Ignored
    :raises NotImplementedError:  Always
    """
    reason = "Twitter datasets can only be created by importing data from elsewhere"
    raise NotImplementedError(reason)
Run custom search
Not available for X/Twitter
@staticmethod
def
map_item_modern(tweet):
@staticmethod
def map_item_modern(tweet):
    """
    Map a tweet object that has a top-level `rest_id`

    The input object is left unmodified.

    :param dict tweet:  Tweet object, as imported
    :return dict:  Mapped fields
    """
    legacy = tweet["legacy"]
    entities = legacy["entities"]
    author = tweet["core"]["user_results"]["result"]["legacy"]
    timestamp = datetime.strptime(legacy["created_at"], "%a %b %d %H:%M:%S %z %Y")
    withheld = False

    retweet = legacy.get("retweeted_status_result")
    retweet_result = None
    if retweet:
        # make sure the full RT is included, by default this is shortened
        retweet_result = retweet["result"]
        if "tweet" in retweet_result:
            # sometimes this is one level deeper, sometimes not...
            retweet_result = retweet_result["tweet"]

        if retweet_result.get("legacy", {}).get("withheld_scope"):
            withheld = True
            body = retweet_result["legacy"]["full_text"]
        else:
            body = "RT @" + retweet_result["core"]["user_results"]["result"]["legacy"]["screen_name"] + \
                   ": " + retweet_result["legacy"]["full_text"]
    else:
        body = legacy["full_text"]

    quote_tweet = tweet.get("quoted_status_result")
    # "result" can be absent, so never index it directly
    quote_result = quote_tweet.get("result", {}) if quote_tweet else {}
    if "tweet" in quote_result:
        # sometimes this is one level deeper, sometimes not...
        quote_result = quote_result["tweet"]

    if quote_result and "tombstone" not in quote_result:
        quoted_user = quote_result["core"]["user_results"]["result"].get("legacy", {}).get("screen_name", "")
    else:
        quoted_user = ""

    place = legacy.get("place")

    return {
        "id": tweet["rest_id"],
        "thread_id": legacy["conversation_id_str"],
        "timestamp": timestamp.strftime("%Y-%m-%d %H:%M:%S"),
        "unix_timestamp": int(timestamp.timestamp()),
        # use rest_id here too, so the link always matches the "id" column
        "link": f"https://x.com/{author['screen_name']}/status/{tweet['rest_id']}",
        "body": body,
        "author": author["screen_name"],
        "author_fullname": author["name"],
        "author_id": legacy["user_id_str"],
        "author_avatar_url": author["profile_image_url_https"],
        "author_banner_url": author.get("profile_banner_url", ""),  # key does not exist when author does not have a banner
        "source": strip_tags(tweet["source"]),
        "language_guess": legacy.get("lang"),
        # NOTE(review): the legacy mapper reads this flag from "legacy";
        # accept either location - confirm against captured data
        "possibly_sensitive": "yes" if (tweet.get("possibly_sensitive") or legacy.get("possibly_sensitive")) else "no",
        "retweet_count": legacy["retweet_count"],
        "reply_count": legacy["reply_count"],
        "like_count": legacy["favorite_count"],
        "quote_count": legacy["quote_count"],
        "impression_count": tweet.get("views", {}).get("count", ""),
        "is_retweet": "yes" if retweet else "no",
        "retweeted_user": retweet_result["core"]["user_results"]["result"].get("legacy", {}).get("screen_name", "") if retweet else "",
        "is_quote_tweet": "yes" if quote_tweet else "no",
        "quoted_user": quoted_user,
        "is_quote_withheld": "yes" if "tombstone" in quote_result else "no",
        "is_reply": "yes" if str(legacy["conversation_id_str"]) != str(tweet["rest_id"]) else "no",
        "replied_user": legacy.get("in_reply_to_screen_name") or "",
        "is_withheld": "yes" if withheld else "no",
        "hashtags": ",".join([hashtag["text"] for hashtag in entities["hashtags"]]),
        # only fall back to display_url when expanded_url is really absent
        "urls": ",".join([url["expanded_url"] if "expanded_url" in url else url["display_url"]
                          for url in entities["urls"]]),
        "images": ",".join([media["media_url_https"] for media in entities.get("media", []) if
                            media["type"] == "photo"]),
        "videos": ",".join([media["media_url_https"] for media in entities.get("media", []) if
                            media["type"] == "video"]),
        "mentions": ",".join([mention["screen_name"] for mention in entities["user_mentions"]]),
        "long_lat": SearchTwitterViaZeeschuimer.get_centroid(
            place["bounding_box"]["coordinates"]) if place else "",
        "place_name": place.get("full_name", "") if place else "",
    }
@staticmethod
def
map_item_legacy(tweet):
@staticmethod
def map_item_legacy(tweet):
    """
    Map a tweet object of type "adaptive"

    The input object is left unmodified.

    :param dict tweet:  Tweet object, as imported
    :return dict:  Mapped fields
    """
    legacy = tweet["legacy"]
    entities = legacy["entities"]
    user = tweet["user"]
    timestamp = datetime.strptime(legacy["created_at"], "%a %b %d %H:%M:%S %z %Y")
    tweet_id = legacy["id_str"]
    withheld = False

    retweet = legacy.get("retweeted_status_result")
    if retweet:
        # make sure the full RT is included, by default this is shortened
        retweet_legacy = retweet["result"].get("legacy", {})
        # NOTE(review): the modern mapper checks "withheld_scope"; accept
        # both keys here - confirm which one legacy data actually uses
        if retweet_legacy.get("withheld_status") or retweet_legacy.get("withheld_scope"):
            withheld = True
            body = retweet["result"]["legacy"]["full_text"]
        else:
            # same "RT @name: text" form as the modern mapper
            body = "RT @" + retweet["result"]["core"]["user_results"]["result"]["legacy"]["screen_name"] + \
                   ": " + retweet["result"]["legacy"]["full_text"]
    else:
        body = legacy["full_text"]

    quote_tweet = tweet.get("quoted_status_result")
    # "result" can be absent, so never index it directly
    quote_result = quote_tweet.get("result", {}) if quote_tweet else {}
    if "tweet" in quote_result:
        # sometimes this is one level deeper, sometimes not...
        quote_result = quote_result["tweet"]

    # tombstoned quotes carry no author information
    if quote_result and "tombstone" not in quote_result:
        quoted_user = quote_result["core"]["user_results"]["result"].get("legacy", {}).get("screen_name", "")
    else:
        quoted_user = ""

    media_items = legacy.get("extended_entities", {}).get("media", [])
    place = legacy.get("place")

    return {
        "id": tweet_id,
        "thread_id": legacy["conversation_id_str"],
        "timestamp": timestamp.strftime("%Y-%m-%d %H:%M:%S"),
        "unix_timestamp": int(timestamp.timestamp()),
        "link": f"https://x.com/{user['screen_name']}/status/{tweet_id}",
        "body": body,
        "author": user["screen_name"],
        "author_fullname": user["name"],
        "author_id": user["id_str"],
        "source": strip_tags(legacy["source"]),
        "language_guess": legacy.get("lang"),
        "possibly_sensitive": "yes" if legacy.get("possibly_sensitive") else "no",
        "retweet_count": legacy["retweet_count"],
        "reply_count": legacy["reply_count"],
        "like_count": legacy["favorite_count"],
        "quote_count": legacy["quote_count"],
        "impression_count": tweet.get("ext_views", {}).get("count", ""),
        "is_retweet": "yes" if retweet else "no",
        "retweeted_user": retweet["result"]["core"]["user_results"]["result"].get("legacy", {}).get("screen_name", "") if retweet else "",
        "is_quote_tweet": "yes" if quote_tweet else "no",
        "quoted_user": quoted_user,
        "is_reply": "yes" if str(legacy["conversation_id_str"]) != tweet_id else "no",
        "replied_user": legacy.get("in_reply_to_screen_name") or "",
        "is_withheld": "yes" if withheld else "no",
        "hashtags": ",".join([hashtag["text"] for hashtag in entities["hashtags"]]),
        # only fall back to display_url when expanded_url is really absent
        "urls": ",".join([url["expanded_url"] if "expanded_url" in url else url["display_url"]
                          for url in entities["urls"]]),
        "images": ",".join([media["media_url_https"] for media in media_items if media["type"] == "photo"]),
        "videos": ",".join([media["video_info"]["variants"][0]["url"] for media in media_items if
                            media["type"] == "video"]),
        "mentions": ",".join([mention["screen_name"] for mention in entities["user_mentions"]]),
        "long_lat": SearchTwitterViaZeeschuimer.get_centroid(
            place["bounding_box"]["coordinates"]) if place else "",
        "place_name": place.get("full_name", "") if place else "",
    }
@staticmethod
def
get_centroid(box):
@staticmethod
def get_centroid(box):
    """
    Get centre of a rectangular box

    Convenience function for converting X/Twitter's bounding box coordinates
    to a singular coordinate - simply the centre of the box - because that
    is what is expected for mapped output.

    The centre is derived from the extremes of all corner points rather than
    from the first two points only: in a four-corner polygon the first two
    corners are adjacent, so averaging them gives the midpoint of one edge.
    For a box given as two opposite corners the result is unchanged.

    :param list box:  The box as part of X/Twitter's response
    :return str:  Coordinate, as longitude,latitude. Empty if the box has no
    points.
    """
    if not box or not box[0]:
        return ""

    corners = box[0]
    longitudes = [corner[0] for corner in corners]
    latitudes = [corner[1] for corner in corners]
    return ",".join((
        str(round((min(longitudes) + max(longitudes)) / 2, 6)),
        str(round((min(latitudes) + max(latitudes)) / 2, 6)),
    ))
Get centre of a rectangular box
Convenience function for converting X/Twitter's bounding box coordinates to a singular coordinate - simply the centre of the box - because that is what is expected for mapped output.
Parameters
- list box: The box as part of X/Twitter's response
Returns
Coordinate, as longitude,latitude.
Inherited Members
- backend.lib.worker.BasicWorker
- BasicWorker
- INTERRUPT_NONE
- INTERRUPT_RETRY
- INTERRUPT_CANCEL
- queue
- log
- manager
- interrupted
- modules
- init_time
- name
- run
- clean_up
- request_interrupt
- is_4cat_class
- backend.lib.search.Search
- max_workers
- prefix
- return_cols
- import_error_count
- import_warning_count
- process
- search
- import_from_file
- items_to_csv
- items_to_ndjson
- items_to_archive
- backend.lib.processor.BasicProcessor
- db
- job
- dataset
- owner
- source_dataset
- source_file
- config
- is_running_in_preset
- filepath
- work
- after_process
- remove_files
- abort
- add_field_to_parent
- iterate_archive_contents
- unpack_archive_contents
- extract_archived_file_by_name
- write_csv_items_and_finish
- write_archive_and_finish
- create_standalone
- map_item_method_available
- get_mapped_item
- is_filter
- get_options
- get_status
- is_top_dataset
- is_from_collector
- get_extension
- is_rankable
- exclude_followup_processors
- is_4cat_processor