datasources.twitter-import.search_twitter
Import scraped X/Twitter data
It's prohibitively difficult to scrape data from Twitter within 4CAT itself due to its aggressive rate limiting. Instead, import data collected elsewhere.
"""
Import scraped X/Twitter data

It's prohibitively difficult to scrape data from Twitter within 4CAT itself due
to its aggressive rate limiting. Instead, import data collected elsewhere.
"""
from datetime import datetime

from backend.lib.search import Search
from common.lib.helpers import strip_tags
from common.lib.item_mapping import MappedItem


class SearchTwitterViaZeeschuimer(Search):
    """
    Import scraped X/Twitter data
    """
    type = "twitter-import"  # job ID
    category = "Search"  # category
    title = "Import scraped X/Twitter data"  # title displayed in UI
    description = "Import X/Twitter data collected with an external tool such as Zeeschuimer."  # description displayed in UI
    extension = "ndjson"  # extension of result file, used internally and in UI
    is_from_zeeschuimer = True

    # not available as a processor for existing datasets
    accepts = []
    references = [
        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)"
    ]

    def get_items(self, query):
        """
        Run custom search

        Not available for Twitter; datasets of this type can only be created
        by importing files collected externally (e.g. with Zeeschuimer).

        :param query:  Query parameters (unused)
        :raises NotImplementedError:  Always
        """
        raise NotImplementedError("Twitter datasets can only be created by importing data from elsewhere")

    @staticmethod
    def map_item(item):
        """
        Map a raw imported item to 4CAT's flat field format

        Dispatches on the payload format: items carrying a `rest_id` come
        from the current GraphQL API, items of type "adaptive" from the
        older adaptive search API.

        :param dict item:  Raw item as parsed from the imported ndjson file
        :return MappedItem:  Mapped item
        :raises NotImplementedError:  For unrecognised payload formats
        """
        if item.get("rest_id"):
            return MappedItem(SearchTwitterViaZeeschuimer.map_item_modern(item))
        elif item.get("type") == "adaptive":
            return MappedItem(SearchTwitterViaZeeschuimer.map_item_legacy(item))
        else:
            raise NotImplementedError("Unknown X/Twitter payload format")

    @staticmethod
    def map_item_modern(tweet):
        """
        Map a tweet from the GraphQL ("modern") API payload

        :param dict tweet:  Raw tweet object
        :return dict:  Flat dictionary of mapped fields
        """
        # Sometimes a "core" key appears in user_results, sometimes not.
        # This has effect on where to get user data.
        has_core = tweet.get("core", {}).get("user_results", {}).get("result", {}).get("core", False)
        user_key = "core" if has_core else "legacy"

        timestamp = datetime.strptime(tweet["legacy"]["created_at"], "%a %b %d %H:%M:%S %z %Y")
        withheld = False

        retweet = tweet["legacy"].get("retweeted_status_result")
        if retweet:
            # make sure the full RT is included, by default this is shortened
            if "tweet" in retweet["result"]:
                retweet["result"] = retweet["result"]["tweet"]

            if retweet["result"].get("legacy", {}).get("withheld_scope"):
                withheld = True
                tweet["legacy"]["full_text"] = retweet["result"]["legacy"]["full_text"]
            else:
                # NOTE(review): user_key was derived from the *outer* tweet's
                # user object; assumed to apply to the retweeted user as well
                t_text = "RT @" + retweet["result"]["core"]["user_results"]["result"][user_key]["screen_name"] + \
                         ": " + retweet["result"]["legacy"]["full_text"]
                tweet["legacy"]["full_text"] = t_text

        quote_tweet = tweet.get("quoted_status_result")
        if quote_tweet and "tweet" in quote_tweet.get("result", {}):
            # sometimes this is one level deeper, sometimes not...
            quote_tweet["result"] = quote_tweet["result"]["tweet"]
        # check if the quote tweet is available or not
        quote_withheld = bool(quote_tweet and "tombstone" in quote_tweet["result"])

        # extract media from tweet; if video, add thumbnail to images and video link to videos
        images = set()
        videos = set()

        # Process media from extended_entities for videos and photos
        for media in tweet["legacy"].get("extended_entities", {}).get("media", []):
            if media["type"] == "photo":
                images.add(media["media_url_https"])
            elif media["type"] == "video":
                # Add video thumbnail to images
                images.add(media["media_url_https"])
                # Add actual video URL to videos if available
                if media.get("video_info", {}).get("variants"):
                    # Filter variants to get video files (not streaming playlists)
                    video_variants = [
                        variant for variant in media["video_info"]["variants"]
                        if variant.get("content_type", "").startswith("video/")
                    ]
                    if video_variants:
                        # Sort by bitrate (highest first) to get best quality
                        video_variants.sort(key=lambda x: x.get("bitrate", 0), reverse=True)
                        videos.add(video_variants[0]["url"])

        # Also check entities.media for any additional photos not in extended_entities
        for media in tweet["legacy"]["entities"].get("media", []):
            if media["type"] == "photo":
                images.add(media["media_url_https"])

        return {
            "id": tweet["rest_id"],
            "thread_id": tweet["legacy"]["conversation_id_str"],
            "timestamp": timestamp.strftime("%Y-%m-%d %H:%M:%S"),
            "unix_timestamp": int(timestamp.timestamp()),
            # fixed: was tweet['id'] - GraphQL payloads identify tweets by "rest_id"
            "link": f"https://x.com/{tweet['core']['user_results']['result'][user_key]['screen_name']}/status/{tweet['rest_id']}",
            "body": tweet["legacy"]["full_text"],
            "author": tweet["core"]["user_results"]["result"][user_key]["screen_name"],
            "author_fullname": tweet["core"]["user_results"]["result"][user_key]["name"],
            "author_id": tweet["legacy"]["user_id_str"],
            "author_avatar_url": tweet["core"]["user_results"]["result"]["avatar"]["image_url"] if "avatar" in tweet["core"]["user_results"]["result"] else tweet["core"]["user_results"]["result"]["legacy"].get("profile_image_url_https", ""),
            "author_banner_url": tweet["core"]["user_results"]["result"]["legacy"].get("profile_banner_url", ""),  # key does not exist when author does not have a banner
            "verified": tweet["core"]["user_results"]["result"].get("is_blue_verified", ""),
            "source": strip_tags(tweet["source"]),
            "language_guess": tweet["legacy"].get("lang"),
            "possibly_sensitive": "yes" if tweet.get("possibly_sensitive", False) or tweet["legacy"].get("possibly_sensitive", False) else "no",
            "retweet_count": tweet["legacy"]["retweet_count"],
            "reply_count": tweet["legacy"]["reply_count"],
            "like_count": tweet["legacy"]["favorite_count"],
            "quote_count": tweet["legacy"]["quote_count"],
            "impression_count": tweet.get("views", {}).get("count", ""),
            "is_retweet": "yes" if retweet else "no",
            # fixed: use user_key here too - when the user object nests under
            # "core", "legacy" does not contain screen_name (cf. quote_author)
            "retweeted_user": retweet["result"]["core"]["user_results"]["result"].get(user_key, {}).get("screen_name", "") if retweet else "",
            "is_quote_tweet": "yes" if quote_tweet else "no",
            "quote_tweet_id": quote_tweet["result"].get("rest_id", "") if quote_tweet else "",
            "quote_author": quote_tweet["result"]["core"]["user_results"]["result"].get(user_key, {}).get("screen_name", "") if (quote_tweet and not quote_withheld) else "",
            "quote_body": quote_tweet["result"]["legacy"].get("full_text", "") if quote_tweet and not quote_withheld else "",
            "quote_images": ",".join(
                [media["media_url_https"] for media in quote_tweet["result"]["legacy"].get("entities", {}).get("media", [])
                 if media["type"] == "photo"]) if quote_tweet and not quote_withheld else "",
            "quote_videos": ",".join(
                [media["media_url_https"] for media in quote_tweet["result"]["legacy"].get("entities", {}).get("media", [])
                 if media["type"] == "video"]) if quote_tweet and not quote_withheld else "",
            "is_quote_withheld": "yes" if quote_withheld else "no",
            "is_reply": "yes" if str(tweet["legacy"]["conversation_id_str"]) != str(tweet["rest_id"]) else "no",
            "replied_author": tweet["legacy"].get("in_reply_to_screen_name", ""),
            "is_withheld": "yes" if withheld else "no",
            "hashtags": ",".join([hashtag["text"] for hashtag in tweet["legacy"]["entities"]["hashtags"]]),
            "urls": ",".join([url.get("expanded_url", url["display_url"]) for url in tweet["legacy"]["entities"]["urls"]]),
            "images": ",".join(images),
            "videos": ",".join(videos),
            "mentions": ",".join([mention["screen_name"] for mention in tweet["legacy"]["entities"]["user_mentions"]]),
            "long_lat": SearchTwitterViaZeeschuimer.get_centroid(
                tweet["legacy"]["place"]["bounding_box"]["coordinates"]) if tweet["legacy"].get("place") else "",
            "place_name": tweet["legacy"].get("place", {}).get("full_name", "") if tweet["legacy"].get("place") else "",
        }

    @staticmethod
    def map_item_legacy(tweet):
        """
        Map a tweet from the older "adaptive" API payload

        Several fields are not (yet) extracted from this format and are
        mapped as empty strings.

        :param dict tweet:  Raw tweet object
        :return dict:  Flat dictionary of mapped fields
        """
        timestamp = datetime.strptime(tweet["legacy"]["created_at"], "%a %b %d %H:%M:%S %z %Y")
        tweet_id = tweet["legacy"]["id_str"]
        withheld = False

        retweet = tweet["legacy"].get("retweeted_status_result")
        if retweet:
            # make sure the full RT is included, by default this is shortened
            if retweet["result"].get("legacy", {}).get("withheld_status"):
                withheld = True
                tweet["legacy"]["full_text"] = retweet["result"]["legacy"]["full_text"]
            else:
                t_text = "RT @" + retweet["result"]["core"]["user_results"]["result"]["legacy"]["screen_name"] + \
                         " " + retweet["result"]["legacy"]["full_text"]
                tweet["legacy"]["full_text"] = t_text

        quote_tweet = tweet.get("quoted_status_result")

        if quote_tweet and "tweet" in quote_tweet.get("result", {}):
            # sometimes this is one level deeper, sometimes not...
            quote_tweet["result"] = quote_tweet["result"]["tweet"]

        # fixed: previously used variants[0] unguarded, which crashed when
        # video_info/variants was absent and could pick a streaming playlist
        # (m3u8) instead of a downloadable file; pick the best-bitrate
        # "video/*" variant, consistent with map_item_modern
        videos = []
        for media in tweet["legacy"].get("extended_entities", {}).get("media", []):
            if media["type"] != "video":
                continue
            variants = [
                variant for variant in media.get("video_info", {}).get("variants", [])
                if variant.get("content_type", "").startswith("video/")
            ]
            if variants:
                videos.append(max(variants, key=lambda v: v.get("bitrate", 0))["url"])

        return {
            "id": tweet_id,
            "thread_id": tweet["legacy"]["conversation_id_str"],
            "timestamp": timestamp.strftime("%Y-%m-%d %H:%M:%S"),
            "unix_timestamp": int(timestamp.timestamp()),
            "link": f"https://x.com/{tweet['user']['screen_name']}/status/{tweet_id}",
            "body": tweet["legacy"]["full_text"],
            "author": tweet["user"]["screen_name"],
            "author_fullname": tweet["user"]["name"],
            "author_id": tweet["user"]["id_str"],
            "author_avatar_url": "",  # todo: add
            "author_banner_url": "",  # todo: add
            "verified": "",  # todo: add
            "source": strip_tags(tweet["legacy"]["source"]),
            "language_guess": tweet["legacy"].get("lang"),
            "possibly_sensitive": "yes" if tweet["legacy"].get("possibly_sensitive") else "no",
            "retweet_count": tweet["legacy"]["retweet_count"],
            "reply_count": tweet["legacy"]["reply_count"],
            "like_count": tweet["legacy"]["favorite_count"],
            "quote_count": tweet["legacy"]["quote_count"],
            "impression_count": tweet.get("ext_views", {}).get("count", ""),
            "is_retweet": "yes" if retweet else "no",
            "retweeted_user": retweet["result"]["core"]["user_results"]["result"].get("legacy", {}).get("screen_name", "") if retweet else "",
            "is_quote_tweet": "yes" if quote_tweet else "no",
            "quote_tweet_id": "",  # todo: add
            "quote_author": quote_tweet["result"]["core"]["user_results"]["result"].get("legacy", {}).get("screen_name", "") if quote_tweet else "",
            "quote_body": "",  # todo: add
            "quote_images": "",  # todo: add
            "quote_videos": "",  # todo: add
            "is_quote_withheld": "",  # todo: add
            "is_reply": "yes" if str(tweet["legacy"]["conversation_id_str"]) != tweet_id else "no",
            # "or ''" collapses None/empty to "", matching the old conditional
            "replied_author": tweet["legacy"].get("in_reply_to_screen_name") or "",
            "is_withheld": "yes" if withheld else "no",
            "hashtags": ",".join([hashtag["text"] for hashtag in tweet["legacy"]["entities"]["hashtags"]]),
            "urls": ",".join([url.get("expanded_url", url["display_url"]) for url in tweet["legacy"]["entities"]["urls"]]),
            "images": ",".join(
                [media["media_url_https"] for media in tweet["legacy"].get("extended_entities", {}).get("media", []) if
                 media["type"] == "photo"]),
            "videos": ",".join(videos),
            "mentions": ",".join([mention["screen_name"] for mention in tweet["legacy"]["entities"]["user_mentions"]]),
            "long_lat": SearchTwitterViaZeeschuimer.get_centroid(
                tweet["legacy"]["place"]["bounding_box"]["coordinates"]) if tweet["legacy"].get("place") else "",
            "place_name": tweet["legacy"].get("place", {}).get("full_name", "") if tweet["legacy"].get("place") else "",
        }

    @staticmethod
    def get_centroid(box):
        """
        Get centre of a rectangular box

        Convenience function for converting X/Twitter's bounding box coordinates
        to a singular coordinate - simply the centre of the box - because that
        is what is expected for mapped output.

        NOTE(review): this averages the first two points of the polygon ring;
        if the API lists adjacent (not opposite) corners first, one axis is
        not actually averaged - confirm point order against live data.

        :param list box: The box as part of X/Twitter's response
        :return str: Coordinate, as longitude,latitude.
        """
        box = box[0]
        return ",".join((
            str(round((box[0][0] + box[1][0]) / 2, 6)),
            str(round((box[0][1] + box[1][1]) / 2, 6)),
        ))
class SearchTwitterViaZeeschuimer(Search):
    """
    Import scraped X/Twitter data
    """
    type = "twitter-import"  # job ID
    category = "Search"  # category
    title = "Import scraped X/Twitter data"  # title displayed in UI
    description = "Import X/Twitter data collected with an external tool such as Zeeschuimer."  # description displayed in UI
    extension = "ndjson"  # extension of result file, used internally and in UI
    is_from_zeeschuimer = True

    # not available as a processor for existing datasets
    accepts = []
    references = [
        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)"
    ]

    def get_items(self, query):
        """
        Run custom search

        Not available for Twitter; datasets of this type can only be created
        by importing files collected externally (e.g. with Zeeschuimer).

        :param query:  Query parameters (unused)
        :raises NotImplementedError:  Always
        """
        raise NotImplementedError("Twitter datasets can only be created by importing data from elsewhere")

    @staticmethod
    def map_item(item):
        """
        Map a raw imported item to 4CAT's flat field format

        Dispatches on the payload format: items carrying a `rest_id` come
        from the current GraphQL API, items of type "adaptive" from the
        older adaptive search API.

        :param dict item:  Raw item as parsed from the imported ndjson file
        :return MappedItem:  Mapped item
        :raises NotImplementedError:  For unrecognised payload formats
        """
        if item.get("rest_id"):
            return MappedItem(SearchTwitterViaZeeschuimer.map_item_modern(item))
        elif item.get("type") == "adaptive":
            return MappedItem(SearchTwitterViaZeeschuimer.map_item_legacy(item))
        else:
            raise NotImplementedError("Unknown X/Twitter payload format")

    @staticmethod
    def map_item_modern(tweet):
        """
        Map a tweet from the GraphQL ("modern") API payload

        :param dict tweet:  Raw tweet object
        :return dict:  Flat dictionary of mapped fields
        """
        # Sometimes a "core" key appears in user_results, sometimes not.
        # This has effect on where to get user data.
        has_core = tweet.get("core", {}).get("user_results", {}).get("result", {}).get("core", False)
        user_key = "core" if has_core else "legacy"

        timestamp = datetime.strptime(tweet["legacy"]["created_at"], "%a %b %d %H:%M:%S %z %Y")
        withheld = False

        retweet = tweet["legacy"].get("retweeted_status_result")
        if retweet:
            # make sure the full RT is included, by default this is shortened
            if "tweet" in retweet["result"]:
                retweet["result"] = retweet["result"]["tweet"]

            if retweet["result"].get("legacy", {}).get("withheld_scope"):
                withheld = True
                tweet["legacy"]["full_text"] = retweet["result"]["legacy"]["full_text"]
            else:
                # NOTE(review): user_key was derived from the *outer* tweet's
                # user object; assumed to apply to the retweeted user as well
                t_text = "RT @" + retweet["result"]["core"]["user_results"]["result"][user_key]["screen_name"] + \
                         ": " + retweet["result"]["legacy"]["full_text"]
                tweet["legacy"]["full_text"] = t_text

        quote_tweet = tweet.get("quoted_status_result")
        if quote_tweet and "tweet" in quote_tweet.get("result", {}):
            # sometimes this is one level deeper, sometimes not...
            quote_tweet["result"] = quote_tweet["result"]["tweet"]
        # check if the quote tweet is available or not
        quote_withheld = bool(quote_tweet and "tombstone" in quote_tweet["result"])

        # extract media from tweet; if video, add thumbnail to images and video link to videos
        images = set()
        videos = set()

        # Process media from extended_entities for videos and photos
        for media in tweet["legacy"].get("extended_entities", {}).get("media", []):
            if media["type"] == "photo":
                images.add(media["media_url_https"])
            elif media["type"] == "video":
                # Add video thumbnail to images
                images.add(media["media_url_https"])
                # Add actual video URL to videos if available
                if media.get("video_info", {}).get("variants"):
                    # Filter variants to get video files (not streaming playlists)
                    video_variants = [
                        variant for variant in media["video_info"]["variants"]
                        if variant.get("content_type", "").startswith("video/")
                    ]
                    if video_variants:
                        # Sort by bitrate (highest first) to get best quality
                        video_variants.sort(key=lambda x: x.get("bitrate", 0), reverse=True)
                        videos.add(video_variants[0]["url"])

        # Also check entities.media for any additional photos not in extended_entities
        for media in tweet["legacy"]["entities"].get("media", []):
            if media["type"] == "photo":
                images.add(media["media_url_https"])

        return {
            "id": tweet["rest_id"],
            "thread_id": tweet["legacy"]["conversation_id_str"],
            "timestamp": timestamp.strftime("%Y-%m-%d %H:%M:%S"),
            "unix_timestamp": int(timestamp.timestamp()),
            # fixed: was tweet['id'] - GraphQL payloads identify tweets by "rest_id"
            "link": f"https://x.com/{tweet['core']['user_results']['result'][user_key]['screen_name']}/status/{tweet['rest_id']}",
            "body": tweet["legacy"]["full_text"],
            "author": tweet["core"]["user_results"]["result"][user_key]["screen_name"],
            "author_fullname": tweet["core"]["user_results"]["result"][user_key]["name"],
            "author_id": tweet["legacy"]["user_id_str"],
            "author_avatar_url": tweet["core"]["user_results"]["result"]["avatar"]["image_url"] if "avatar" in tweet["core"]["user_results"]["result"] else tweet["core"]["user_results"]["result"]["legacy"].get("profile_image_url_https", ""),
            "author_banner_url": tweet["core"]["user_results"]["result"]["legacy"].get("profile_banner_url", ""),  # key does not exist when author does not have a banner
            "verified": tweet["core"]["user_results"]["result"].get("is_blue_verified", ""),
            "source": strip_tags(tweet["source"]),
            "language_guess": tweet["legacy"].get("lang"),
            "possibly_sensitive": "yes" if tweet.get("possibly_sensitive", False) or tweet["legacy"].get("possibly_sensitive", False) else "no",
            "retweet_count": tweet["legacy"]["retweet_count"],
            "reply_count": tweet["legacy"]["reply_count"],
            "like_count": tweet["legacy"]["favorite_count"],
            "quote_count": tweet["legacy"]["quote_count"],
            "impression_count": tweet.get("views", {}).get("count", ""),
            "is_retweet": "yes" if retweet else "no",
            # fixed: use user_key here too - when the user object nests under
            # "core", "legacy" does not contain screen_name (cf. quote_author)
            "retweeted_user": retweet["result"]["core"]["user_results"]["result"].get(user_key, {}).get("screen_name", "") if retweet else "",
            "is_quote_tweet": "yes" if quote_tweet else "no",
            "quote_tweet_id": quote_tweet["result"].get("rest_id", "") if quote_tweet else "",
            "quote_author": quote_tweet["result"]["core"]["user_results"]["result"].get(user_key, {}).get("screen_name", "") if (quote_tweet and not quote_withheld) else "",
            "quote_body": quote_tweet["result"]["legacy"].get("full_text", "") if quote_tweet and not quote_withheld else "",
            "quote_images": ",".join(
                [media["media_url_https"] for media in quote_tweet["result"]["legacy"].get("entities", {}).get("media", [])
                 if media["type"] == "photo"]) if quote_tweet and not quote_withheld else "",
            "quote_videos": ",".join(
                [media["media_url_https"] for media in quote_tweet["result"]["legacy"].get("entities", {}).get("media", [])
                 if media["type"] == "video"]) if quote_tweet and not quote_withheld else "",
            "is_quote_withheld": "yes" if quote_withheld else "no",
            "is_reply": "yes" if str(tweet["legacy"]["conversation_id_str"]) != str(tweet["rest_id"]) else "no",
            "replied_author": tweet["legacy"].get("in_reply_to_screen_name", ""),
            "is_withheld": "yes" if withheld else "no",
            "hashtags": ",".join([hashtag["text"] for hashtag in tweet["legacy"]["entities"]["hashtags"]]),
            "urls": ",".join([url.get("expanded_url", url["display_url"]) for url in tweet["legacy"]["entities"]["urls"]]),
            "images": ",".join(images),
            "videos": ",".join(videos),
            "mentions": ",".join([mention["screen_name"] for mention in tweet["legacy"]["entities"]["user_mentions"]]),
            "long_lat": SearchTwitterViaZeeschuimer.get_centroid(
                tweet["legacy"]["place"]["bounding_box"]["coordinates"]) if tweet["legacy"].get("place") else "",
            "place_name": tweet["legacy"].get("place", {}).get("full_name", "") if tweet["legacy"].get("place") else "",
        }

    @staticmethod
    def map_item_legacy(tweet):
        """
        Map a tweet from the older "adaptive" API payload

        Several fields are not (yet) extracted from this format and are
        mapped as empty strings.

        :param dict tweet:  Raw tweet object
        :return dict:  Flat dictionary of mapped fields
        """
        timestamp = datetime.strptime(tweet["legacy"]["created_at"], "%a %b %d %H:%M:%S %z %Y")
        tweet_id = tweet["legacy"]["id_str"]
        withheld = False

        retweet = tweet["legacy"].get("retweeted_status_result")
        if retweet:
            # make sure the full RT is included, by default this is shortened
            if retweet["result"].get("legacy", {}).get("withheld_status"):
                withheld = True
                tweet["legacy"]["full_text"] = retweet["result"]["legacy"]["full_text"]
            else:
                t_text = "RT @" + retweet["result"]["core"]["user_results"]["result"]["legacy"]["screen_name"] + \
                         " " + retweet["result"]["legacy"]["full_text"]
                tweet["legacy"]["full_text"] = t_text

        quote_tweet = tweet.get("quoted_status_result")

        if quote_tweet and "tweet" in quote_tweet.get("result", {}):
            # sometimes this is one level deeper, sometimes not...
            quote_tweet["result"] = quote_tweet["result"]["tweet"]

        # fixed: previously used variants[0] unguarded, which crashed when
        # video_info/variants was absent and could pick a streaming playlist
        # (m3u8) instead of a downloadable file; pick the best-bitrate
        # "video/*" variant, consistent with map_item_modern
        videos = []
        for media in tweet["legacy"].get("extended_entities", {}).get("media", []):
            if media["type"] != "video":
                continue
            variants = [
                variant for variant in media.get("video_info", {}).get("variants", [])
                if variant.get("content_type", "").startswith("video/")
            ]
            if variants:
                videos.append(max(variants, key=lambda v: v.get("bitrate", 0))["url"])

        return {
            "id": tweet_id,
            "thread_id": tweet["legacy"]["conversation_id_str"],
            "timestamp": timestamp.strftime("%Y-%m-%d %H:%M:%S"),
            "unix_timestamp": int(timestamp.timestamp()),
            "link": f"https://x.com/{tweet['user']['screen_name']}/status/{tweet_id}",
            "body": tweet["legacy"]["full_text"],
            "author": tweet["user"]["screen_name"],
            "author_fullname": tweet["user"]["name"],
            "author_id": tweet["user"]["id_str"],
            "author_avatar_url": "",  # todo: add
            "author_banner_url": "",  # todo: add
            "verified": "",  # todo: add
            "source": strip_tags(tweet["legacy"]["source"]),
            "language_guess": tweet["legacy"].get("lang"),
            "possibly_sensitive": "yes" if tweet["legacy"].get("possibly_sensitive") else "no",
            "retweet_count": tweet["legacy"]["retweet_count"],
            "reply_count": tweet["legacy"]["reply_count"],
            "like_count": tweet["legacy"]["favorite_count"],
            "quote_count": tweet["legacy"]["quote_count"],
            "impression_count": tweet.get("ext_views", {}).get("count", ""),
            "is_retweet": "yes" if retweet else "no",
            "retweeted_user": retweet["result"]["core"]["user_results"]["result"].get("legacy", {}).get("screen_name", "") if retweet else "",
            "is_quote_tweet": "yes" if quote_tweet else "no",
            "quote_tweet_id": "",  # todo: add
            "quote_author": quote_tweet["result"]["core"]["user_results"]["result"].get("legacy", {}).get("screen_name", "") if quote_tweet else "",
            "quote_body": "",  # todo: add
            "quote_images": "",  # todo: add
            "quote_videos": "",  # todo: add
            "is_quote_withheld": "",  # todo: add
            "is_reply": "yes" if str(tweet["legacy"]["conversation_id_str"]) != tweet_id else "no",
            # "or ''" collapses None/empty to "", matching the old conditional
            "replied_author": tweet["legacy"].get("in_reply_to_screen_name") or "",
            "is_withheld": "yes" if withheld else "no",
            "hashtags": ",".join([hashtag["text"] for hashtag in tweet["legacy"]["entities"]["hashtags"]]),
            "urls": ",".join([url.get("expanded_url", url["display_url"]) for url in tweet["legacy"]["entities"]["urls"]]),
            "images": ",".join(
                [media["media_url_https"] for media in tweet["legacy"].get("extended_entities", {}).get("media", []) if
                 media["type"] == "photo"]),
            "videos": ",".join(videos),
            "mentions": ",".join([mention["screen_name"] for mention in tweet["legacy"]["entities"]["user_mentions"]]),
            "long_lat": SearchTwitterViaZeeschuimer.get_centroid(
                tweet["legacy"]["place"]["bounding_box"]["coordinates"]) if tweet["legacy"].get("place") else "",
            "place_name": tweet["legacy"].get("place", {}).get("full_name", "") if tweet["legacy"].get("place") else "",
        }

    @staticmethod
    def get_centroid(box):
        """
        Get centre of a rectangular box

        Convenience function for converting X/Twitter's bounding box coordinates
        to a singular coordinate - simply the centre of the box - because that
        is what is expected for mapped output.

        NOTE(review): this averages the first two points of the polygon ring;
        if the API lists adjacent (not opposite) corners first, one axis is
        not actually averaged - confirm point order against live data.

        :param list box: The box as part of X/Twitter's response
        :return str: Coordinate, as longitude,latitude.
        """
        box = box[0]
        return ",".join((
            str(round((box[0][0] + box[1][0]) / 2, 6)),
            str(round((box[0][1] + box[1][1]) / 2, 6)),
        ))
Import scraped X/Twitter data
references =
['[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)', '[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)']
def
get_items(self, query):
def get_items(self, query):
    """
    Run custom search

    Not available for Twitter; datasets of this type can only be created by
    importing files collected externally (e.g. with Zeeschuimer).

    :param query:  Query parameters (unused)
    :raises NotImplementedError:  Always
    """
    raise NotImplementedError("Twitter datasets can only be created by importing data from elsewhere")
Run custom search
Not available for Twitter
@staticmethod
def
map_item_modern(tweet):
@staticmethod
def map_item_modern(tweet):
    """
    Map a tweet from the GraphQL ("modern") API payload

    :param dict tweet:  Raw tweet object
    :return dict:  Flat dictionary of mapped fields
    """
    # Sometimes a "core" key appears in user_results, sometimes not.
    # This has effect on where to get user data.
    has_core = tweet.get("core", {}).get("user_results", {}).get("result", {}).get("core", False)
    user_key = "core" if has_core else "legacy"

    timestamp = datetime.strptime(tweet["legacy"]["created_at"], "%a %b %d %H:%M:%S %z %Y")
    withheld = False

    retweet = tweet["legacy"].get("retweeted_status_result")
    if retweet:
        # make sure the full RT is included, by default this is shortened
        if "tweet" in retweet["result"]:
            retweet["result"] = retweet["result"]["tweet"]

        if retweet["result"].get("legacy", {}).get("withheld_scope"):
            withheld = True
            tweet["legacy"]["full_text"] = retweet["result"]["legacy"]["full_text"]
        else:
            # NOTE(review): user_key was derived from the *outer* tweet's
            # user object; assumed to apply to the retweeted user as well
            t_text = "RT @" + retweet["result"]["core"]["user_results"]["result"][user_key]["screen_name"] + \
                     ": " + retweet["result"]["legacy"]["full_text"]
            tweet["legacy"]["full_text"] = t_text

    quote_tweet = tweet.get("quoted_status_result")
    if quote_tweet and "tweet" in quote_tweet.get("result", {}):
        # sometimes this is one level deeper, sometimes not...
        quote_tweet["result"] = quote_tweet["result"]["tweet"]
    # check if the quote tweet is available or not
    quote_withheld = bool(quote_tweet and "tombstone" in quote_tweet["result"])

    # extract media from tweet; if video, add thumbnail to images and video link to videos
    images = set()
    videos = set()

    # Process media from extended_entities for videos and photos
    for media in tweet["legacy"].get("extended_entities", {}).get("media", []):
        if media["type"] == "photo":
            images.add(media["media_url_https"])
        elif media["type"] == "video":
            # Add video thumbnail to images
            images.add(media["media_url_https"])
            # Add actual video URL to videos if available
            if media.get("video_info", {}).get("variants"):
                # Filter variants to get video files (not streaming playlists)
                video_variants = [
                    variant for variant in media["video_info"]["variants"]
                    if variant.get("content_type", "").startswith("video/")
                ]
                if video_variants:
                    # Sort by bitrate (highest first) to get best quality
                    video_variants.sort(key=lambda x: x.get("bitrate", 0), reverse=True)
                    videos.add(video_variants[0]["url"])

    # Also check entities.media for any additional photos not in extended_entities
    for media in tweet["legacy"]["entities"].get("media", []):
        if media["type"] == "photo":
            images.add(media["media_url_https"])

    return {
        "id": tweet["rest_id"],
        "thread_id": tweet["legacy"]["conversation_id_str"],
        "timestamp": timestamp.strftime("%Y-%m-%d %H:%M:%S"),
        "unix_timestamp": int(timestamp.timestamp()),
        # fixed: was tweet['id'] - GraphQL payloads identify tweets by "rest_id"
        "link": f"https://x.com/{tweet['core']['user_results']['result'][user_key]['screen_name']}/status/{tweet['rest_id']}",
        "body": tweet["legacy"]["full_text"],
        "author": tweet["core"]["user_results"]["result"][user_key]["screen_name"],
        "author_fullname": tweet["core"]["user_results"]["result"][user_key]["name"],
        "author_id": tweet["legacy"]["user_id_str"],
        "author_avatar_url": tweet["core"]["user_results"]["result"]["avatar"]["image_url"] if "avatar" in tweet["core"]["user_results"]["result"] else tweet["core"]["user_results"]["result"]["legacy"].get("profile_image_url_https", ""),
        "author_banner_url": tweet["core"]["user_results"]["result"]["legacy"].get("profile_banner_url", ""),  # key does not exist when author does not have a banner
        "verified": tweet["core"]["user_results"]["result"].get("is_blue_verified", ""),
        "source": strip_tags(tweet["source"]),
        "language_guess": tweet["legacy"].get("lang"),
        "possibly_sensitive": "yes" if tweet.get("possibly_sensitive", False) or tweet["legacy"].get("possibly_sensitive", False) else "no",
        "retweet_count": tweet["legacy"]["retweet_count"],
        "reply_count": tweet["legacy"]["reply_count"],
        "like_count": tweet["legacy"]["favorite_count"],
        "quote_count": tweet["legacy"]["quote_count"],
        "impression_count": tweet.get("views", {}).get("count", ""),
        "is_retweet": "yes" if retweet else "no",
        # fixed: use user_key here too - when the user object nests under
        # "core", "legacy" does not contain screen_name (cf. quote_author)
        "retweeted_user": retweet["result"]["core"]["user_results"]["result"].get(user_key, {}).get("screen_name", "") if retweet else "",
        "is_quote_tweet": "yes" if quote_tweet else "no",
        "quote_tweet_id": quote_tweet["result"].get("rest_id", "") if quote_tweet else "",
        "quote_author": quote_tweet["result"]["core"]["user_results"]["result"].get(user_key, {}).get("screen_name", "") if (quote_tweet and not quote_withheld) else "",
        "quote_body": quote_tweet["result"]["legacy"].get("full_text", "") if quote_tweet and not quote_withheld else "",
        "quote_images": ",".join(
            [media["media_url_https"] for media in quote_tweet["result"]["legacy"].get("entities", {}).get("media", [])
             if media["type"] == "photo"]) if quote_tweet and not quote_withheld else "",
        "quote_videos": ",".join(
            [media["media_url_https"] for media in quote_tweet["result"]["legacy"].get("entities", {}).get("media", [])
             if media["type"] == "video"]) if quote_tweet and not quote_withheld else "",
        "is_quote_withheld": "yes" if quote_withheld else "no",
        "is_reply": "yes" if str(tweet["legacy"]["conversation_id_str"]) != str(tweet["rest_id"]) else "no",
        "replied_author": tweet["legacy"].get("in_reply_to_screen_name", ""),
        "is_withheld": "yes" if withheld else "no",
        "hashtags": ",".join([hashtag["text"] for hashtag in tweet["legacy"]["entities"]["hashtags"]]),
        "urls": ",".join([url.get("expanded_url", url["display_url"]) for url in tweet["legacy"]["entities"]["urls"]]),
        "images": ",".join(images),
        "videos": ",".join(videos),
        "mentions": ",".join([mention["screen_name"] for mention in tweet["legacy"]["entities"]["user_mentions"]]),
        "long_lat": SearchTwitterViaZeeschuimer.get_centroid(
            tweet["legacy"]["place"]["bounding_box"]["coordinates"]) if tweet["legacy"].get("place") else "",
        "place_name": tweet["legacy"].get("place", {}).get("full_name", "") if tweet["legacy"].get("place") else "",
    }
@staticmethod
def
map_item_legacy(tweet):
@staticmethod
def map_item_legacy(tweet):
    """
    Map a tweet from the older "adaptive" API payload

    Several fields are not (yet) extracted from this format and are mapped
    as empty strings.

    :param dict tweet:  Raw tweet object
    :return dict:  Flat dictionary of mapped fields
    """
    timestamp = datetime.strptime(tweet["legacy"]["created_at"], "%a %b %d %H:%M:%S %z %Y")
    tweet_id = tweet["legacy"]["id_str"]
    withheld = False

    retweet = tweet["legacy"].get("retweeted_status_result")
    if retweet:
        # make sure the full RT is included, by default this is shortened
        if retweet["result"].get("legacy", {}).get("withheld_status"):
            withheld = True
            tweet["legacy"]["full_text"] = retweet["result"]["legacy"]["full_text"]
        else:
            t_text = "RT @" + retweet["result"]["core"]["user_results"]["result"]["legacy"]["screen_name"] + \
                     " " + retweet["result"]["legacy"]["full_text"]
            tweet["legacy"]["full_text"] = t_text

    quote_tweet = tweet.get("quoted_status_result")

    if quote_tweet and "tweet" in quote_tweet.get("result", {}):
        # sometimes this is one level deeper, sometimes not...
        quote_tweet["result"] = quote_tweet["result"]["tweet"]

    # fixed: previously used variants[0] unguarded, which crashed when
    # video_info/variants was absent and could pick a streaming playlist
    # (m3u8) instead of a downloadable file; pick the best-bitrate
    # "video/*" variant, consistent with map_item_modern
    videos = []
    for media in tweet["legacy"].get("extended_entities", {}).get("media", []):
        if media["type"] != "video":
            continue
        variants = [
            variant for variant in media.get("video_info", {}).get("variants", [])
            if variant.get("content_type", "").startswith("video/")
        ]
        if variants:
            videos.append(max(variants, key=lambda v: v.get("bitrate", 0))["url"])

    return {
        "id": tweet_id,
        "thread_id": tweet["legacy"]["conversation_id_str"],
        "timestamp": timestamp.strftime("%Y-%m-%d %H:%M:%S"),
        "unix_timestamp": int(timestamp.timestamp()),
        "link": f"https://x.com/{tweet['user']['screen_name']}/status/{tweet_id}",
        "body": tweet["legacy"]["full_text"],
        "author": tweet["user"]["screen_name"],
        "author_fullname": tweet["user"]["name"],
        "author_id": tweet["user"]["id_str"],
        "author_avatar_url": "",  # todo: add
        "author_banner_url": "",  # todo: add
        "verified": "",  # todo: add
        "source": strip_tags(tweet["legacy"]["source"]),
        "language_guess": tweet["legacy"].get("lang"),
        "possibly_sensitive": "yes" if tweet["legacy"].get("possibly_sensitive") else "no",
        "retweet_count": tweet["legacy"]["retweet_count"],
        "reply_count": tweet["legacy"]["reply_count"],
        "like_count": tweet["legacy"]["favorite_count"],
        "quote_count": tweet["legacy"]["quote_count"],
        "impression_count": tweet.get("ext_views", {}).get("count", ""),
        "is_retweet": "yes" if retweet else "no",
        "retweeted_user": retweet["result"]["core"]["user_results"]["result"].get("legacy", {}).get("screen_name", "") if retweet else "",
        "is_quote_tweet": "yes" if quote_tweet else "no",
        "quote_tweet_id": "",  # todo: add
        "quote_author": quote_tweet["result"]["core"]["user_results"]["result"].get("legacy", {}).get("screen_name", "") if quote_tweet else "",
        "quote_body": "",  # todo: add
        "quote_images": "",  # todo: add
        "quote_videos": "",  # todo: add
        "is_quote_withheld": "",  # todo: add
        "is_reply": "yes" if str(tweet["legacy"]["conversation_id_str"]) != tweet_id else "no",
        # "or ''" collapses None/empty to "", matching the old conditional
        "replied_author": tweet["legacy"].get("in_reply_to_screen_name") or "",
        "is_withheld": "yes" if withheld else "no",
        "hashtags": ",".join([hashtag["text"] for hashtag in tweet["legacy"]["entities"]["hashtags"]]),
        "urls": ",".join([url.get("expanded_url", url["display_url"]) for url in tweet["legacy"]["entities"]["urls"]]),
        "images": ",".join(
            [media["media_url_https"] for media in tweet["legacy"].get("extended_entities", {}).get("media", []) if
             media["type"] == "photo"]),
        "videos": ",".join(videos),
        "mentions": ",".join([mention["screen_name"] for mention in tweet["legacy"]["entities"]["user_mentions"]]),
        "long_lat": SearchTwitterViaZeeschuimer.get_centroid(
            tweet["legacy"]["place"]["bounding_box"]["coordinates"]) if tweet["legacy"].get("place") else "",
        "place_name": tweet["legacy"].get("place", {}).get("full_name", "") if tweet["legacy"].get("place") else "",
    }
@staticmethod
def
get_centroid(box):
230 @staticmethod 231 def get_centroid(box): 232 """ 233 Get centre of a rectangular box 234 235 Convenience function for converting X/Twitter's bounding box coordinates 236 to a singular coordinate - simply the centre of the box - because that 237 is what is expected for mapped output. 238 239 :param list box: The box as part of X/Twitter's response 240 :return str: Coordinate, as longitude,latitude. 241 """ 242 box = box[0] 243 return ",".join(( 244 str(round((box[0][0] + box[1][0]) / 2, 6)), 245 str(round((box[0][1] + box[1][1]) / 2, 6)), 246 ))
Get centre of a rectangular box
Convenience function for converting X/Twitter's bounding box coordinates to a singular coordinate - simply the centre of the box - because that is what is expected for mapped output.
Parameters
- list box: The box as part of X/Twitter's response
Returns
Coordinate, as longitude,latitude.
Inherited Members
- backend.lib.worker.BasicWorker
- BasicWorker
- INTERRUPT_NONE
- INTERRUPT_RETRY
- INTERRUPT_CANCEL
- queue
- log
- manager
- interrupted
- modules
- init_time
- name
- run
- clean_up
- request_interrupt
- is_4cat_class
- backend.lib.search.Search
- max_workers
- prefix
- return_cols
- import_error_count
- import_warning_count
- process
- search
- import_from_file
- items_to_csv
- items_to_ndjson
- items_to_archive
- backend.lib.processor.BasicProcessor
- db
- job
- dataset
- owner
- source_dataset
- source_file
- config
- is_running_in_preset
- filepath
- work
- after_process
- remove_files
- abort
- iterate_proxied_requests
- push_proxied_request
- flush_proxied_requests
- iterate_archive_contents
- unpack_archive_contents
- extract_archived_file_by_name
- write_csv_items_and_finish
- write_archive_and_finish
- create_standalone
- save_annotations
- map_item_method_available
- get_mapped_item
- is_filter
- get_options
- get_status
- is_top_dataset
- is_from_collector
- get_extension
- is_rankable
- exclude_followup_processors
- is_4cat_processor