datasources.dmi-tcat.search_tcat
Twitter search within a DMI-TCAT bin; connect via TCAT frontend
1""" 2Twitter search within a DMI-TCAT bin; connect via TCAT frontend 3""" 4import requests 5import datetime 6import csv 7import json 8import re 9import io 10 11from backend.lib.search import Search 12from common.lib.exceptions import QueryParametersException 13from common.lib.user_input import UserInput 14from common.lib.helpers import sniff_encoding 15from common.lib.item_mapping import MappedItem 16from common.config_manager import config 17 18from datasources.twitterv2.search_twitter import SearchWithTwitterAPIv2 19 20 21class SearchWithinTCATBins(Search): 22 """ 23 Get Tweets via DMI-TCAT 24 25 This allows subsetting an existing query bin, similar to the 'Data 26 Selection' panel in the DMI-TCAT analysis interface 27 """ 28 type = "dmi-tcat-search" # job ID 29 extension = "ndjson" 30 title = "TCAT Search (HTTP)" 31 32 # TCAT has a few fields that do not exist in APIv2 33 additional_TCAT_fields = ["to_user_name", "filter_level", "favorite_count", "truncated", "from_user_favourites_count", "from_user_lang", "from_user_utcoffset", 34 "from_user_timezone"] 35 36 options = { 37 "intro-1": { 38 "type": UserInput.OPTION_INFO, 39 "help": "This data source interfaces with a DMI-TCAT instance to allow subsetting of tweets from a tweet " 40 "bin in that instance." 41 }, 42 "divider-1": { 43 "type": UserInput.OPTION_DIVIDER 44 }, 45 "bin": { 46 "type": UserInput.OPTION_INFO, 47 "help": "Query bin" 48 }, 49 "query": { 50 "type": UserInput.OPTION_TEXT, 51 "help": "Query text", 52 "tooltip": "Match all tweets containing this text." 53 }, 54 "query-exclude": { 55 "type": UserInput.OPTION_TEXT, 56 "help": "Exclude text", 57 "tooltip": "Match all tweets that do NOT contain this text." 58 }, 59 "user-name": { 60 "type": UserInput.OPTION_TEXT, 61 "help": "From user", 62 "tooltip": "Match all tweets from this username." 63 }, 64 "user-exclude": { 65 "type": UserInput.OPTION_TEXT, 66 "help": "Exclude user", 67 "tooltip": "Match all tweets NOT from this username." 68 }, 69 "exclude-replies": { 70 "type": UserInput.OPTION_CHOICE, 71 "options": { 72 "exclude": "Exclude replies", 73 "include": "Include replies" 74 }, 75 "help": "Reply tweets", 76 "default": "include", 77 "tooltip": "Choose to exclude or include tweets that are replies from the data" 78 }, 79 "daterange": { 80 "type": UserInput.OPTION_DATERANGE, 81 "help": "Date range" 82 }, 83 # Advanced Options Section 84 "divider-2": { 85 "type": UserInput.OPTION_DIVIDER 86 }, 87 "advanced_options_info": { 88 "type": UserInput.OPTION_INFO, 89 "help": "Advanced Query Options can further refine your query" 90 }, 91 "user-bio": { 92 "type": UserInput.OPTION_TEXT, 93 "help": "User bio text", 94 "tooltip": "Match all tweets from users with biographies containing this text." 95 }, 96 "user-language": { 97 "type": UserInput.OPTION_TEXT, 98 "help": "User language", 99 "tooltip": "Match all tweets from users using this language (as detected by Twitter)." 100 }, 101 "tweet-language": { 102 "type": UserInput.OPTION_TEXT, 103 "help": "Tweet language", 104 "tooltip": "Match all tweets from users with this language (as detected by Twitter)." 105 }, 106 "tweet-client": { 107 "type": UserInput.OPTION_TEXT, 108 "help": "Twitter client URL/descr", 109 "tooltip": "Match all tweets from clients that match this text." 110 }, 111 "url": { 112 "type": UserInput.OPTION_TEXT, 113 "help": "(Part of) URL", 114 "tooltip": "Match all tweets containing this (partial) URL." 
115 }, 116 "url-media": { 117 "type": UserInput.OPTION_TEXT, 118 "help": "(Part of) media URL", 119 "tooltip": "Match all tweets containing this (partial) media URL." 120 }, 121 } 122 123 config = { 124 "dmi-tcat-search.instances": { 125 "type": UserInput.OPTION_TEXT_JSON, 126 "help": "DMI-TCAT instances", 127 "tooltip": 'List of DMI-TCAT instance URLs, e.g. ["http://username:password@tcat.instance.webpage.net"]. ' 128 'This needs to be formatted as a JSON list of strings.', 129 "default": {} 130 } 131 } 132 133 bin_data = { 134 "all_bins": {}, 135 "last_collected": {}, 136 } 137 138 @classmethod 139 def collect_all_bins(cls, force_update=False): 140 """ 141 Requests bin information from TCAT instances 142 """ 143 instances = config.get("dmi-tcat-search.instances", []) 144 for instance in instances: 145 # query each configured TCAT instance for a list of bins that can 146 # be subsetted 147 instance = instance.rstrip("/") 148 api_url = instance + "/api/bin-stats.php" 149 150 if force_update or instance not in cls.bin_data["last_collected"] or datetime.datetime.now()-datetime.timedelta(days=1) >= cls.bin_data["last_collected"][instance]: 151 # Collect Instance data 152 try: 153 api_request = requests.get(api_url, timeout=5) 154 instance_bins = json.loads(api_request.content) 155 cls.bin_data["all_bins"][instance] = {k: instance_bins[k] for k in sorted(instance_bins)} 156 cls.bin_data["last_collected"][instance] = datetime.datetime.now() 157 except (requests.RequestException, json.JSONDecodeError): 158 cls.bin_data["all_bins"][instance] = {"failed": True} 159 # TODO: No logger here as nothing has been initialized 160 # print(f"WARNING, unable to collect TCAT bins from instance {instance}") 161 pass 162 163 @classmethod 164 def get_options(cls, parent_dataset=None, user=None): 165 """ 166 Get data source options 167 168 This method takes the pre-defined options, but fills the 'bins' options 169 with bins currently available from the configured TCAT instances. 170 171 :param DataSet parent_dataset: An object representing the dataset that 172 the processor would be run on 173 :param User user: Flask user the options will be displayed for, in 174 case they are requested for display in the 4CAT web interface. This can 175 be used to show some options only to privileges users. 176 """ 177 options = cls.options 178 179 cls.collect_all_bins() 180 if all([data.get("failed", False) for instance, data in cls.bin_data["all_bins"].items()]): 181 options["bin"] = { 182 "type": UserInput.OPTION_INFO, 183 "help": "Could not connect to DMI-TCAT instance(s)." 
184 } 185 return options 186 187 options["bin"] = { 188 "type": UserInput.OPTION_CHOICE, 189 "options": {}, 190 "help": "Query bin" 191 } 192 193 for instance, bins in cls.bin_data["all_bins"].items(): 194 # make the host somewhat human-readable 195 # also strip out embedded HTTP auths 196 host = re.sub(r"^https?://", "", instance).split("@").pop() 197 for bin_name, bin in bins.items(): 198 bin_key = "%s@%s" % (bin_name, host) 199 display_text = f"{bin_name}: {bin.get('tweets_approximate')} tweets from {bin.get('range').get('first_tweet')} to {bin.get('range').get('last_tweet')}" 200 options["bin"]["options"][bin_key] = display_text 201 202 return options 203 204 def get_items(self, query): 205 """ 206 Use the DMI-TCAT tweet export to retrieve tweets 207 208 :param query: 209 :return: 210 """ 211 bin = self.parameters.get("bin") 212 bin_name = bin.split("@")[0] 213 bin_host = bin.split("@").pop() 214 215 # we cannot store the full instance URL as a parameter, because it may 216 # contain sensitive information (e.g. HTTP auth) - so we find the full 217 # instance URL again here 218 # while the parameter could be marked 'sensitive', the values would 219 # still show up in e.g. the HTML of the 'create dataset' form 220 available_instances = config.get("dmi-tcat-search.instances", []) 221 instance_url = "" 222 instance = None 223 for available_instance in available_instances: 224 hostname = re.sub(r"https?://", "", available_instance).split("@").pop().rstrip("/") 225 if hostname == bin_host: 226 instance_url = available_instance 227 instance = available_instance.rstrip("/") 228 break 229 230 if not instance_url: 231 return self.dataset.finish_with_error("Invalid DMI-TCAT instance name '%s'" % bin_host) 232 233 # Collect the bins again (ensure we have updated info in case bin is still active) 234 self.collect_all_bins(force_update=True) 235 # Add metadata to parameters 236 try: 237 current_bin = self.bin_data["all_bins"][instance][bin_name] 238 except KeyError: 239 return self.dataset.finish_with_error(f"Lost connection to TCAT instance {bin_host}") 240 # Add TCAT metadata to dataset 241 self.dataset.tcat_bin_data = current_bin 242 if current_bin.get("type") in ["follow", "track", "timeline", "geotrack"] and ("phrase_times" not in current_bin or not "user_times" not in current_bin): 243 self.dataset.update_status("Warning: TCAT not updated to send phrase and user time ranges; consider updating if you would like to retain this BIN metadata.") 244 245 # now get the parameters... 
246 request_url = instance_url.rstrip("/") + "/analysis/mod.export_tweets.php" 247 248 # Allow for blank dates 249 if self.parameters.get("min_date"): 250 start_date = datetime.datetime.fromtimestamp(self.parameters.get("min_date")).strftime("%Y-%m-%d") 251 else: 252 first_tweet_timestamp = current_bin.get('range').get('first_tweet') 253 start_date = datetime.datetime.strptime(first_tweet_timestamp, "%Y-%m-%d %H:%M:%S").strftime("%Y-%m-%d") 254 255 end_date = datetime.datetime.fromtimestamp(self.parameters.get("max_date")).strftime("%Y-%m-%d") if self.parameters.get("max_date") else (datetime.datetime.now() + datetime.timedelta(days=1)).strftime("%Y-%m-%d") 256 parameters = { 257 "dataset": bin_name, 258 "query": self.parameters.get("query"), 259 "url_query": self.parameters.get("url"), 260 "media_url_query": self.parameters.get("url-media"), 261 "exclude": self.parameters.get("query-exclude"), 262 "from_user_name": self.parameters.get("user-name"), 263 "from_user_lang": self.parameters.get("user-language"), 264 "lang": self.parameters.get("tweet-language"), 265 "exclude_from_user_name": self.parameters.get("user-exclude"), 266 "from_source": re.sub(r"<[^>]+>", "", self.parameters.get("tweet-client")), 267 "startdate": start_date, 268 "enddate": end_date, 269 "replyto": "yes" if self.parameters.get("exclude-replies") == "exclude" else "no", 270 "whattodo": "", 271 "exportSettings": "urls,mentions,hashtags,media,", 272 "graph_resolution": "day", 273 "outputformat": "csv" 274 } 275 276 # for now we simply request the full CSV export of the bin with the 277 # given parameters, letting TCAT handle the full text search and so 278 # on 279 self.dataset.update_status("Searching for tweets on %s" % bin_host) 280 response = requests.get(request_url, params=parameters, stream=True) 281 if response.status_code != 200: 282 return self.dataset.finish_with_error("Query bin not available: received HTTP Error %i" % response.status_code) 283 284 # process the file in 1kB chunks, buffer as we go 285 # If a newline is encountered, the buffer is processed as a row of csv 286 # data. This works as long as there are no newlines in the csv itself, 287 # which is the case for TCAT exports. Processing as a stream is needed 288 # to avoid having to load the full file in memory 289 buffer = bytearray() 290 fieldnames = None 291 items = 0 292 encoding = None 293 for chunk in response.iter_content(chunk_size=1024): 294 # see if this chunk contains a newline, in which case we have a 295 # full line to process (e.g. 
as a tweet) 296 lines = [] 297 buffer += bytearray(chunk) 298 299 if not encoding and len(buffer) > 3: 300 # response.encoding is not correct sometimes, since it does not 301 # indicate that the file uses a BOM, so sniff it instead once 302 # we have some bytes 303 encoding = sniff_encoding(buffer) 304 305 # split buffer by newlines and process each full line 306 # the last line is always carried over, since it may be incomplete 307 if b"\n" in buffer: 308 buffered_lines = buffer.split(b"\n") 309 lines = buffered_lines[:-1] 310 buffer = buffered_lines.pop() 311 elif not chunk: 312 # eof, process left-over data 313 lines = buffer.split(b"\n") 314 315 # and finally we can process the data 316 for line in lines: 317 # use a dummy csv reader to abstract away the annoying csv parsing 318 # this is quite a bit of overhead, but beats implementing csv parsing 319 # manually, and it's still reasonably fast (about 10k/second) 320 dummy_file = io.TextIOWrapper(io.BytesIO(line.replace(b"\0", b"")), encoding=encoding) 321 reader = csv.reader(dummy_file, 322 delimiter=",", 323 quotechar='"', 324 doublequote=True, 325 quoting=csv.QUOTE_MINIMAL) 326 row_data = next(reader) 327 328 if row_data and not fieldnames: 329 # first line in file 330 fieldnames = row_data.copy() 331 332 elif row_data: 333 tweet = dict(zip(fieldnames, row_data)) 334 items += 1 335 336 if items % 250 == 0: 337 self.dataset.update_status("Loaded %i tweets from bin %s@%s" % (items, bin_name, bin_host)) 338 339 yield self.tcat_to_APIv2(tweet) 340 341 if not chunk: 342 # end of file 343 break 344 345 @ staticmethod 346 def tcat_to_4cat_time(tcat_time): 347 """ 348 Twitter APIv2 time is in format "%Y-%m-%dT%H:%M:%S.000Z" while TCAT uses "%Y-%m-%d %H:%M:%S" and a timestamp. 349 350 :return datetime: 351 """ 352 try: 353 tcat_time = int(tcat_time) 354 return datetime.datetime.fromtimestamp(tcat_time).strftime("%Y-%m-%dT%H:%M:%S.000Z") 355 except ValueError: 356 return datetime.datetime.strptime(tcat_time, "%Y-%m-%d %H:%M:%S").strftime("%Y-%m-%dT%H:%M:%S.000Z") 357 358 @staticmethod 359 def tcat_to_APIv2(tcat_tweet): 360 """ 361 Attempt to construct a 4CAT tweet gathered from APIv2 to allow for use of Twitter specific processors! 362 363 A great deal of information is missing so there may result in some issues. Notes are kept for the expected 364 type and, if the data is missing in TCAT, None is used. Therefor it should be possible to refactor processors 365 to handle None if necessary. 366 """ 367 # We're missing lots of data here... 
368 369 urls = [url.strip() for url in (tcat_tweet["urls_expanded"].split(";") if tcat_tweet["urls_expanded"] else tcat_tweet["urls_followed"].split(";") if tcat_tweet["urls_followed"] else tcat_tweet["urls_followed"].split(";")) if url] 370 # TCAT media_id: 7 = video, 3 = photo, 16 = animated_gif 371 media_type = "video" if tcat_tweet["media_id"] == "7" else "photo" if tcat_tweet["media_id"] == "3" else "animated_gif" if tcat_tweet["media_id"] == "16" else tcat_tweet["media_id"] 372 373 # 4CAT Twitter APIv2 result data structure 374 APIv2_tweet = { 375 "lang": tcat_tweet["lang"], # str 376 "source": tcat_tweet["source"], # REMOVED FROM TWITTER API v2 377 "possibly_sensitive": True if tcat_tweet["possibly_sensitive"] == 1 else False if tcat_tweet["possibly_sensitive"] == 0 else None, # bool 378 "text": tcat_tweet["text"], # str 379 "edit_history_tweet_ids": None, # list; Missing in TCAT data 380 "public_metrics": { 381 "retweet_count": tcat_tweet["retweet_count"], # int 382 "reply_count": None, # int; Missing in TCAT data 383 "like_count": tcat_tweet["favorite_count"], # int 384 "quote_count": None, # int; Missing in TCAT data 385 "impression_count": None, # int; Missing in TCAT data 386 # TCAT has also favorite_count 387 }, 388 "entities": { 389 "mentions": [{ 390 "id": None, # str; Missing in TCAT data 391 "username": mention.strip(), # str 392 # Twitter v2 API has additional user fields 393 } for mention in tcat_tweet["mentions"].split(";") if mention], 394 "annotations": None, # list; Missing in TCAT data 395 "urls": [{ 396 "url": url, # str 397 "expanded_url": url, # str 398 # Twitter v2 API has additional URL fields 399 } for url in urls], 400 "hashtags": [{ 401 "tag": hashtag.strip(), # str 402 "start": None, # int; Missing in TCAT data 403 "end": None, # int; Missing in TCAT data 404 } for hashtag in tcat_tweet["hashtags"].split(";") if hashtag], 405 "cashtags": None, # list; Missing in TCAT data 406 }, 407 "created_at": SearchWithinTCATBins.tcat_to_4cat_time(tcat_tweet["time"]), # str 408 "id": tcat_tweet["id"], # str 409 "author_id": tcat_tweet["from_user_id"], # str 410 "context_annotations": None, # list; Missing in TCAT data 411 "reply_settings": None, # str; Missing in TCAT data 412 "conversation_id": None, # str; TCAT has a in_reply_to_status_id but this is not necessarily the original Tweet that started the conversation 413 "author_user": { 414 "protected": None, # bool; Missing in TCAT data 415 "verified": True if tcat_tweet["from_user_verified"] == 1 else False if tcat_tweet["from_user_verified"] == 0 else None, # bool 416 "created_at": SearchWithinTCATBins.tcat_to_4cat_time(tcat_tweet["from_user_created_at"]), # str 417 "name": tcat_tweet["from_user_realname"], # str 418 "entities": { 419 "description": None, # dict; contains entities from author description such as mentions, URLs, etc.; Missing in TCAT data 420 "url": None, # dict; containers entities from author url e.g. 
URL data; Missing in TCAT data 421 }, 422 "description": tcat_tweet["from_user_description"], # str 423 "pinned_tweet_id": None, # str; Missing in TCAT data 424 "profile_image_url": tcat_tweet["from_user_profile_image_url"], # str 425 "url": tcat_tweet["from_user_url"], # str 426 "username": tcat_tweet["from_user_name"], # str 427 "id": tcat_tweet["from_user_id"], # str 428 "location": None, # str; Missing in TCAT data 429 "public_metrics": { 430 "followers_count": tcat_tweet["from_user_followercount"], # int 431 "following_count": tcat_tweet["from_user_friendcount"], # int 432 "tweet_count": tcat_tweet["from_user_tweetcount"], # int 433 "listed_count": tcat_tweet["from_user_listed"], # int 434 # TCAT has also from_user_favourites_count 435 }, 436 "withheld": { 437 "country_codes": tcat_tweet["from_user_withheld_scope"].split(";"), # list; TODO TCAT has column, but have not seen it populated in testing... This is guess 438 }, 439 # TCAT has also from_user_lang, from_user_utcoffset, from_user_timezone 440 }, 441 "attachments": { 442 # TCAT has some media data, but not the URLs listed 443 "media_keys": [{ 444 "type": media_type, 445 "url": ",".join([url for url in urls if (url.split("/")[-2] if len(url.split("/")) > 1 else "") in ["photo"]]), # str; TCAT does not have the URL though it may be in the list of URLs 446 "variants": [{"url": ",".join([url for url in urls if (url.split("/")[-2] if len(url.split("/")) > 1 else "") in ["video"]]), "bit_rate":0}] # list; This is not the expected direct link to video, but it is a URL to the video 447 # Twitter API v2 has additional data 448 }], # list; TCAT seems to only have one type of media per tweet 449 "poll_ids": None, # list; Missing from TCAT data 450 }, 451 "geo": { 452 "place_id": None, # str; Missing from TCAT data 453 "place": { 454 "country": None, # str; Missing from TCAT data 455 "id": None, # str; Missing from TCAT data 456 "geo": { 457 458 }, 459 "country_code": None, # str; Missing from TCAT data 460 "name": tcat_tweet["location"], # str 461 "place_type": None, # str; Missing from TCAT data 462 "full_name": tcat_tweet["location"], # str 463 }, 464 "coordindates": { 465 "type": None, # str; Missing from TCAT data 466 "coordinates": [tcat_tweet["lng"], tcat_tweet["lat"]], # list i.e. [longitude, latitude] 467 }, 468 }, 469 "withheld": { 470 "copyright": True if tcat_tweet["withheld_copyright"] == 1 else False if tcat_tweet["withheld_copyright"] == 0 else None, # bool; TODO TCAT has column, but have not seen it populated in testing... This is guess 471 "country_codes": tcat_tweet["withheld_scope"].split(";"), # list; TODO TCAT has column, but have not seen it populated in testing... 
This is guess 472 }, 473 } 474 475 # Referenced Tweets; Twitter API v2 has entire tweet data here which we will be missing 476 referenced_tweets = [] 477 if tcat_tweet["text"][:4] == "RT @": 478 # Retweet 479 referenced_tweets.append({ 480 "type": "retweeted", 481 "id": None, # str; Missing in TCAT data 482 }) 483 if tcat_tweet["quoted_status_id"]: 484 # Quote 485 referenced_tweets.append({ 486 "type": "quoted", 487 "id": tcat_tweet["quoted_status_id"], # str; Missing in TCAT data 488 }) 489 if tcat_tweet["in_reply_to_status_id"]: 490 # Reply 491 referenced_tweets.append({ 492 "type": "replied_to", 493 "id": tcat_tweet["in_reply_to_status_id"], # str; Missing in TCAT data 494 }) 495 # These should NOT be None in case a processor/user attempts to identify a reply using these 496 APIv2_tweet["in_reply_to_user_id"] = "UNKNOWN" # str; Missing from TCAT data 497 APIv2_tweet["in_reply_to_user"] = {"username": "UNKNOWN"} # dict; Missing from TCAT data 498 499 APIv2_tweet["referenced_tweets"] = referenced_tweets # list 500 501 # Append any extra TCAT data 502 additional_TCAT_data = {} 503 for field in SearchWithinTCATBins.additional_TCAT_fields: 504 additional_TCAT_data["TCAT_"+field] = tcat_tweet[field] 505 APIv2_tweet.update(additional_TCAT_data) 506 507 return APIv2_tweet 508 509 @staticmethod 510 def validate_query(query, request, user): 511 """ 512 Validate DMI-TCAT query input 513 514 :param dict query: Query parameters, from client-side. 515 :param request: Flask request 516 :param User user: User object of user who has submitted the query 517 :return dict: Safe query parameters 518 """ 519 # no query 4 u 520 if not query.get("bin", "").strip(): 521 raise QueryParametersException("You must choose a query bin to get tweets from.") 522 523 # Dates need to make sense as a range to search within 524 after, before = query.get("daterange") 525 if (after and before) and before <= after: 526 raise QueryParametersException("A date range must start before it ends") 527 528 query["min_date"], query["max_date"] = query.get("daterange") 529 del query["daterange"] 530 531 # simple! 532 return query 533 534 @staticmethod 535 def map_item(item): 536 """ 537 Use Twitter APIv2 map_item 538 """ 539 mapped_tweet = SearchWithTwitterAPIv2.map_item(item) 540 541 # Add TCAT extra data 542 data = mapped_tweet.get_item_data() 543 message = mapped_tweet.get_message() 544 for field in SearchWithinTCATBins.additional_TCAT_fields: 545 data["TCAT_" + field] = item.get("TCAT_" + field) 546 547 return MappedItem(data, message)
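For reference, the instance list read via config.get("dmi-tcat-search.instances") is entered in the 4CAT settings as a JSON list of strings. A minimal sketch of a plausible value, with made-up hosts (HTTP credentials may be embedded in the URL; the code above strips them before displaying hostnames):

["http://username:password@tcat.example.net", "http://tcat-open.example.org/"]

With this value, collect_all_bins would query the /api/bin-stats.php endpoint on each of the two hosts.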
class SearchWithinTCATBins(Search):
Get Tweets via DMI-TCAT
This allows subsetting an existing query bin, similar to the 'Data Selection' panel in the DMI-TCAT analysis interface.
@classmethod
def collect_all_bins(cls, force_update=False):
Requests bin information from TCAT instances
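As a rough illustration, the request made for each configured instance boils down to the following (a minimal sketch, assuming a placeholder instance URL; the response shape matches what get_options reads later):

import datetime
import json
import requests

instance = "http://tcat.example.net"  # placeholder instance URL

try:
    # bin-stats.php returns {bin_name: {"type": ..., "tweets_approximate": ...,
    # "range": {"first_tweet": ..., "last_tweet": ...}}, ...}
    api_request = requests.get(instance + "/api/bin-stats.php", timeout=5)
    bins = json.loads(api_request.content)
    collected_at = datetime.datetime.now()  # cached; refreshed after a day unless force_update
except (requests.RequestException, json.JSONDecodeError):
    bins = {"failed": True}  # recorded so get_options can report the failure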
@classmethod
def get_options(cls, parent_dataset=None, user=None):
Get data source options
This method takes the pre-defined options, but fills the 'bin' option with bins currently available from the configured TCAT instances (see the example below).
Parameters
- DataSet parent_dataset: An object representing the dataset that the processor would be run on
- User user: Flask user the options will be displayed for, in case they are requested for display in the 4CAT web interface. This can be used to show some options only to privileged users.
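For illustration, a populated 'bin' option might look as follows (the bin name, count, and dates are made up; keys follow the "bin_name@host" format built by this method):

options["bin"] = {
    "type": UserInput.OPTION_CHOICE,
    "help": "Query bin",
    "options": {
        # key: "<bin_name>@<host>", value: human-readable summary
        "climate@tcat.example.net": "climate: 1500000 tweets from 2021-01-01 00:00:00 to 2021-06-30 23:59:59",
    },
}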
def get_items(self, query):
Use the DMI-TCAT tweet export to retrieve tweets
Parameters
- query: Query parameters, as returned by validate_query
Returns
A generator that yields tweets, each mapped to the Twitter APIv2 format via tcat_to_APIv2
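The streaming CSV parse at the heart of this method can be summarised on its own. A minimal sketch under the same assumption as the source (TCAT exports contain no embedded newlines); iter_csv_rows is a hypothetical helper, not part of 4CAT:

import csv
import io

def iter_csv_rows(response, encoding="utf-8"):
    # Stream the export in 1kB chunks, carry the trailing partial line over
    # between chunks, and parse each complete line as one CSV row.
    def parse(line):
        wrapped = io.TextIOWrapper(io.BytesIO(bytes(line).replace(b"\0", b"")), encoding=encoding)
        return next(csv.reader(wrapped), None)

    buffer = bytearray()
    for chunk in response.iter_content(chunk_size=1024):
        buffer += bytearray(chunk)
        while b"\n" in buffer:
            line, _, buffer = buffer.partition(b"\n")
            row = parse(line)
            if row:
                yield row
    if buffer:
        # no trailing newline after the final chunk; parse what is left
        row = parse(buffer)
        if row:
            yield row

Pairing each yielded row with the header row via dict(zip(fieldnames, row)), as the method does, produces one tweet dict per line.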
@staticmethod
def tcat_to_4cat_time(tcat_time):
Twitter APIv2 time is in the format "%Y-%m-%dT%H:%M:%S.000Z", while TCAT uses either "%Y-%m-%d %H:%M:%S" or a Unix timestamp.
Returns
The time formatted as a Twitter APIv2 time string
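For example (the second result assumes a system clock set to UTC, since datetime.fromtimestamp converts to local time):

SearchWithinTCATBins.tcat_to_4cat_time("2022-01-01 12:00:00")
# -> "2022-01-01T12:00:00.000Z"
SearchWithinTCATBins.tcat_to_4cat_time("1641038400")
# -> "2022-01-01T12:00:00.000Z" on a machine running in UTC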
@staticmethod
def tcat_to_APIv2(tcat_tweet):
Attempt to construct a 4CAT tweet as gathered from APIv2, to allow the use of Twitter-specific processors!
A great deal of information is missing, so this may cause some issues. Notes are kept for the expected type and, if the data is missing in TCAT, None is used. It should therefore be possible to refactor processors to handle None if necessary.
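Because of those None placeholders, downstream code should guard before aggregating. A brief sketch (tcat_row stands for a hypothetical dict parsed from one TCAT CSV line):

tweet = SearchWithinTCATBins.tcat_to_APIv2(tcat_row)

# reply_count is always None here, since TCAT does not record it
reply_count = tweet["public_metrics"]["reply_count"]
replies = reply_count if reply_count is not None else 0

# retweet_count is carried over from TCAT (as a string, since it comes from CSV)
retweets = int(tweet["public_metrics"]["retweet_count"] or 0)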
@staticmethod
def validate_query(query, request, user):
Validate DMI-TCAT query input
Parameters
- dict query: Query parameters, from client-side.
- request: Flask request
- User user: User object of user who has submitted the query
Returns
Safe query parameters
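A sketch of the transformation this applies (the values are made up; request and user are unused by this particular implementation, so None is passed for brevity):

query = {
    "bin": "climate@tcat.example.net",      # "<bin_name>@<host>"
    "daterange": (1609459200, 1612137600),  # (min, max) Unix timestamps
}
safe = SearchWithinTCATBins.validate_query(query, None, None)
# safe["min_date"] == 1609459200, safe["max_date"] == 1612137600, and
# "daterange" is gone; an empty "bin" would instead raise QueryParametersException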
@staticmethod
def map_item(item):
Use Twitter APIv2 map_item
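In effect, the mapped item is the standard Twitter APIv2 mapping plus one flat TCAT_-prefixed column per extra field. A small sketch (item stands for one hypothetical ndjson record produced by tcat_to_APIv2):

mapped = SearchWithinTCATBins.map_item(item)
row = mapped.get_item_data()
row["TCAT_favorite_count"]  # TCAT-only favourite count, carried over as-is
row["TCAT_from_user_lang"]  # likewise for the other additional_TCAT_fields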
Inherited Members
- backend.lib.worker.BasicWorker
  - BasicWorker
  - INTERRUPT_NONE
  - INTERRUPT_RETRY
  - INTERRUPT_CANCEL
  - queue
  - log
  - manager
  - interrupted
  - modules
  - init_time
  - name
  - run
  - clean_up
  - request_interrupt
  - is_4cat_class
- backend.lib.search.Search
  - max_workers
  - prefix
  - return_cols
  - import_error_count
  - import_warning_count
  - process
  - search
  - import_from_file
  - items_to_csv
  - items_to_ndjson
  - items_to_archive
- backend.lib.processor.BasicProcessor
  - db
  - job
  - dataset
  - owner
  - source_dataset
  - source_file
  - description
  - category
  - is_running_in_preset
  - filepath
  - work
  - after_process
  - remove_files
  - abort
  - add_field_to_parent
  - iterate_archive_contents
  - unpack_archive_contents
  - extract_archived_file_by_name
  - write_csv_items_and_finish
  - write_archive_and_finish
  - create_standalone
  - map_item_method_available
  - get_mapped_item
  - is_filter
  - get_status
  - is_top_dataset
  - is_from_collector
  - get_extension
  - is_rankable
  - exclude_followup_processors
  - is_4cat_processor