datasources.dmi-tcatv2.search_tcat_v2
Twitter search within a DMI-TCAT bin v2: direct database connection
1""" 2Twitter search within a DMI-TCAT bin v2 3Direct database connection 4""" 5import datetime 6import re 7import ural 8import pymysql 9 10 11from backend.lib.search import Search 12from common.lib.exceptions import QueryParametersException 13from common.lib.user_input import UserInput 14from backend.lib.database_mysql import MySQLDatabase 15 16 17class SearchWithinTCATBinsV2(Search): 18 """ 19 Get Tweets via DMI-TCAT 20 21 This allows subsetting an existing query bin, similar to the 'Data 22 Selection' panel in the DMI-TCAT analysis interface 23 """ 24 type = "dmi-tcatv2-search" # job ID 25 extension = "csv" 26 title = "TCAT Search (SQL)" 27 28 options = { 29 "intro-1": { 30 "type": UserInput.OPTION_INFO, 31 "help": "This data source interfaces with a DMI-TCAT instance to allow subsetting of tweets from a tweet " 32 "bin in that instance." 33 }, 34 "divider-1": { 35 "type": UserInput.OPTION_DIVIDER 36 }, 37 "bin": { 38 "type": UserInput.OPTION_INFO, 39 "help": "Query bin" 40 }, 41 "query_type": { 42 "type": UserInput.OPTION_CHOICE, 43 "help": "Basic or Advanced Query", 44 "options": { 45 "basic": "Basic query all bin tweets for specific text (Query) and date (Date range)", 46 "advanced": "Select queries on any TCAT twitter tables" 47 }, 48 "default": "basic", 49 "tooltip": "Advanced queries do not provide scaffolding, so understanding TCAT database structure is necessary" 50 }, 51 "query": { 52 "type": UserInput.OPTION_TEXT, 53 "help": "Query text", 54 "tooltip": "Match all tweets containing this text." 55 }, 56 "daterange": { 57 "type": UserInput.OPTION_DATERANGE, 58 "help": "Date range" 59 } 60 } 61 62 config = { 63 "dmi-tcatv2-search.database_instances": { 64 "type": UserInput.OPTION_TEXT_JSON, 65 "help": "DMI-TCAT instances", 66 "tooltip": "List of DMI-TCAT instance metadata, e.g. [{'tcat_name': 'tcat2','db_name': 'twittercapture'," 67 "'db_user': 'username','db_password': 'password','db_host': '127.0.0.1','db_port': 3306}] All of " 68 "these values need to be provided for each instance. This needs to be formatted as a JSON list of " 69 "objects.", 70 "default": {} 71 } 72 } 73 74 @classmethod 75 def get_options(cls, parent_dataset=None, config=None): 76 """ 77 Get data source options 78 79 This method takes the pre-defined options, but fills the 'bins' options 80 with bins currently available from the configured TCAT instances. 81 82 :param config: 83 :param DataSet parent_dataset: An object representing the dataset that 84 the processor would be run on 85can 86 be used to show some options only to privileges users. 
87 """ 88 options = cls.options 89 90 # Collect Metadata from TCAT instances 91 all_bins = cls.collect_tcat_metadata(config) 92 93 options["bin"] = { 94 "type": UserInput.OPTION_CHOICE, 95 "options": {}, 96 "help": "Query bin" 97 } 98 99 for instance_name, bins in all_bins.items(): 100 for bin_name, bin in bins.items(): 101 bin_key = "%s@%s" % (bin_name, instance_name) 102 display_text = f"{bin_name}: {bin.get('tweet_count')} tweets from {bin.get('first_tweet_datetime').strftime('%Y-%m-%d %H:%M:%S')} to {bin.get('last_tweet_datetime').strftime('%Y-%m-%d %H:%M:%S')}" 103 options["bin"]["options"][bin_key] = display_text 104 105 return options 106 107 def get_items(self, query): 108 """ 109 Use the DMI-TCAT tweet export to retrieve tweets 110 111 :param query: 112 :yield dict: mapped_tweet for any "basic" query else for "advanced" queries a dictionary with mysql result 113 """ 114 bin = self.parameters.get("bin") 115 bin_name = bin.split("@")[0] 116 tcat_name = bin.split("@").pop() 117 118 available_instances = self.config.get("dmi-tcatv2-search.database_instances", []) 119 instance = [instance for instance in available_instances if instance.get('tcat_name') == tcat_name][0] 120 121 db = MySQLDatabase(logger=self.log, 122 dbname=instance.get('db_name'), 123 user=instance.get('db_user'), 124 password=instance.get('db_password'), 125 host=instance.get('db_host'), 126 port=instance.get('db_port')) 127 128 self.dataset.update_status("Searching for tweets on %s" % bin_name) 129 if self.parameters.get("query_type") == 'advanced': 130 # Advanced query should be simple from our perspective... 131 self.dataset.log('Query: %s' % self.parameters.get("query")) 132 unbuffered_cursor = db.connection.cursor(pymysql.cursors.SSCursor) 133 try: 134 unbuffered_cursor.execute(self.parameters.get("query")) 135 except pymysql.err.ProgrammingError as e: 136 self.dataset.update_status("SQL query error: %s" % str(e), is_final=True) 137 return 138 # self.dataset.update_status("Retrieving %i results" % int(num_results)) # num_results is CLEARLY not what I thought 139 # Get column names from cursor 140 column_names = [description[0] for description in unbuffered_cursor.description] 141 for result in unbuffered_cursor.fetchall_unbuffered(): 142 # Reformat result (which is a tuple with each column in the row) to dict 143 new_result = {k: v for k, v in zip(column_names, result)} 144 # 4CAT necessary fieldnames 145 new_result['id'] = new_result.get('id', '') 146 new_result['thread_id'] = new_result.get("in_reply_to_status_id") if new_result.get( 147 "in_reply_to_status_id") else new_result.get("quoted_status_id") if new_result.get( 148 "quoted_status_id") else new_result.get("id") 149 new_result['body'] = new_result.get('text', '') 150 new_result['timestamp'] = new_result.get('created_at', None) 151 new_result['subject'] = '' 152 new_result['author'] = new_result.get('from_user_name', '') 153 yield new_result 154 155 else: 156 # "Basic" query 157 text_query = self.parameters.get("query") 158 159 where = [] 160 replacements = [] 161 # Find AND and OR 162 placeholder = 0 163 start_of_match = 0 164 while start_of_match >= 0: 165 match = None 166 and_match = text_query[placeholder:].find(' AND ') 167 or_match = text_query[placeholder:].find(' OR ') 168 if and_match != -1 and or_match != -1: 169 # both found 170 if and_match < or_match: 171 # and match first 172 match = 'AND ' 173 start_of_match = and_match 174 else: 175 # or match first 176 match ='OR ' 177 start_of_match = or_match 178 elif and_match != -1: 179 # and match 
only 180 match ='AND ' 181 start_of_match = and_match 182 elif or_match != -1: 183 # or match only 184 match = 'OR ' 185 start_of_match = or_match 186 else: 187 # neither 188 match = None 189 start_of_match = -1 190 # Add partial query to where and replacements 191 if match: 192 where.append('lower(text) LIKE %s ' + match) 193 replacements.append('%'+text_query[placeholder:placeholder+start_of_match].lower().strip()+'%') 194 # new start 195 placeholder = placeholder + start_of_match + len(match) 196 else: 197 where.append('lower(text) LIKE %s') 198 replacements.append('%'+text_query[placeholder:].lower().strip()+'%') 199 200 if query.get("min_date", None): 201 try: 202 if int(query.get("min_date")) > 0: 203 where.append("AND created_at >= %s") 204 replacements.append(datetime.datetime.fromtimestamp(int(query.get("min_date")))) 205 except ValueError: 206 pass 207 208 if query.get("max_date", None): 209 try: 210 if int(query.get("max_date")) > 0: 211 where.append("AND created_at < %s") 212 replacements.append(datetime.datetime.fromtimestamp(int(query.get("max_date")))) 213 except ValueError: 214 pass 215 216 where = " ".join(where) 217 query = 'SELECT * FROM ' + bin_name + '_tweets WHERE ' + where 218 self.dataset.log('Query: %s' % query) 219 self.dataset.log('Replacements: %s' % ', '.join([str(i) for i in replacements])) 220 unbuffered_cursor = db.connection.cursor(pymysql.cursors.SSCursor) 221 unbuffered_cursor.execute(query, replacements) 222 # self.dataset.update_status("Retrieving %i results" % int(num_results)) # num_results is CLEARLY not what I thought 223 column_names = [description[0] for description in unbuffered_cursor.description] 224 for result in unbuffered_cursor.fetchall_unbuffered(): 225 new_result = {k: v for k, v in zip(column_names, result)} 226 # Map tweet to 4CAT fields 227 new_result = self.tweet_mapping(new_result) 228 yield new_result 229 230 @staticmethod 231 def tweet_mapping(tweet): 232 """ 233 Takes TCAT output from specific tables and maps them to 4CAT expected fields. The expected fields attempt to 234 mirror that mapped_tweet from twitterv2 datasource. 235 236 :param dict tweet: TCAT dict returned from query; expected to be from bin tweets table 237 :return dict: 238 """ 239 mapped_tweet = {'id': tweet.get('id', ''), 240 # For thread_id, we use in_reply_to_status_id if tweet is reply, retweet_id if tweet is 241 # retweet, or its own ID 242 # Note: tweets can have BOTH in_reply_to_status_id and retweet_id as you can retweet a reply 243 # or reply to retweet. 
244 # THIS IS DIFFERENT from Twitter APIv2 as there does not appear to be a quote ID (for retweets 245 # with added text) 246 'thread_id': tweet.get("in_reply_to_status_id") if tweet.get( 247 "in_reply_to_status_id") else tweet.get("retweet_id") if tweet.get( 248 "retweet_id") else tweet.get("id"), 249 'body': tweet.get('text', ''), 250 # 'created_at': tweet.get('created_at'), 251 'timestamp': int(datetime.datetime.timestamp(tweet.get('created_at'))) if type( 252 tweet.get('created_at')) is datetime.datetime else None, 253 'subject': '', 254 'author': tweet.get('from_user_name', ''), 255 "author_fullname": tweet["from_user_realname"], 256 "author_id": tweet["from_user_id"], 257 "source": tweet["source"], 258 "language_guess": tweet.get("lang"), 259 260 "retweet_count": tweet["retweet_count"], 261 "like_count": tweet["favorite_count"], 262 "is_retweet": "yes" if tweet.get('retweet_id', False) else "no", 263 "is_reply": "yes" if tweet["in_reply_to_status_id"] else "no", 264 "in_reply_to_status_id": tweet["in_reply_to_status_id"] if tweet["in_reply_to_status_id"] else None, 265 "reply_to": tweet["to_user_name"], 266 "reply_to_id": tweet.get('to_user_id') if tweet.get('to_user_id') else None, 267 268 # 4CAT specifics 269 "hashtags": ",".join(re.findall(r"#([^\s!@#$%^&*()_+{}:\"|<>?\[\];'\,./`~]+)", tweet["text"])), 270 "urls": ",".join(ural.urls_from_text(tweet["text"])), 271 "images": ",".join(re.findall(r"https://t\.co/[a-zA-Z0-9]+$", tweet["text"])), 272 "mentions": ",".join(re.findall(r"@([^\s!@#$%^&*()+{}:\"|<>?\[\];'\,./`~]+)", tweet["text"])), 273 274 # Additional TCAT data (compared to twitterv2 map_item function) 275 "filter_level": tweet['filter_level'], 276 'location': tweet['location'], 277 'latitude': tweet['geo_lat'] if tweet['geo_lat'] else None, 278 'longitude': tweet['geo_lng'] if tweet['geo_lng'] else None, 279 'author_verified': tweet['from_user_verified'] if tweet['from_user_verified'] else None, 280 'author_description': tweet['from_user_description'], 281 'author_url': tweet['from_user_url'], 282 'author_profile_image': tweet['from_user_profile_image_url'], 283 'author_timezone_UTC_offset': int((int(tweet['from_user_utcoffset']) if 284 tweet['from_user_utcoffset'] else 0)/60/60), 285 'author_timezone_name': tweet['from_user_timezone'], 286 'author_language': tweet['from_user_lang'], 287 'author_tweet_count': tweet['from_user_tweetcount'], 288 'author_follower_count': tweet['from_user_followercount'], 289 'author_friend_following_count': tweet['from_user_friendcount'], 290 'author_favorite_count': tweet.get('from_user_favourites_count'), # NOT in tweets table? 291 'author_listed_count': tweet['from_user_listed'] if tweet['from_user_listed'] else None, 292 'author_withheld_scope': tweet.get('from_user_withheld_scope'), # NOT in tweets table? 293 'author_created_at': tweet.get('from_user_created_at'), # NOT in tweets table? 
294 295 # TODO find in other TCAT tables or does not exist 296 # "possibly_sensitive": "yes" if tweet.get("possibly_sensitive") not in ("", "0") else "no", 297 # "is_quote_tweet": "yes" if tweet["quoted_status_id"] else "no", 298 # 'withheld_copyright': tweet['withheld_copyright'], # TCAT may no collect this anymore 299 # 'withheld_scope': tweet['withheld_scope'], # TCAT may no collect this anymore 300 # 'truncated': tweet['truncated'], # Older tweets could be truncated meaning their text was cut off due to Twitter/TCAT db character limits 301 302 } 303 304 # Ensure that any keys not specifically mapped to another field are added to the new mapped_tweet 305 mapped_keys = ['id', 'text', 'created_at', 'from_user_name', 'from_user_realname', 'from_user_id', 306 'from_user_lang', 'from_user_tweetcount', 'from_user_followercount', 'from_user_friendcount', 307 'from_user_listed', 'from_user_utcoffset', 'from_user_timezone', 'from_user_description', 308 'from_user_url', 'from_user_verified', 'from_user_profile_image_url', 'source', 'lang', 309 'filter_level', 'location', 'to_user_name', 'to_user_id', 'geo_lat', 'geo_lng', 'retweet_count', 310 'in_reply_to_status_id'] 311 for key in tweet.keys(): 312 if key not in mapped_keys: 313 index = '' 314 while key + index in mapped_tweet.keys(): 315 index += '_1' 316 mapped_tweet[key + index] = tweet.get(key) 317 318 return mapped_tweet 319 320 321 @classmethod 322 def collect_tcat_metadata(cls, config): 323 """ 324 Collect specific metadata from TCAT instances listed in the configuration and return a dictionary containing 325 this data. To be used to infor the user of available TCAT bins and create the options from which a user will 326 select. 327 328 :return dict: All of the available bins from accessible TCAT instances 329 """ 330 331 # todo: cache this somehow! and check for the cache 332 instances = config.get("dmi-tcatv2.instances", []) 333 334 all_bins = {} 335 for instance in instances: 336 # Query each instance for bins 337 db = MySQLDatabase(logger=None, 338 dbname=instance.get('db_name'), 339 user=instance.get('db_user'), 340 password=instance.get('db_password'), 341 host=instance.get('db_host'), 342 port=instance.get('db_port')) 343 # try: 344 instance_bins = db.fetchall('SELECT id, querybin, type from tcat_query_bins') 345 # except: 346 347 instance_bins_metadata = {} 348 for instance_bin in instance_bins: 349 bin_data = { 350 'instance': instance, 351 'querybin': instance_bin['querybin'], 352 } 353 354 # Query for number of tweets 355 tweet_count = db.fetchone('SELECT COUNT(id) from ' + instance_bin['querybin'] + '_tweets')[ 356 'COUNT(id)'] 357 bin_data['tweet_count'] = tweet_count 358 359 # Collect first and last tweet datetimes 360 first_tweet_datetime = db.fetchone('SELECT created_at from ' + instance_bin['querybin'] + '_tweets ORDER BY created_at ASC LIMIT 1')['created_at'] 361 last_tweet_datetime = db.fetchone('SELECT created_at from ' + instance_bin['querybin'] + '_tweets ORDER BY created_at DESC LIMIT 1')['created_at'] 362 bin_data['first_tweet_datetime'] = first_tweet_datetime 363 bin_data['last_tweet_datetime'] = last_tweet_datetime 364 365 # Could check if bin currently should be collecting 366 # db.fetchall('SELECT EXISTS ( SELECT endtime from tcat_query_bins_periods WHERE querybin_id = ' + str(instance_bin['id']) + ' and endtime = "0000-00-00 00:00:00" ) as active') 367 368 # Could collect phrases or usernames... 
369 # if instance_bin['type'] in []: 370 # elif instance_bin['type'] in []: 371 372 # Could collect all periods for nuanced metadata... 373 #periods = db.fetchall('SELECT starttime, endtime from tcat_query_bins_periods WHERE query_bin_id = ' + instance_bin['id']) 374 375 # Add bin_data to instance collection 376 instance_bins_metadata[instance_bin['querybin']] = bin_data 377 # Add bins to instance 378 all_bins[instance['tcat_name']] = instance_bins_metadata 379 380 return all_bins 381 382 @staticmethod 383 def validate_query(query, request, config): 384 """ 385 Validate DMI-TCAT query input 386 387 :param dict query: Query parameters, from client-side. 388 :param request: Flask request 389 :param ConfigManager|None config: Configuration reader (context-aware) 390 :return dict: Safe query parameters 391 """ 392 # no query 4 u 393 if not query.get("bin", ""): 394 raise QueryParametersException("You must choose a query bin to get tweets from.") 395 396 # the dates need to make sense as a range to search within 397 # and a date range is needed, to not make it too easy to just get all tweets 398 after, before = query.get("daterange") 399 if (after and before) and not before <= after: 400 raise QueryParametersException("A date range must start before it ends") 401 402 query["min_date"], query["max_date"] = query.get("daterange") 403 query["bin"] = query.get("bin", "").strip() 404 del query["daterange"] 405 406 # simple! 407 return query
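For reference, the value of the dmi-tcatv2-search.database_instances setting might look like the following sketch; the instance names and credentials are hypothetical, but the keys are exactly the ones get_items() and collect_tcat_metadata() read:

# Hypothetical example of the JSON list expected by
# "dmi-tcatv2-search.database_instances"; all keys below are required.
database_instances = [
    {
        "tcat_name": "tcat2",         # label used in the "bin@instance" option keys
        "db_name": "twittercapture",  # MySQL database of the TCAT instance
        "db_user": "username",
        "db_password": "password",
        "db_host": "127.0.0.1",
        "db_port": 3306
    }
]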
Get Tweets via DMI-TCAT
This allows subsetting an existing query bin, similar to the 'Data Selection' panel in the DMI-TCAT analysis interface.
Get data source options
This method takes the pre-defined options, but fills the 'bin' option
with bins currently available from the configured TCAT instances.
Parameters
- config: Configuration reader (context-aware)
- DataSet parent_dataset: An object representing the dataset that the processor would be run on; can be used to show some options only to privileged users.
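As a sketch of the result: for a configured instance named 'tcat2' containing a bin 'climate', the filled-in bin option would look roughly like this (the bin name, tweet count, and dates are invented):

# Hypothetical result of get_options() for one instance with one bin;
# keys follow the "bin_name@instance_name" format built in the loop above.
options["bin"] = {
    "type": UserInput.OPTION_CHOICE,
    "options": {
        "climate@tcat2": "climate: 120000 tweets from 2021-01-01 00:00:00 to 2021-06-30 23:59:59"
    },
    "help": "Query bin"
}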
Use a direct connection to the DMI-TCAT database to retrieve tweets
Parameters
- query: Query parameters

Yields
- dict: mapped_tweet for any "basic" query; for "advanced" queries, a dictionary with the MySQL result
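The "basic" branch splits the query text on ' AND ' and ' OR ' and turns each fragment into a parameterized LIKE clause. A minimal standalone mirror of that loop (split_basic_query is our illustrative helper, not part of the class) shows what it produces; the doubled spaces in the joined WHERE clause are an artifact of the original concatenation and are harmless in SQL:

# Standalone sketch of the basic-query parsing in get_items()
def split_basic_query(text_query):
    where, replacements = [], []
    placeholder = 0
    while True:
        and_match = text_query[placeholder:].find(' AND ')
        or_match = text_query[placeholder:].find(' OR ')
        # pick whichever operator occurs first, if any
        if and_match != -1 and (or_match == -1 or and_match < or_match):
            match, start = 'AND ', and_match
        elif or_match != -1:
            match, start = 'OR ', or_match
        else:
            # no more operators: consume the rest and stop
            where.append('lower(text) LIKE %s')
            replacements.append('%' + text_query[placeholder:].lower().strip() + '%')
            break
        where.append('lower(text) LIKE %s ' + match)
        replacements.append('%' + text_query[placeholder:placeholder + start].lower().strip() + '%')
        placeholder = placeholder + start + len(match)
    return " ".join(where), replacements

print(split_basic_query("cats AND dogs OR birds"))
# ('lower(text) LIKE %s AND  lower(text) LIKE %s OR  lower(text) LIKE %s',
#  ['%cats%', '%dogs%', '%birds%'])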
Takes TCAT output from specific tables and maps it to the fields 4CAT expects. The expected fields attempt to mirror the mapped_tweet from the twitterv2 datasource.
Parameters
- dict tweet: TCAT dict returned from a query; expected to be from a bin's tweets table
Returns
The tweet mapped to 4CAT fields
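The thread_id fallback order (reply target, then retweeted tweet, then the tweet's own ID) can be illustrated in isolation; resolve_thread_id is our hypothetical helper, and the `or` chain is equivalent to the nested conditional in the source since it also tests truthiness:

# Illustration of the thread_id fallback in tweet_mapping() (invented values)
def resolve_thread_id(tweet):
    return tweet.get("in_reply_to_status_id") or tweet.get("retweet_id") or tweet.get("id")

print(resolve_thread_id({"id": "3", "in_reply_to_status_id": "1", "retweet_id": "2"}))  # "1"
print(resolve_thread_id({"id": "3", "in_reply_to_status_id": None, "retweet_id": "2"}))  # "2"
print(resolve_thread_id({"id": "3", "in_reply_to_status_id": None, "retweet_id": None}))  # "3"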
Collect specific metadata from the TCAT instances listed in the configuration and return a dictionary containing this data. Used to inform the user of available TCAT bins and to create the options from which a user will select.
Returns
All of the available bins from accessible TCAT instances
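The returned structure nests bins under instance names; a hypothetical return value (all counts and dates invented) would have roughly this shape:

import datetime

# Hypothetical shape of the dict returned by collect_tcat_metadata()
all_bins = {
    "tcat2": {        # keyed by the instance's tcat_name
        "climate": {  # keyed by bin name (querybin)
            "instance": {"tcat_name": "tcat2", "db_name": "twittercapture"},  # full config entry in practice
            "querybin": "climate",
            "tweet_count": 120000,
            "first_tweet_datetime": datetime.datetime(2021, 1, 1, 0, 0, 0),
            "last_tweet_datetime": datetime.datetime(2021, 6, 30, 23, 59, 59),
        }
    }
}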
Validate DMI-TCAT query input
Parameters
- dict query: Query parameters, from client-side.
- request: Flask request
- ConfigManager|None config: Configuration reader (context-aware)
Returns
Safe query parameters
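For instance, a client-side query dict is reshaped roughly as follows (timestamps invented): the daterange tuple becomes min_date/max_date, the bin value is stripped of whitespace, and the daterange key is removed.

# Hypothetical before/after for validate_query()
query_in = {"bin": " climate@tcat2 ", "query_type": "basic", "query": "cats",
            "daterange": (1609459200, 1625097599)}
# after validation:
query_out = {"bin": "climate@tcat2", "query_type": "basic", "query": "cats",
             "min_date": 1609459200, "max_date": 1625097599}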
Inherited Members
- backend.lib.worker.BasicWorker
- BasicWorker
- INTERRUPT_NONE
- INTERRUPT_RETRY
- INTERRUPT_CANCEL
- queue
- log
- manager
- interrupted
- modules
- init_time
- name
- run
- clean_up
- request_interrupt
- is_4cat_class
- backend.lib.search.Search
- max_workers
- prefix
- return_cols
- import_error_count
- import_warning_count
- process
- search
- import_from_file
- items_to_csv
- items_to_ndjson
- items_to_archive
- backend.lib.processor.BasicProcessor
- db
- job
- dataset
- owner
- source_dataset
- source_file
- description
- category
- is_running_in_preset
- filepath
- work
- after_process
- remove_files
- abort
- iterate_proxied_requests
- push_proxied_request
- flush_proxied_requests
- iterate_archive_contents
- unpack_archive_contents
- extract_archived_file_by_name
- write_csv_items_and_finish
- write_archive_and_finish
- create_standalone
- save_annotations
- map_item_method_available
- get_mapped_item
- is_filter
- get_status
- is_top_dataset
- is_from_collector
- get_extension
- is_rankable
- exclude_followup_processors
- is_4cat_processor