datasources.dmi-tcatv2.search_tcat_v2
Twitter search within a DMI-TCAT bin v2. Direct database connection.
1""" 2Twitter search within a DMI-TCAT bin v2 3Direct database connection 4""" 5import datetime 6import re 7import ural 8import pymysql 9 10 11from backend.lib.search import Search 12from common.lib.exceptions import QueryParametersException 13from common.lib.user_input import UserInput 14from backend.lib.database_mysql import MySQLDatabase 15from common.config_manager import config 16 17 18class SearchWithinTCATBinsV2(Search): 19 """ 20 Get Tweets via DMI-TCAT 21 22 This allows subsetting an existing query bin, similar to the 'Data 23 Selection' panel in the DMI-TCAT analysis interface 24 """ 25 type = "dmi-tcatv2-search" # job ID 26 extension = "csv" 27 title = "TCAT Search (SQL)" 28 29 options = { 30 "intro-1": { 31 "type": UserInput.OPTION_INFO, 32 "help": "This data source interfaces with a DMI-TCAT instance to allow subsetting of tweets from a tweet " 33 "bin in that instance." 34 }, 35 "divider-1": { 36 "type": UserInput.OPTION_DIVIDER 37 }, 38 "bin": { 39 "type": UserInput.OPTION_INFO, 40 "help": "Query bin" 41 }, 42 "query_type": { 43 "type": UserInput.OPTION_CHOICE, 44 "help": "Basic or Advanced Query", 45 "options": { 46 "basic": "Basic query all bin tweets for specific text (Query) and date (Date range)", 47 "advanced": "Select queries on any TCAT twitter tables" 48 }, 49 "default": "basic", 50 "tooltip": "Advanced queries do not provide scaffolding, so understanding TCAT database structure is necessary" 51 }, 52 "query": { 53 "type": UserInput.OPTION_TEXT, 54 "help": "Query text", 55 "tooltip": "Match all tweets containing this text." 56 }, 57 "daterange": { 58 "type": UserInput.OPTION_DATERANGE, 59 "help": "Date range" 60 } 61 } 62 63 config = { 64 "dmi-tcatv2-search.database_instances": { 65 "type": UserInput.OPTION_TEXT_JSON, 66 "help": "DMI-TCAT instances", 67 "tooltip": "List of DMI-TCAT instance metadata, e.g. [{'tcat_name': 'tcat2','db_name': 'twittercapture'," 68 "'db_user': 'username','db_password': 'password','db_host': '127.0.0.1','db_port': 3306}] All of " 69 "these values need to be provided for each instance. This needs to be formatted as a JSON list of " 70 "objects.", 71 "default": {} 72 } 73 } 74 75 @classmethod 76 def get_options(cls, parent_dataset=None, user=None): 77 """ 78 Get data source options 79 80 This method takes the pre-defined options, but fills the 'bins' options 81 with bins currently available from the configured TCAT instances. 82 83 :param DataSet parent_dataset: An object representing the dataset that 84 the processor would be run on 85 :param User user: Flask user the options will be displayed for, in 86 case they are requested for display in the 4CAT web interface. This can 87 be used to show some options only to privileges users. 
88 """ 89 options = cls.options 90 91 # Collect Metadata from TCAT instances 92 all_bins = cls.collect_tcat_metadata() 93 94 options["bin"] = { 95 "type": UserInput.OPTION_CHOICE, 96 "options": {}, 97 "help": "Query bin" 98 } 99 100 for instance_name, bins in all_bins.items(): 101 for bin_name, bin in bins.items(): 102 bin_key = "%s@%s" % (bin_name, instance_name) 103 display_text = f"{bin_name}: {bin.get('tweet_count')} tweets from {bin.get('first_tweet_datetime').strftime('%Y-%m-%d %H:%M:%S')} to {bin.get('last_tweet_datetime').strftime('%Y-%m-%d %H:%M:%S')}" 104 options["bin"]["options"][bin_key] = display_text 105 106 return options 107 108 def get_items(self, query): 109 """ 110 Use the DMI-TCAT tweet export to retrieve tweets 111 112 :param query: 113 :yield dict: mapped_tweet for any "basic" query else for "advanced" queries a dictionary with mysql result 114 """ 115 bin = self.parameters.get("bin") 116 bin_name = bin.split("@")[0] 117 tcat_name = bin.split("@").pop() 118 119 available_instances = config.get("dmi-tcatv2-search.database_instances", []) 120 instance = [instance for instance in available_instances if instance.get('tcat_name') == tcat_name][0] 121 122 db = MySQLDatabase(logger=self.log, 123 dbname=instance.get('db_name'), 124 user=instance.get('db_user'), 125 password=instance.get('db_password'), 126 host=instance.get('db_host'), 127 port=instance.get('db_port')) 128 129 self.dataset.update_status("Searching for tweets on %s" % bin_name) 130 if self.parameters.get("query_type") == 'advanced': 131 # Advanced query should be simple from our perspective... 132 self.dataset.log('Query: %s' % self.parameters.get("query")) 133 unbuffered_cursor = db.connection.cursor(pymysql.cursors.SSCursor) 134 try: 135 num_results = unbuffered_cursor.execute(self.parameters.get("query")) 136 except pymysql.err.ProgrammingError as e: 137 self.dataset.update_status("SQL query error: %s" % str(e), is_final=True) 138 return 139 # self.dataset.update_status("Retrieving %i results" % int(num_results)) # num_results is CLEARLY not what I thought 140 # Get column names from cursor 141 column_names = [description[0] for description in unbuffered_cursor.description] 142 for result in unbuffered_cursor.fetchall_unbuffered(): 143 # Reformat result (which is a tuple with each column in the row) to dict 144 new_result = {k: v for k, v in zip(column_names, result)} 145 # 4CAT necessary fieldnames 146 new_result['id'] = new_result.get('id', '') 147 new_result['thread_id'] = new_result.get("in_reply_to_status_id") if new_result.get( 148 "in_reply_to_status_id") else new_result.get("quoted_status_id") if new_result.get( 149 "quoted_status_id") else new_result.get("id") 150 new_result['body'] = new_result.get('text', '') 151 new_result['timestamp'] = new_result.get('created_at', None) 152 new_result['subject'] = '' 153 new_result['author'] = new_result.get('from_user_name', '') 154 yield new_result 155 156 else: 157 # "Basic" query 158 text_query = self.parameters.get("query") 159 160 where = [] 161 replacements = [] 162 # Find AND and OR 163 placeholder = 0 164 start_of_match = 0 165 while start_of_match >= 0: 166 match = None 167 and_match = text_query[placeholder:].find(' AND ') 168 or_match = text_query[placeholder:].find(' OR ') 169 if and_match != -1 and or_match != -1: 170 # both found 171 if and_match < or_match: 172 # and match first 173 match = 'AND ' 174 start_of_match = and_match 175 else: 176 # or match first 177 match ='OR ' 178 start_of_match = or_match 179 elif and_match != -1: 180 # and 
match only 181 match ='AND ' 182 start_of_match = and_match 183 elif or_match != -1: 184 # or match only 185 match = 'OR ' 186 start_of_match = or_match 187 else: 188 # neither 189 match = None 190 start_of_match = -1 191 # Add partial query to where and replacements 192 if match: 193 where.append('lower(text) LIKE %s ' + match) 194 replacements.append('%'+text_query[placeholder:placeholder+start_of_match].lower().strip()+'%') 195 # new start 196 placeholder = placeholder + start_of_match + len(match) 197 else: 198 where.append('lower(text) LIKE %s') 199 replacements.append('%'+text_query[placeholder:].lower().strip()+'%') 200 201 if query.get("min_date", None): 202 try: 203 if int(query.get("min_date")) > 0: 204 where.append("AND created_at >= %s") 205 replacements.append(datetime.datetime.fromtimestamp(int(query.get("min_date")))) 206 except ValueError: 207 pass 208 209 if query.get("max_date", None): 210 try: 211 if int(query.get("max_date")) > 0: 212 where.append("AND created_at < %s") 213 replacements.append(datetime.datetime.fromtimestamp(int(query.get("max_date")))) 214 except ValueError: 215 pass 216 217 where = " ".join(where) 218 query = 'SELECT * FROM ' + bin_name + '_tweets WHERE ' + where 219 self.dataset.log('Query: %s' % query) 220 self.dataset.log('Replacements: %s' % ', '.join([str(i) for i in replacements])) 221 unbuffered_cursor = db.connection.cursor(pymysql.cursors.SSCursor) 222 num_results = unbuffered_cursor.execute(query, replacements) 223 # self.dataset.update_status("Retrieving %i results" % int(num_results)) # num_results is CLEARLY not what I thought 224 column_names = [description[0] for description in unbuffered_cursor.description] 225 for result in unbuffered_cursor.fetchall_unbuffered(): 226 new_result = {k: v for k, v in zip(column_names, result)} 227 # Map tweet to 4CAT fields 228 new_result = self.tweet_mapping(new_result) 229 yield new_result 230 231 @staticmethod 232 def tweet_mapping(tweet): 233 """ 234 Takes TCAT output from specific tables and maps them to 4CAT expected fields. The expected fields attempt to 235 mirror that mapped_tweet from twitterv2 datasource. 236 237 :param dict tweet: TCAT dict returned from query; expected to be from bin tweets table 238 :return dict: 239 """ 240 mapped_tweet = {'id': tweet.get('id', ''), 241 # For thread_id, we use in_reply_to_status_id if tweet is reply, retweet_id if tweet is 242 # retweet, or its own ID 243 # Note: tweets can have BOTH in_reply_to_status_id and retweet_id as you can retweet a reply 244 # or reply to retweet. 
245 # THIS IS DIFFERENT from Twitter APIv2 as there does not appear to be a quote ID (for retweets 246 # with added text) 247 'thread_id': tweet.get("in_reply_to_status_id") if tweet.get( 248 "in_reply_to_status_id") else tweet.get("retweet_id") if tweet.get( 249 "retweet_id") else tweet.get("id"), 250 'body': tweet.get('text', ''), 251 # 'created_at': tweet.get('created_at'), 252 'timestamp': int(datetime.datetime.timestamp(tweet.get('created_at'))) if type( 253 tweet.get('created_at')) == datetime.datetime else None, 254 'subject': '', 255 'author': tweet.get('from_user_name', ''), 256 "author_fullname": tweet["from_user_realname"], 257 "author_id": tweet["from_user_id"], 258 "source": tweet["source"], 259 "language_guess": tweet.get("lang"), 260 261 "retweet_count": tweet["retweet_count"], 262 "like_count": tweet["favorite_count"], 263 "is_retweet": "yes" if tweet.get('retweet_id', False) else "no", 264 "is_reply": "yes" if tweet["in_reply_to_status_id"] else "no", 265 "in_reply_to_status_id": tweet["in_reply_to_status_id"] if tweet["in_reply_to_status_id"] else None, 266 "reply_to": tweet["to_user_name"], 267 "reply_to_id": tweet.get('to_user_id') if tweet.get('to_user_id') else None, 268 269 # 4CAT specifics 270 "hashtags": ",".join(re.findall(r"#([^\s!@#$%^&*()_+{}:\"|<>?\[\];'\,./`~]+)", tweet["text"])), 271 "urls": ",".join(ural.urls_from_text(tweet["text"])), 272 "images": ",".join(re.findall(r"https://t\.co/[a-zA-Z0-9]+$", tweet["text"])), 273 "mentions": ",".join(re.findall(r"@([^\s!@#$%^&*()+{}:\"|<>?\[\];'\,./`~]+)", tweet["text"])), 274 275 # Additional TCAT data (compared to twitterv2 map_item function) 276 "filter_level": tweet['filter_level'], 277 'location': tweet['location'], 278 'latitude': tweet['geo_lat'] if tweet['geo_lat'] else None, 279 'longitude': tweet['geo_lng'] if tweet['geo_lng'] else None, 280 'author_verified': tweet['from_user_verified'] if tweet['from_user_verified'] else None, 281 'author_description': tweet['from_user_description'], 282 'author_url': tweet['from_user_url'], 283 'author_profile_image': tweet['from_user_profile_image_url'], 284 'author_timezone_UTC_offset': int((int(tweet['from_user_utcoffset']) if 285 tweet['from_user_utcoffset'] else 0)/60/60), 286 'author_timezone_name': tweet['from_user_timezone'], 287 'author_language': tweet['from_user_lang'], 288 'author_tweet_count': tweet['from_user_tweetcount'], 289 'author_follower_count': tweet['from_user_followercount'], 290 'author_friend_following_count': tweet['from_user_friendcount'], 291 'author_favorite_count': tweet.get('from_user_favourites_count'), # NOT in tweets table? 292 'author_listed_count': tweet['from_user_listed'] if tweet['from_user_listed'] else None, 293 'author_withheld_scope': tweet.get('from_user_withheld_scope'), # NOT in tweets table? 294 'author_created_at': tweet.get('from_user_created_at'), # NOT in tweets table? 
295 296 # TODO find in other TCAT tables or does not exist 297 # "possibly_sensitive": "yes" if tweet.get("possibly_sensitive") not in ("", "0") else "no", 298 # "is_quote_tweet": "yes" if tweet["quoted_status_id"] else "no", 299 # 'withheld_copyright': tweet['withheld_copyright'], # TCAT may no collect this anymore 300 # 'withheld_scope': tweet['withheld_scope'], # TCAT may no collect this anymore 301 # 'truncated': tweet['truncated'], # Older tweets could be truncated meaning their text was cut off due to Twitter/TCAT db character limits 302 303 } 304 305 # Ensure that any keys not specifically mapped to another field are added to the new mapped_tweet 306 mapped_keys = ['id', 'text', 'created_at', 'from_user_name', 'from_user_realname', 'from_user_id', 307 'from_user_lang', 'from_user_tweetcount', 'from_user_followercount', 'from_user_friendcount', 308 'from_user_listed', 'from_user_utcoffset', 'from_user_timezone', 'from_user_description', 309 'from_user_url', 'from_user_verified', 'from_user_profile_image_url', 'source', 'lang', 310 'filter_level', 'location', 'to_user_name', 'to_user_id', 'geo_lat', 'geo_lng', 'retweet_count', 311 'in_reply_to_status_id'] 312 for key in tweet.keys(): 313 if key not in mapped_keys: 314 index = '' 315 while key + index in mapped_tweet.keys(): 316 index += '_1' 317 mapped_tweet[key + index] = tweet.get(key) 318 319 return mapped_tweet 320 321 322 @classmethod 323 def collect_tcat_metadata(cls): 324 """ 325 Collect specific metadata from TCAT instances listed in the configuration and return a dictionary containing 326 this data. To be used to infor the user of available TCAT bins and create the options from which a user will 327 select. 328 329 :return dict: All of the available bins from accessible TCAT instances 330 """ 331 332 # todo: cache this somehow! and check for the cache 333 instances = config.get("dmi-tcatv2.instances", []) 334 335 all_bins = {} 336 for instance in instances: 337 # Query each instance for bins 338 db = MySQLDatabase(logger=None, 339 dbname=instance.get('db_name'), 340 user=instance.get('db_user'), 341 password=instance.get('db_password'), 342 host=instance.get('db_host'), 343 port=instance.get('db_port')) 344 # try: 345 instance_bins = db.fetchall('SELECT id, querybin, type from tcat_query_bins') 346 # except: 347 348 instance_bins_metadata = {} 349 for instance_bin in instance_bins: 350 bin_data = { 351 'instance': instance, 352 'querybin': instance_bin['querybin'], 353 } 354 355 # Query for number of tweets 356 tweet_count = db.fetchone('SELECT COUNT(id) from ' + instance_bin['querybin'] + '_tweets')[ 357 'COUNT(id)'] 358 bin_data['tweet_count'] = tweet_count 359 360 # Collect first and last tweet datetimes 361 first_tweet_datetime = db.fetchone('SELECT created_at from ' + instance_bin['querybin'] + '_tweets ORDER BY created_at ASC LIMIT 1')['created_at'] 362 last_tweet_datetime = db.fetchone('SELECT created_at from ' + instance_bin['querybin'] + '_tweets ORDER BY created_at DESC LIMIT 1')['created_at'] 363 bin_data['first_tweet_datetime'] = first_tweet_datetime 364 bin_data['last_tweet_datetime'] = last_tweet_datetime 365 366 # Could check if bin currently should be collecting 367 # db.fetchall('SELECT EXISTS ( SELECT endtime from tcat_query_bins_periods WHERE querybin_id = ' + str(instance_bin['id']) + ' and endtime = "0000-00-00 00:00:00" ) as active') 368 369 # Could collect phrases or usernames... 
370 # if instance_bin['type'] in []: 371 # elif instance_bin['type'] in []: 372 373 # Could collect all periods for nuanced metadata... 374 #periods = db.fetchall('SELECT starttime, endtime from tcat_query_bins_periods WHERE query_bin_id = ' + instance_bin['id']) 375 376 # Add bin_data to instance collection 377 instance_bins_metadata[instance_bin['querybin']] = bin_data 378 # Add bins to instance 379 all_bins[instance['tcat_name']] = instance_bins_metadata 380 381 return all_bins 382 383 @staticmethod 384 def validate_query(query, request, user): 385 """ 386 Validate DMI-TCAT query input 387 388 :param dict query: Query parameters, from client-side. 389 :param request: Flask request 390 :param User user: User object of user who has submitted the query 391 :return dict: Safe query parameters 392 """ 393 # no query 4 u 394 if not query.get("bin", "").strip(): 395 raise QueryParametersException("You must choose a query bin to get tweets from.") 396 397 # the dates need to make sense as a range to search within 398 # and a date range is needed, to not make it too easy to just get all tweets 399 after, before = query.get("daterange") 400 if (after and before) and not before <= after: 401 raise QueryParametersException("A date range must start before it ends") 402 403 query["min_date"], query["max_date"] = query.get("daterange") 404 del query["daterange"] 405 406 # simple! 407 return query
class SearchWithinTCATBinsV2(Search):
Get Tweets via DMI-TCAT
This allows subsetting an existing query bin, similar to the 'Data Selection' panel in the DMI-TCAT analysis interface
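The class-level config block in the source above defines a single 4CAT setting, dmi-tcatv2-search.database_instances, from which connection details are read. As a minimal sketch, using the placeholder credentials from the setting's tooltip, its value is a list of instance objects (all keys are required for each instance):

    # Placeholder values from the tooltip; every key must be provided per instance
    database_instances = [{
        "tcat_name": "tcat2",         # instance label; bins are keyed as "<bin>@<tcat_name>"
        "db_name": "twittercapture",  # MySQL database backing this TCAT instance
        "db_user": "username",
        "db_password": "password",
        "db_host": "127.0.0.1",
        "db_port": 3306
    }]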
@classmethod
def get_options(cls, parent_dataset=None, user=None):
Get data source options
This method takes the pre-defined options, but fills the 'bin' option with bins currently available from the configured TCAT instances.
Parameters
- DataSet parent_dataset: An object representing the dataset that the processor would be run on
- User user: Flask user the options will be displayed for, in case they are requested for display in the 4CAT web interface. This can be used to show some options only to privileged users.
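For illustration, after collect_tcat_metadata() runs, the filled-in 'bin' option might look like the sketch below; the key format "<bin>@<instance>" and the display text follow the loop in the source, while the bin name and counts are hypothetical:

    options["bin"] = {
        "type": UserInput.OPTION_CHOICE,
        "help": "Query bin",
        "options": {
            # "<bin_name>@<tcat_name>": "<bin_name>: <count> tweets from <first> to <last>"
            "mybin@tcat2": "mybin: 51234 tweets from 2021-01-01 00:00:00 to 2021-06-30 23:59:59"
        }
    }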
def get_items(self, query):
Use the DMI-TCAT tweet export to retrieve tweets
Parameters
- query: query parameters

Yields
- dict: mapped_tweet for any "basic" query; for "advanced" queries, a dictionary with the MySQL result
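To make the "basic" path concrete, here is a hedged walk-through of the query builder for a hypothetical input; the split on ' AND ' / ' OR ' and the parameterised LIKE clauses follow the loop in the source, and the bin name "mybin" is hypothetical:

    text_query = "cats AND dogs OR birds"
    # The loop produces:
    where = ['lower(text) LIKE %s AND ', 'lower(text) LIKE %s OR ', 'lower(text) LIKE %s']
    replacements = ['%cats%', '%dogs%', '%birds%']
    # Joined, the statement sent to the unbuffered cursor becomes (roughly):
    # SELECT * FROM mybin_tweets WHERE lower(text) LIKE '%cats%' AND lower(text) LIKE '%dogs%' OR lower(text) LIKE '%birds%'

Note that the clauses are combined left to right without parentheses, so MySQL's usual operator precedence applies (AND binds more tightly than OR).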
@staticmethod
def tweet_mapping(tweet):
Takes TCAT output from specific tables and maps it to the fields 4CAT expects. The expected fields attempt to mirror the mapped_tweet from the twitterv2 datasource.
Parameters
- dict tweet: TCAT dict returned from query; expected to be from a bin's tweets table
Returns
- dict: the tweet mapped to 4CAT's expected fields
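The thread_id precedence described in the source comments can be summarised as a small sketch; resolve_thread_id is a hypothetical helper, and the field names are the TCAT tweets-table columns used in the mapping:

    def resolve_thread_id(tweet):
        # A reply threads under the replied-to tweet, a retweet under the
        # retweeted tweet; anything else threads under itself
        if tweet.get("in_reply_to_status_id"):
            return tweet["in_reply_to_status_id"]
        if tweet.get("retweet_id"):
            return tweet["retweet_id"]
        return tweet.get("id")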
@classmethod
def collect_tcat_metadata(cls):
Collect specific metadata from the TCAT instances listed in the configuration and return a dictionary containing this data. Used to inform the user of available TCAT bins and to create the options from which a user will select.
Returns
All of the available bins from accessible TCAT instances
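As a sketch with hypothetical values, the returned structure nests bins under each instance's tcat_name:

    {
        "tcat2": {                  # keyed by the instance's tcat_name
            "mybin": {              # keyed by query bin name
                "instance": {...},  # the full instance config dict
                "querybin": "mybin",
                "tweet_count": 51234,
                "first_tweet_datetime": datetime.datetime(2021, 1, 1, 0, 0),
                "last_tweet_datetime": datetime.datetime(2021, 6, 30, 23, 59, 59),
            }
        }
    }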
@staticmethod
def validate_query(query, request, user):
Validate DMI-TCAT query input
Parameters
- dict query: Query parameters, from client-side.
- request: Flask request
- User user: User object of user who has submitted the query
Returns
Safe query parameters
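A hedged usage sketch (the bin name and Unix timestamps are hypothetical; request and user are the usual Flask objects): the daterange tuple is validated, then split into min_date and max_date.

    query = {"bin": "mybin@tcat2", "query": "cats", "daterange": (1609459200, 1617235200)}
    validated = SearchWithinTCATBinsV2.validate_query(query, request, user)
    # validated == {"bin": "mybin@tcat2", "query": "cats",
    #               "min_date": 1609459200, "max_date": 1617235200}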
Inherited Members
- backend.lib.worker.BasicWorker
- BasicWorker
- INTERRUPT_NONE
- INTERRUPT_RETRY
- INTERRUPT_CANCEL
- queue
- log
- manager
- interrupted
- modules
- init_time
- name
- run
- clean_up
- request_interrupt
- is_4cat_class
- backend.lib.search.Search
- max_workers
- prefix
- return_cols
- import_error_count
- import_warning_count
- process
- search
- import_from_file
- items_to_csv
- items_to_ndjson
- items_to_archive
- backend.lib.processor.BasicProcessor
- db
- job
- dataset
- owner
- source_dataset
- source_file
- description
- category
- is_running_in_preset
- filepath
- work
- after_process
- remove_files
- abort
- add_field_to_parent
- iterate_archive_contents
- unpack_archive_contents
- extract_archived_file_by_name
- write_csv_items_and_finish
- write_archive_and_finish
- create_standalone
- map_item_method_available
- get_mapped_item
- is_filter
- get_status
- is_top_dataset
- is_from_collector
- get_extension
- is_rankable
- exclude_followup_processors
- is_4cat_processor