datasources.dmi-tcatv2.search_tcat_v2
Twitter search within a DMI-TCAT bin (v2), using a direct database connection.
1""" 2Twitter search within a DMI-TCAT bin v2 3Direct database connection 4""" 5import datetime 6import re 7import ural 8import pymysql 9 10 11from backend.lib.search import Search 12from common.lib.exceptions import QueryParametersException 13from common.lib.user_input import UserInput 14from backend.lib.database_mysql import MySQLDatabase 15 16 17class SearchWithinTCATBinsV2(Search): 18 """ 19 Get Tweets via DMI-TCAT 20 21 This allows subsetting an existing query bin, similar to the 'Data 22 Selection' panel in the DMI-TCAT analysis interface 23 """ 24 type = "dmi-tcatv2-search" # job ID 25 extension = "csv" 26 title = "TCAT Search (SQL)" 27 28 config = { 29 "dmi-tcatv2-search.database_instances": { 30 "type": UserInput.OPTION_TEXT_JSON, 31 "help": "DMI-TCAT instances", 32 "tooltip": "List of DMI-TCAT instance metadata, e.g. [{'tcat_name': 'tcat2','db_name': 'twittercapture'," 33 "'db_user': 'username','db_password': 'password','db_host': '127.0.0.1','db_port': 3306}] All of " 34 "these values need to be provided for each instance. This needs to be formatted as a JSON list of " 35 "objects.", 36 "default": {} 37 } 38 } 39 40 @classmethod 41 def get_options(cls, parent_dataset=None, config=None): 42 """ 43 Get data source options 44 45 This method takes the pre-defined options, but fills the 'bins' options 46 with bins currently available from the configured TCAT instances. 47 48 :param config: 49 :param DataSet parent_dataset: An object representing the dataset that 50 the processor would be run on can 51 be used to show some options only to privileges users. 52 """ 53 options = { 54 "intro-1": { 55 "type": UserInput.OPTION_INFO, 56 "help": "This data source interfaces with a DMI-TCAT instance to allow subsetting of tweets from a tweet " 57 "bin in that instance." 58 }, 59 "divider-1": { 60 "type": UserInput.OPTION_DIVIDER 61 }, 62 "bin": { 63 "type": UserInput.OPTION_INFO, 64 "help": "Query bin" 65 }, 66 "query_type": { 67 "type": UserInput.OPTION_CHOICE, 68 "help": "Basic or Advanced Query", 69 "options": { 70 "basic": "Basic query all bin tweets for specific text (Query) and date (Date range)", 71 "advanced": "Select queries on any TCAT twitter tables" 72 }, 73 "default": "basic", 74 "tooltip": "Advanced queries do not provide scaffolding, so understanding TCAT database structure is necessary" 75 }, 76 "query": { 77 "type": UserInput.OPTION_TEXT, 78 "help": "Query text", 79 "tooltip": "Match all tweets containing this text." 
80 }, 81 "daterange": { 82 "type": UserInput.OPTION_DATERANGE, 83 "help": "Date range" 84 } 85 } 86 87 # Collect Metadata from TCAT instances 88 all_bins = cls.collect_tcat_metadata(config) 89 90 options["bin"] = { 91 "type": UserInput.OPTION_CHOICE, 92 "options": {}, 93 "help": "Query bin" 94 } 95 96 for instance_name, bins in all_bins.items(): 97 for bin_name, bin in bins.items(): 98 bin_key = "%s@%s" % (bin_name, instance_name) 99 display_text = f"{bin_name}: {bin.get('tweet_count')} tweets from {bin.get('first_tweet_datetime').strftime('%Y-%m-%d %H:%M:%S')} to {bin.get('last_tweet_datetime').strftime('%Y-%m-%d %H:%M:%S')}" 100 options["bin"]["options"][bin_key] = display_text 101 102 return options 103 104 def get_items(self, query): 105 """ 106 Use the DMI-TCAT tweet export to retrieve tweets 107 108 :param query: 109 :yield dict: mapped_tweet for any "basic" query else for "advanced" queries a dictionary with mysql result 110 """ 111 bin = self.parameters.get("bin") 112 bin_name = bin.split("@")[0] 113 tcat_name = bin.split("@").pop() 114 115 available_instances = self.config.get("dmi-tcatv2-search.database_instances", []) 116 instance = [instance for instance in available_instances if instance.get('tcat_name') == tcat_name][0] 117 118 db = MySQLDatabase(logger=self.log, 119 dbname=instance.get('db_name'), 120 user=instance.get('db_user'), 121 password=instance.get('db_password'), 122 host=instance.get('db_host'), 123 port=instance.get('db_port')) 124 125 self.dataset.update_status("Searching for tweets on %s" % bin_name) 126 if self.parameters.get("query_type") == 'advanced': 127 # Advanced query should be simple from our perspective... 128 self.dataset.log('Query: %s' % self.parameters.get("query")) 129 unbuffered_cursor = db.connection.cursor(pymysql.cursors.SSCursor) 130 try: 131 unbuffered_cursor.execute(self.parameters.get("query")) 132 except pymysql.err.ProgrammingError as e: 133 self.dataset.update_status("SQL query error: %s" % str(e), is_final=True) 134 return 135 # self.dataset.update_status("Retrieving %i results" % int(num_results)) # num_results is CLEARLY not what I thought 136 # Get column names from cursor 137 column_names = [description[0] for description in unbuffered_cursor.description] 138 for result in unbuffered_cursor.fetchall_unbuffered(): 139 # Reformat result (which is a tuple with each column in the row) to dict 140 new_result = {k: v for k, v in zip(column_names, result)} 141 # 4CAT necessary fieldnames 142 new_result['id'] = new_result.get('id', '') 143 new_result['thread_id'] = new_result.get("in_reply_to_status_id") if new_result.get( 144 "in_reply_to_status_id") else new_result.get("quoted_status_id") if new_result.get( 145 "quoted_status_id") else new_result.get("id") 146 new_result['body'] = new_result.get('text', '') 147 new_result['timestamp'] = new_result.get('created_at', None) 148 new_result['subject'] = '' 149 new_result['author'] = new_result.get('from_user_name', '') 150 yield new_result 151 152 else: 153 # "Basic" query 154 text_query = self.parameters.get("query") 155 156 where = [] 157 replacements = [] 158 # Find AND and OR 159 placeholder = 0 160 start_of_match = 0 161 while start_of_match >= 0: 162 match = None 163 and_match = text_query[placeholder:].find(' AND ') 164 or_match = text_query[placeholder:].find(' OR ') 165 if and_match != -1 and or_match != -1: 166 # both found 167 if and_match < or_match: 168 # and match first 169 match = 'AND ' 170 start_of_match = and_match 171 else: 172 # or match first 173 match ='OR ' 174 
start_of_match = or_match 175 elif and_match != -1: 176 # and match only 177 match ='AND ' 178 start_of_match = and_match 179 elif or_match != -1: 180 # or match only 181 match = 'OR ' 182 start_of_match = or_match 183 else: 184 # neither 185 match = None 186 start_of_match = -1 187 # Add partial query to where and replacements 188 if match: 189 where.append('lower(text) LIKE %s ' + match) 190 replacements.append('%'+text_query[placeholder:placeholder+start_of_match].lower().strip()+'%') 191 # new start 192 placeholder = placeholder + start_of_match + len(match) 193 else: 194 where.append('lower(text) LIKE %s') 195 replacements.append('%'+text_query[placeholder:].lower().strip()+'%') 196 197 if query.get("min_date", None): 198 try: 199 if int(query.get("min_date")) > 0: 200 where.append("AND created_at >= %s") 201 replacements.append(datetime.datetime.fromtimestamp(int(query.get("min_date")))) 202 except ValueError: 203 pass 204 205 if query.get("max_date", None): 206 try: 207 if int(query.get("max_date")) > 0: 208 where.append("AND created_at < %s") 209 replacements.append(datetime.datetime.fromtimestamp(int(query.get("max_date")))) 210 except ValueError: 211 pass 212 213 where = " ".join(where) 214 query = 'SELECT * FROM ' + bin_name + '_tweets WHERE ' + where 215 self.dataset.log('Query: %s' % query) 216 self.dataset.log('Replacements: %s' % ', '.join([str(i) for i in replacements])) 217 unbuffered_cursor = db.connection.cursor(pymysql.cursors.SSCursor) 218 unbuffered_cursor.execute(query, replacements) 219 # self.dataset.update_status("Retrieving %i results" % int(num_results)) # num_results is CLEARLY not what I thought 220 column_names = [description[0] for description in unbuffered_cursor.description] 221 for result in unbuffered_cursor.fetchall_unbuffered(): 222 new_result = {k: v for k, v in zip(column_names, result)} 223 # Map tweet to 4CAT fields 224 new_result = self.tweet_mapping(new_result) 225 yield new_result 226 227 @staticmethod 228 def tweet_mapping(tweet): 229 """ 230 Takes TCAT output from specific tables and maps them to 4CAT expected fields. The expected fields attempt to 231 mirror that mapped_tweet from twitterv2 datasource. 232 233 :param dict tweet: TCAT dict returned from query; expected to be from bin tweets table 234 :return dict: 235 """ 236 mapped_tweet = {'id': tweet.get('id', ''), 237 # For thread_id, we use in_reply_to_status_id if tweet is reply, retweet_id if tweet is 238 # retweet, or its own ID 239 # Note: tweets can have BOTH in_reply_to_status_id and retweet_id as you can retweet a reply 240 # or reply to retweet. 
241 # THIS IS DIFFERENT from Twitter APIv2 as there does not appear to be a quote ID (for retweets 242 # with added text) 243 'thread_id': tweet.get("in_reply_to_status_id") if tweet.get( 244 "in_reply_to_status_id") else tweet.get("retweet_id") if tweet.get( 245 "retweet_id") else tweet.get("id"), 246 'body': tweet.get('text', ''), 247 # 'created_at': tweet.get('created_at'), 248 'timestamp': int(datetime.datetime.timestamp(tweet.get('created_at'))) if type( 249 tweet.get('created_at')) is datetime.datetime else None, 250 'subject': '', 251 'author': tweet.get('from_user_name', ''), 252 "author_fullname": tweet["from_user_realname"], 253 "author_id": tweet["from_user_id"], 254 "source": tweet["source"], 255 "language_guess": tweet.get("lang"), 256 257 "retweet_count": tweet["retweet_count"], 258 "like_count": tweet["favorite_count"], 259 "is_retweet": "yes" if tweet.get('retweet_id', False) else "no", 260 "is_reply": "yes" if tweet["in_reply_to_status_id"] else "no", 261 "in_reply_to_status_id": tweet["in_reply_to_status_id"] if tweet["in_reply_to_status_id"] else None, 262 "reply_to": tweet["to_user_name"], 263 "reply_to_id": tweet.get('to_user_id') if tweet.get('to_user_id') else None, 264 265 # 4CAT specifics 266 "hashtags": ",".join(re.findall(r"#([^\s!@#$%^&*()_+{}:\"|<>?\[\];'\,./`~]+)", tweet["text"])), 267 "urls": ",".join(ural.urls_from_text(tweet["text"])), 268 "images": ",".join(re.findall(r"https://t\.co/[a-zA-Z0-9]+$", tweet["text"])), 269 "mentions": ",".join(re.findall(r"@([^\s!@#$%^&*()+{}:\"|<>?\[\];'\,./`~]+)", tweet["text"])), 270 271 # Additional TCAT data (compared to twitterv2 map_item function) 272 "filter_level": tweet['filter_level'], 273 'location': tweet['location'], 274 'latitude': tweet['geo_lat'] if tweet['geo_lat'] else None, 275 'longitude': tweet['geo_lng'] if tweet['geo_lng'] else None, 276 'author_verified': tweet['from_user_verified'] if tweet['from_user_verified'] else None, 277 'author_description': tweet['from_user_description'], 278 'author_url': tweet['from_user_url'], 279 'author_profile_image': tweet['from_user_profile_image_url'], 280 'author_timezone_UTC_offset': int((int(tweet['from_user_utcoffset']) if 281 tweet['from_user_utcoffset'] else 0)/60/60), 282 'author_timezone_name': tweet['from_user_timezone'], 283 'author_language': tweet['from_user_lang'], 284 'author_tweet_count': tweet['from_user_tweetcount'], 285 'author_follower_count': tweet['from_user_followercount'], 286 'author_friend_following_count': tweet['from_user_friendcount'], 287 'author_favorite_count': tweet.get('from_user_favourites_count'), # NOT in tweets table? 288 'author_listed_count': tweet['from_user_listed'] if tweet['from_user_listed'] else None, 289 'author_withheld_scope': tweet.get('from_user_withheld_scope'), # NOT in tweets table? 290 'author_created_at': tweet.get('from_user_created_at'), # NOT in tweets table? 
291 292 # TODO find in other TCAT tables or does not exist 293 # "possibly_sensitive": "yes" if tweet.get("possibly_sensitive") not in ("", "0") else "no", 294 # "is_quote_tweet": "yes" if tweet["quoted_status_id"] else "no", 295 # 'withheld_copyright': tweet['withheld_copyright'], # TCAT may no collect this anymore 296 # 'withheld_scope': tweet['withheld_scope'], # TCAT may no collect this anymore 297 # 'truncated': tweet['truncated'], # Older tweets could be truncated meaning their text was cut off due to Twitter/TCAT db character limits 298 299 } 300 301 # Ensure that any keys not specifically mapped to another field are added to the new mapped_tweet 302 mapped_keys = ['id', 'text', 'created_at', 'from_user_name', 'from_user_realname', 'from_user_id', 303 'from_user_lang', 'from_user_tweetcount', 'from_user_followercount', 'from_user_friendcount', 304 'from_user_listed', 'from_user_utcoffset', 'from_user_timezone', 'from_user_description', 305 'from_user_url', 'from_user_verified', 'from_user_profile_image_url', 'source', 'lang', 306 'filter_level', 'location', 'to_user_name', 'to_user_id', 'geo_lat', 'geo_lng', 'retweet_count', 307 'in_reply_to_status_id'] 308 for key in tweet.keys(): 309 if key not in mapped_keys: 310 index = '' 311 while key + index in mapped_tweet.keys(): 312 index += '_1' 313 mapped_tweet[key + index] = tweet.get(key) 314 315 return mapped_tweet 316 317 318 @classmethod 319 def collect_tcat_metadata(cls, config): 320 """ 321 Collect specific metadata from TCAT instances listed in the configuration and return a dictionary containing 322 this data. To be used to infor the user of available TCAT bins and create the options from which a user will 323 select. 324 325 :return dict: All of the available bins from accessible TCAT instances 326 """ 327 328 # todo: cache this somehow! and check for the cache 329 instances = config.get("dmi-tcatv2.instances", []) 330 331 all_bins = {} 332 for instance in instances: 333 # Query each instance for bins 334 db = MySQLDatabase(logger=None, 335 dbname=instance.get('db_name'), 336 user=instance.get('db_user'), 337 password=instance.get('db_password'), 338 host=instance.get('db_host'), 339 port=instance.get('db_port')) 340 # try: 341 instance_bins = db.fetchall('SELECT id, querybin, type from tcat_query_bins') 342 # except: 343 344 instance_bins_metadata = {} 345 for instance_bin in instance_bins: 346 bin_data = { 347 'instance': instance, 348 'querybin': instance_bin['querybin'], 349 } 350 351 # Query for number of tweets 352 tweet_count = db.fetchone('SELECT COUNT(id) from ' + instance_bin['querybin'] + '_tweets')[ 353 'COUNT(id)'] 354 bin_data['tweet_count'] = tweet_count 355 356 # Collect first and last tweet datetimes 357 first_tweet_datetime = db.fetchone('SELECT created_at from ' + instance_bin['querybin'] + '_tweets ORDER BY created_at ASC LIMIT 1')['created_at'] 358 last_tweet_datetime = db.fetchone('SELECT created_at from ' + instance_bin['querybin'] + '_tweets ORDER BY created_at DESC LIMIT 1')['created_at'] 359 bin_data['first_tweet_datetime'] = first_tweet_datetime 360 bin_data['last_tweet_datetime'] = last_tweet_datetime 361 362 # Could check if bin currently should be collecting 363 # db.fetchall('SELECT EXISTS ( SELECT endtime from tcat_query_bins_periods WHERE querybin_id = ' + str(instance_bin['id']) + ' and endtime = "0000-00-00 00:00:00" ) as active') 364 365 # Could collect phrases or usernames... 
366 # if instance_bin['type'] in []: 367 # elif instance_bin['type'] in []: 368 369 # Could collect all periods for nuanced metadata... 370 #periods = db.fetchall('SELECT starttime, endtime from tcat_query_bins_periods WHERE query_bin_id = ' + instance_bin['id']) 371 372 # Add bin_data to instance collection 373 instance_bins_metadata[instance_bin['querybin']] = bin_data 374 # Add bins to instance 375 all_bins[instance['tcat_name']] = instance_bins_metadata 376 377 return all_bins 378 379 @staticmethod 380 def validate_query(query, request, config): 381 """ 382 Validate DMI-TCAT query input 383 384 :param dict query: Query parameters, from client-side. 385 :param request: Flask request 386 :param ConfigManager|None config: Configuration reader (context-aware) 387 :return dict: Safe query parameters 388 """ 389 # no query 4 u 390 if not query.get("bin", ""): 391 raise QueryParametersException("You must choose a query bin to get tweets from.") 392 393 # the dates need to make sense as a range to search within 394 # and a date range is needed, to not make it too easy to just get all tweets 395 after, before = query.get("daterange") 396 if (after and before) and not before <= after: 397 raise QueryParametersException("A date range must start before it ends") 398 399 query["min_date"], query["max_date"] = query.get("daterange") 400 query["bin"] = query.get("bin", "").strip() 401 del query["daterange"] 402 403 # simple! 404 return query
class SearchWithinTCATBinsV2(Search):
    """
    Get Tweets via DMI-TCAT

    This allows subsetting an existing query bin, similar to the 'Data
    Selection' panel in the DMI-TCAT analysis interface
    """
    type = "dmi-tcatv2-search"  # job ID
    extension = "csv"
    title = "TCAT Search (SQL)"

    config = {
        "dmi-tcatv2-search.database_instances": {
            "type": UserInput.OPTION_TEXT_JSON,
            "help": "DMI-TCAT instances",
            "tooltip": "List of DMI-TCAT instance metadata, e.g. [{'tcat_name': 'tcat2','db_name': 'twittercapture',"
                       "'db_user': 'username','db_password': 'password','db_host': '127.0.0.1','db_port': 3306}] All of "
                       "these values need to be provided for each instance. This needs to be formatted as a JSON list of "
                       "objects.",
            "default": {}
        }
    }
Get Tweets via DMI-TCAT
This allows subsetting an existing query bin, similar to the 'Data Selection' panel in the DMI-TCAT analysis interface
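The database_instances setting shown in the class source above expects a JSON list of instance objects, with every key supplied for each instance. A minimal sketch of such a value, with placeholder names and credentials, written here as the equivalent Python literal:

# Hypothetical value for the "dmi-tcatv2-search.database_instances" setting
# (entered in 4CAT as the equivalent JSON); all keys are required per instance.
database_instances = [
    {
        "tcat_name": "tcat2",          # label used to build "bin@instance" keys
        "db_name": "twittercapture",   # MySQL database of the TCAT install
        "db_user": "username",
        "db_password": "password",
        "db_host": "127.0.0.1",
        "db_port": 3306
    }
]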
    @classmethod
    def get_options(cls, parent_dataset=None, config=None):
        """
        Get data source options

        This method takes the pre-defined options, but fills the 'bins' options
        with bins currently available from the configured TCAT instances.

        :param config:
        :param DataSet parent_dataset: An object representing the dataset that
        the processor would be run on can
        be used to show some options only to privileges users.
        """
        options = {
            "intro-1": {
                "type": UserInput.OPTION_INFO,
                "help": "This data source interfaces with a DMI-TCAT instance to allow subsetting of tweets from a tweet "
                        "bin in that instance."
            },
            "divider-1": {
                "type": UserInput.OPTION_DIVIDER
            },
            "bin": {
                "type": UserInput.OPTION_INFO,
                "help": "Query bin"
            },
            "query_type": {
                "type": UserInput.OPTION_CHOICE,
                "help": "Basic or Advanced Query",
                "options": {
                    "basic": "Basic query all bin tweets for specific text (Query) and date (Date range)",
                    "advanced": "Select queries on any TCAT twitter tables"
                },
                "default": "basic",
                "tooltip": "Advanced queries do not provide scaffolding, so understanding TCAT database structure is necessary"
            },
            "query": {
                "type": UserInput.OPTION_TEXT,
                "help": "Query text",
                "tooltip": "Match all tweets containing this text."
            },
            "daterange": {
                "type": UserInput.OPTION_DATERANGE,
                "help": "Date range"
            }
        }

        # Collect Metadata from TCAT instances
        all_bins = cls.collect_tcat_metadata(config)

        options["bin"] = {
            "type": UserInput.OPTION_CHOICE,
            "options": {},
            "help": "Query bin"
        }

        for instance_name, bins in all_bins.items():
            for bin_name, bin in bins.items():
                bin_key = "%s@%s" % (bin_name, instance_name)
                display_text = f"{bin_name}: {bin.get('tweet_count')} tweets from {bin.get('first_tweet_datetime').strftime('%Y-%m-%d %H:%M:%S')} to {bin.get('last_tweet_datetime').strftime('%Y-%m-%d %H:%M:%S')}"
                options["bin"]["options"][bin_key] = display_text

        return options
Get data source options
This method takes the pre-defined options, but fills the 'bin' option with the bins currently available from the configured TCAT instances.
Parameters
- config: Configuration reader, used to look up the configured TCAT instances.
- DataSet parent_dataset: An object representing the dataset the processor would be run on; can be used to show some options only to privileged users.
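For illustration, a minimal sketch of how the bin choices end up keyed and labelled, assuming collect_tcat_metadata() returned a single instance with a single bin (all values invented):

import datetime

# Invented return value of collect_tcat_metadata(): one instance, one bin
all_bins = {
    "tcat2": {
        "climate": {
            "tweet_count": 12345,
            "first_tweet_datetime": datetime.datetime(2021, 1, 1, 0, 0),
            "last_tweet_datetime": datetime.datetime(2021, 6, 30, 23, 59),
        }
    }
}

bin_options = {}
for instance_name, bins in all_bins.items():
    for bin_name, bin_data in bins.items():
        # the key "climate@tcat2" is what get_items() later splits on "@"
        bin_key = "%s@%s" % (bin_name, instance_name)
        bin_options[bin_key] = (
            f"{bin_name}: {bin_data['tweet_count']} tweets from "
            f"{bin_data['first_tweet_datetime'].strftime('%Y-%m-%d %H:%M:%S')} to "
            f"{bin_data['last_tweet_datetime'].strftime('%Y-%m-%d %H:%M:%S')}"
        )

print(bin_options)
# {'climate@tcat2': 'climate: 12345 tweets from 2021-01-01 00:00:00 to 2021-06-30 23:59:00'}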
    def get_items(self, query):
        """
        Use the DMI-TCAT tweet export to retrieve tweets

        :param query:
        :yield dict: mapped_tweet for any "basic" query else for "advanced" queries a dictionary with mysql result
        """
        bin = self.parameters.get("bin")
        bin_name = bin.split("@")[0]
        tcat_name = bin.split("@").pop()

        available_instances = self.config.get("dmi-tcatv2-search.database_instances", [])
        instance = [instance for instance in available_instances if instance.get('tcat_name') == tcat_name][0]

        db = MySQLDatabase(logger=self.log,
                           dbname=instance.get('db_name'),
                           user=instance.get('db_user'),
                           password=instance.get('db_password'),
                           host=instance.get('db_host'),
                           port=instance.get('db_port'))

        self.dataset.update_status("Searching for tweets on %s" % bin_name)
        if self.parameters.get("query_type") == 'advanced':
            # Advanced query should be simple from our perspective...
            self.dataset.log('Query: %s' % self.parameters.get("query"))
            unbuffered_cursor = db.connection.cursor(pymysql.cursors.SSCursor)
            try:
                unbuffered_cursor.execute(self.parameters.get("query"))
            except pymysql.err.ProgrammingError as e:
                self.dataset.update_status("SQL query error: %s" % str(e), is_final=True)
                return
            # self.dataset.update_status("Retrieving %i results" % int(num_results)) # num_results is CLEARLY not what I thought
            # Get column names from cursor
            column_names = [description[0] for description in unbuffered_cursor.description]
            for result in unbuffered_cursor.fetchall_unbuffered():
                # Reformat result (which is a tuple with each column in the row) to dict
                new_result = {k: v for k, v in zip(column_names, result)}
                # 4CAT necessary fieldnames
                new_result['id'] = new_result.get('id', '')
                new_result['thread_id'] = new_result.get("in_reply_to_status_id") if new_result.get(
                    "in_reply_to_status_id") else new_result.get("quoted_status_id") if new_result.get(
                    "quoted_status_id") else new_result.get("id")
                new_result['body'] = new_result.get('text', '')
                new_result['timestamp'] = new_result.get('created_at', None)
                new_result['subject'] = ''
                new_result['author'] = new_result.get('from_user_name', '')
                yield new_result

        else:
            # "Basic" query
            text_query = self.parameters.get("query")

            where = []
            replacements = []
            # Find AND and OR
            placeholder = 0
            start_of_match = 0
            while start_of_match >= 0:
                match = None
                and_match = text_query[placeholder:].find(' AND ')
                or_match = text_query[placeholder:].find(' OR ')
                if and_match != -1 and or_match != -1:
                    # both found
                    if and_match < or_match:
                        # and match first
                        match = 'AND '
                        start_of_match = and_match
                    else:
                        # or match first
                        match = 'OR '
                        start_of_match = or_match
                elif and_match != -1:
                    # and match only
                    match = 'AND '
                    start_of_match = and_match
                elif or_match != -1:
                    # or match only
                    match = 'OR '
                    start_of_match = or_match
                else:
                    # neither
                    match = None
                    start_of_match = -1
                # Add partial query to where and replacements
                if match:
                    where.append('lower(text) LIKE %s ' + match)
                    replacements.append('%' + text_query[placeholder:placeholder + start_of_match].lower().strip() + '%')
                    # new start
                    placeholder = placeholder + start_of_match + len(match)
                else:
                    where.append('lower(text) LIKE %s')
                    replacements.append('%' + text_query[placeholder:].lower().strip() + '%')

            if query.get("min_date", None):
                try:
                    if int(query.get("min_date")) > 0:
                        where.append("AND created_at >= %s")
                        replacements.append(datetime.datetime.fromtimestamp(int(query.get("min_date"))))
                except ValueError:
                    pass

            if query.get("max_date", None):
                try:
                    if int(query.get("max_date")) > 0:
                        where.append("AND created_at < %s")
                        replacements.append(datetime.datetime.fromtimestamp(int(query.get("max_date"))))
                except ValueError:
                    pass

            where = " ".join(where)
            query = 'SELECT * FROM ' + bin_name + '_tweets WHERE ' + where
            self.dataset.log('Query: %s' % query)
            self.dataset.log('Replacements: %s' % ', '.join([str(i) for i in replacements]))
            unbuffered_cursor = db.connection.cursor(pymysql.cursors.SSCursor)
            unbuffered_cursor.execute(query, replacements)
            # self.dataset.update_status("Retrieving %i results" % int(num_results)) # num_results is CLEARLY not what I thought
            column_names = [description[0] for description in unbuffered_cursor.description]
            for result in unbuffered_cursor.fetchall_unbuffered():
                new_result = {k: v for k, v in zip(column_names, result)}
                # Map tweet to 4CAT fields
                new_result = self.tweet_mapping(new_result)
                yield new_result
Use the DMI-TCAT tweet export to retrieve tweets
Parameters
- query: Query parameters, as returned by validate_query.
Yields
A mapped_tweet dict for "basic" queries; for "advanced" queries, a dictionary with the raw MySQL result row, supplemented with the fieldnames 4CAT requires.
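In basic mode the query text is turned into a parameterised WHERE clause. A rough, self-contained sketch of the outcome for a query such as "cat AND dog OR bird"; the real method walks the string with find() rather than re.split(), but produces the same clause for simple inputs, and "<bin>" stands in for the chosen bin name:

import re

text_query = "cat AND dog OR bird"

where = []
replacements = []
# split on literal " AND " / " OR " while keeping the operators
parts = re.split(r" (AND|OR) ", text_query)
terms, operators = parts[0::2], parts[1::2]
for i, term in enumerate(terms):
    clause = "lower(text) LIKE %s"
    if i < len(operators):
        clause += " " + operators[i]
    where.append(clause)
    replacements.append("%" + term.lower().strip() + "%")

sql = "SELECT * FROM <bin>_tweets WHERE " + " ".join(where)
print(sql)
# SELECT * FROM <bin>_tweets WHERE lower(text) LIKE %s AND lower(text) LIKE %s OR lower(text) LIKE %s
print(replacements)
# ['%cat%', '%dog%', '%bird%']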
    @staticmethod
    def tweet_mapping(tweet):
        """
        Takes TCAT output from specific tables and maps them to 4CAT expected fields. The expected fields attempt to
        mirror that mapped_tweet from twitterv2 datasource.

        :param dict tweet: TCAT dict returned from query; expected to be from bin tweets table
        :return dict:
        """
        mapped_tweet = {'id': tweet.get('id', ''),
                        # For thread_id, we use in_reply_to_status_id if tweet is reply, retweet_id if tweet is
                        # retweet, or its own ID
                        # Note: tweets can have BOTH in_reply_to_status_id and retweet_id as you can retweet a reply
                        # or reply to retweet.
                        # THIS IS DIFFERENT from Twitter APIv2 as there does not appear to be a quote ID (for retweets
                        # with added text)
                        'thread_id': tweet.get("in_reply_to_status_id") if tweet.get(
                            "in_reply_to_status_id") else tweet.get("retweet_id") if tweet.get(
                            "retweet_id") else tweet.get("id"),
                        'body': tweet.get('text', ''),
                        # 'created_at': tweet.get('created_at'),
                        'timestamp': int(datetime.datetime.timestamp(tweet.get('created_at'))) if type(
                            tweet.get('created_at')) is datetime.datetime else None,
                        'subject': '',
                        'author': tweet.get('from_user_name', ''),
                        "author_fullname": tweet["from_user_realname"],
                        "author_id": tweet["from_user_id"],
                        "source": tweet["source"],
                        "language_guess": tweet.get("lang"),

                        "retweet_count": tweet["retweet_count"],
                        "like_count": tweet["favorite_count"],
                        "is_retweet": "yes" if tweet.get('retweet_id', False) else "no",
                        "is_reply": "yes" if tweet["in_reply_to_status_id"] else "no",
                        "in_reply_to_status_id": tweet["in_reply_to_status_id"] if tweet["in_reply_to_status_id"] else None,
                        "reply_to": tweet["to_user_name"],
                        "reply_to_id": tweet.get('to_user_id') if tweet.get('to_user_id') else None,

                        # 4CAT specifics
                        "hashtags": ",".join(re.findall(r"#([^\s!@#$%^&*()_+{}:\"|<>?\[\];'\,./`~]+)", tweet["text"])),
                        "urls": ",".join(ural.urls_from_text(tweet["text"])),
                        "images": ",".join(re.findall(r"https://t\.co/[a-zA-Z0-9]+$", tweet["text"])),
                        "mentions": ",".join(re.findall(r"@([^\s!@#$%^&*()+{}:\"|<>?\[\];'\,./`~]+)", tweet["text"])),

                        # Additional TCAT data (compared to twitterv2 map_item function)
                        "filter_level": tweet['filter_level'],
                        'location': tweet['location'],
                        'latitude': tweet['geo_lat'] if tweet['geo_lat'] else None,
                        'longitude': tweet['geo_lng'] if tweet['geo_lng'] else None,
                        'author_verified': tweet['from_user_verified'] if tweet['from_user_verified'] else None,
                        'author_description': tweet['from_user_description'],
                        'author_url': tweet['from_user_url'],
                        'author_profile_image': tweet['from_user_profile_image_url'],
                        'author_timezone_UTC_offset': int((int(tweet['from_user_utcoffset']) if
                                                           tweet['from_user_utcoffset'] else 0)/60/60),
                        'author_timezone_name': tweet['from_user_timezone'],
                        'author_language': tweet['from_user_lang'],
                        'author_tweet_count': tweet['from_user_tweetcount'],
                        'author_follower_count': tweet['from_user_followercount'],
                        'author_friend_following_count': tweet['from_user_friendcount'],
                        'author_favorite_count': tweet.get('from_user_favourites_count'),  # NOT in tweets table?
                        'author_listed_count': tweet['from_user_listed'] if tweet['from_user_listed'] else None,
                        'author_withheld_scope': tweet.get('from_user_withheld_scope'),  # NOT in tweets table?
                        'author_created_at': tweet.get('from_user_created_at'),  # NOT in tweets table?

                        # TODO find in other TCAT tables or does not exist
                        # "possibly_sensitive": "yes" if tweet.get("possibly_sensitive") not in ("", "0") else "no",
                        # "is_quote_tweet": "yes" if tweet["quoted_status_id"] else "no",
                        # 'withheld_copyright': tweet['withheld_copyright'], # TCAT may no collect this anymore
                        # 'withheld_scope': tweet['withheld_scope'], # TCAT may no collect this anymore
                        # 'truncated': tweet['truncated'], # Older tweets could be truncated meaning their text was cut off due to Twitter/TCAT db character limits

                        }

        # Ensure that any keys not specifically mapped to another field are added to the new mapped_tweet
        mapped_keys = ['id', 'text', 'created_at', 'from_user_name', 'from_user_realname', 'from_user_id',
                       'from_user_lang', 'from_user_tweetcount', 'from_user_followercount', 'from_user_friendcount',
                       'from_user_listed', 'from_user_utcoffset', 'from_user_timezone', 'from_user_description',
                       'from_user_url', 'from_user_verified', 'from_user_profile_image_url', 'source', 'lang',
                       'filter_level', 'location', 'to_user_name', 'to_user_id', 'geo_lat', 'geo_lng', 'retweet_count',
                       'in_reply_to_status_id']
        for key in tweet.keys():
            if key not in mapped_keys:
                index = ''
                while key + index in mapped_tweet.keys():
                    index += '_1'
                mapped_tweet[key + index] = tweet.get(key)

        return mapped_tweet
Takes TCAT output from specific tables and maps it to the fields 4CAT expects. The expected fields attempt to mirror the mapped_tweet of the twitterv2 data source.
Parameters
- dict tweet: TCAT row returned from the query; expected to come from a bin's _tweets table.
Returns
The mapped tweet
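To make the thread_id and timestamp rules concrete, a small sketch on an invented, heavily truncated TCAT row (a real row from a bin's _tweets table carries many more columns, all of which tweet_mapping() expects to be present):

import datetime

# Invented, truncated TCAT row for illustration only
tweet = {
    "id": 111,
    "in_reply_to_status_id": None,   # not a reply
    "retweet_id": 99,                # retweet of tweet 99
    "created_at": datetime.datetime(2021, 5, 1, 12, 0, 0),
}

# thread_id: reply target first, then retweeted tweet, then the tweet's own id
thread_id = (tweet.get("in_reply_to_status_id")
             or tweet.get("retweet_id")
             or tweet.get("id"))

# timestamp: TCAT stores created_at as a DATETIME; 4CAT wants a unix timestamp
timestamp = (int(tweet["created_at"].timestamp())
             if isinstance(tweet["created_at"], datetime.datetime) else None)

print(thread_id)   # 99
print(timestamp)   # unix timestamp for 2021-05-01 12:00 in the local timezone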
    @classmethod
    def collect_tcat_metadata(cls, config):
        """
        Collect specific metadata from TCAT instances listed in the configuration and return a dictionary containing
        this data. To be used to infor the user of available TCAT bins and create the options from which a user will
        select.

        :return dict: All of the available bins from accessible TCAT instances
        """

        # todo: cache this somehow! and check for the cache
        instances = config.get("dmi-tcatv2.instances", [])

        all_bins = {}
        for instance in instances:
            # Query each instance for bins
            db = MySQLDatabase(logger=None,
                               dbname=instance.get('db_name'),
                               user=instance.get('db_user'),
                               password=instance.get('db_password'),
                               host=instance.get('db_host'),
                               port=instance.get('db_port'))
            # try:
            instance_bins = db.fetchall('SELECT id, querybin, type from tcat_query_bins')
            # except:

            instance_bins_metadata = {}
            for instance_bin in instance_bins:
                bin_data = {
                    'instance': instance,
                    'querybin': instance_bin['querybin'],
                }

                # Query for number of tweets
                tweet_count = db.fetchone('SELECT COUNT(id) from ' + instance_bin['querybin'] + '_tweets')[
                    'COUNT(id)']
                bin_data['tweet_count'] = tweet_count

                # Collect first and last tweet datetimes
                first_tweet_datetime = db.fetchone('SELECT created_at from ' + instance_bin['querybin'] + '_tweets ORDER BY created_at ASC LIMIT 1')['created_at']
                last_tweet_datetime = db.fetchone('SELECT created_at from ' + instance_bin['querybin'] + '_tweets ORDER BY created_at DESC LIMIT 1')['created_at']
                bin_data['first_tweet_datetime'] = first_tweet_datetime
                bin_data['last_tweet_datetime'] = last_tweet_datetime

                # Could check if bin currently should be collecting
                # db.fetchall('SELECT EXISTS ( SELECT endtime from tcat_query_bins_periods WHERE querybin_id = ' + str(instance_bin['id']) + ' and endtime = "0000-00-00 00:00:00" ) as active')

                # Could collect phrases or usernames...
                # if instance_bin['type'] in []:
                # elif instance_bin['type'] in []:

                # Could collect all periods for nuanced metadata...
                # periods = db.fetchall('SELECT starttime, endtime from tcat_query_bins_periods WHERE query_bin_id = ' + instance_bin['id'])

                # Add bin_data to instance collection
                instance_bins_metadata[instance_bin['querybin']] = bin_data
            # Add bins to instance
            all_bins[instance['tcat_name']] = instance_bins_metadata

        return all_bins
Collect specific metadata from the TCAT instances listed in the configuration and return it as a dictionary. This is used to inform the user of the available TCAT bins and to create the options from which a user will select.
Returns
All of the available bins from accessible TCAT instances
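The result is keyed first by instance name and then by bin name. A hypothetical return value for one instance with one bin might look like this (values invented):

import datetime

all_bins = {
    "tcat2": {                                   # instance["tcat_name"]
        "climate": {                             # tcat_query_bins.querybin
            "instance": {"tcat_name": "tcat2", "db_name": "twittercapture"},  # truncated
            "querybin": "climate",
            "tweet_count": 12345,
            "first_tweet_datetime": datetime.datetime(2021, 1, 1, 0, 0),
            "last_tweet_datetime": datetime.datetime(2021, 6, 30, 23, 59),
        }
    }
}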
    @staticmethod
    def validate_query(query, request, config):
        """
        Validate DMI-TCAT query input

        :param dict query: Query parameters, from client-side.
        :param request: Flask request
        :param ConfigManager|None config: Configuration reader (context-aware)
        :return dict: Safe query parameters
        """
        # no query 4 u
        if not query.get("bin", ""):
            raise QueryParametersException("You must choose a query bin to get tweets from.")

        # the dates need to make sense as a range to search within
        # and a date range is needed, to not make it too easy to just get all tweets
        after, before = query.get("daterange")
        if (after and before) and not before <= after:
            raise QueryParametersException("A date range must start before it ends")

        query["min_date"], query["max_date"] = query.get("daterange")
        query["bin"] = query.get("bin", "").strip()
        del query["daterange"]

        # simple!
        return query
Validate DMI-TCAT query input
Parameters
- dict query: Query parameters, from client-side.
- request: Flask request
- ConfigManager|None config: Configuration reader (context-aware)
Returns
Safe query parameters
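In effect the method renames the client-side keys into the ones get_items() reads. A sketch of the transformation for a well-formed query, with invented timestamps; request and config are omitted here because the method body does not use them:

query = {
    "bin": " climate@tcat2 ",
    "query_type": "basic",
    "query": "cat AND dog",
    "daterange": (1609459200, 1625097599),  # unix timestamps from the date picker
}

# equivalent of the validate_query() body once the sanity checks have passed
query["min_date"], query["max_date"] = query.pop("daterange")
query["bin"] = query["bin"].strip()

print(query)
# {'bin': 'climate@tcat2', 'query_type': 'basic', 'query': 'cat AND dog',
#  'min_date': 1609459200, 'max_date': 1625097599}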
Inherited Members
- backend.lib.worker.BasicWorker
- BasicWorker
- INTERRUPT_NONE
- INTERRUPT_RETRY
- INTERRUPT_CANCEL
- queue
- log
- manager
- interrupted
- modules
- init_time
- name
- run
- clean_up
- request_interrupt
- run_interruptable_process
- get_queue_id
- is_4cat_class
- backend.lib.search.Search
- max_workers
- prefix
- return_cols
- import_error_count
- import_warning_count
- process
- search
- import_from_file
- items_to_csv
- items_to_ndjson
- items_to_archive
- backend.lib.processor.BasicProcessor
- db
- job
- dataset
- owner
- source_dataset
- source_file
- description
- category
- is_running_in_preset
- filepath
- for_cleanup
- work
- after_process
- clean_up_on_error
- abort
- iterate_proxied_requests
- push_proxied_request
- flush_proxied_requests
- unpack_archive_contents
- extract_archived_file_by_name
- write_csv_items_and_finish
- write_archive_and_finish
- create_standalone
- save_annotations
- map_item_method_available
- get_mapped_item
- is_filter
- get_status
- is_top_dataset
- is_from_collector
- get_extension
- is_rankable
- exclude_followup_processors
- is_4cat_processor