Edit on GitHub

datasources.dmi-tcatv2.search_tcat_v2

Twitter search within a DMI-TCAT bin (v2): direct database connection

  1"""
  2Twitter search within a DMI-TCAT bin v2
  3Direct database connection
  4"""
  5import datetime
  6import re
  7import ural
  8import pymysql
  9
 10
 11from backend.lib.search import Search
 12from common.lib.exceptions import QueryParametersException
 13from common.lib.user_input import UserInput
 14from backend.lib.database_mysql import MySQLDatabase
 15
 16
class SearchWithinTCATBinsV2(Search):
    """
    Get Tweets via DMI-TCAT

    This allows subsetting an existing query bin, similar to the 'Data
    Selection' panel in the DMI-TCAT analysis interface
    """
    # Job/worker identifier for this datasource
    type = "dmi-tcatv2-search"  # job ID
    # Result datasets are written as CSV
    extension = "csv"
    # Human-readable datasource title shown in the interface
    title = "TCAT Search (SQL)"

    # 4CAT configuration options for this datasource. Database credentials
    # for each TCAT instance must be provided as a JSON list of objects.
    config = {
        "dmi-tcatv2-search.database_instances": {
            "type": UserInput.OPTION_TEXT_JSON,
            "help": "DMI-TCAT instances",
            "tooltip": "List of DMI-TCAT instance metadata, e.g. [{'tcat_name': 'tcat2','db_name': 'twittercapture',"
                    "'db_user': 'username','db_password': 'password','db_host': '127.0.0.1','db_port': 3306}] All of "
                    "these values need to be provided for each instance. This needs to be formatted as a JSON list of "
                    "objects.",
            "default": {}
        }
    }
 39
 40    @classmethod
 41    def get_options(cls, parent_dataset=None, config=None):
 42        """
 43        Get data source options
 44
 45        This method takes the pre-defined options, but fills the 'bins' options
 46        with bins currently available from the configured TCAT instances.
 47
 48        :param config:
 49        :param DataSet parent_dataset:  An object representing the dataset that
 50        the processor would be run on can
 51        be used to show some options only to privileges users.
 52        """
 53        options = {
 54            "intro-1": {
 55                "type": UserInput.OPTION_INFO,
 56                "help": "This data source interfaces with a DMI-TCAT instance to allow subsetting of tweets from a tweet "
 57                        "bin in that instance."
 58            },
 59            "divider-1": {
 60                "type": UserInput.OPTION_DIVIDER
 61            },
 62            "bin": {
 63                "type": UserInput.OPTION_INFO,
 64                "help": "Query bin"
 65            },
 66            "query_type": {
 67                "type": UserInput.OPTION_CHOICE,
 68                "help": "Basic or Advanced Query",
 69                "options": {
 70                    "basic": "Basic query all bin tweets for specific text (Query) and date (Date range)",
 71                    "advanced": "Select queries on any TCAT twitter tables"
 72                },
 73                "default": "basic",
 74                "tooltip": "Advanced queries do not provide scaffolding, so understanding TCAT database structure is necessary"
 75            },
 76            "query": {
 77                "type": UserInput.OPTION_TEXT,
 78                "help": "Query text",
 79                "tooltip": "Match all tweets containing this text."
 80            },
 81            "daterange": {
 82                "type": UserInput.OPTION_DATERANGE,
 83                "help": "Date range"
 84            }
 85        }
 86
 87        # Collect Metadata from TCAT instances
 88        all_bins = cls.collect_tcat_metadata(config)
 89
 90        options["bin"] = {
 91            "type": UserInput.OPTION_CHOICE,
 92            "options": {},
 93            "help": "Query bin"
 94        }
 95
 96        for instance_name, bins in all_bins.items():
 97            for bin_name, bin in bins.items():
 98                bin_key = "%s@%s" % (bin_name, instance_name)
 99                display_text = f"{bin_name}: {bin.get('tweet_count')} tweets from {bin.get('first_tweet_datetime').strftime('%Y-%m-%d %H:%M:%S')} to {bin.get('last_tweet_datetime').strftime('%Y-%m-%d %H:%M:%S')}"
100                options["bin"]["options"][bin_key] = display_text
101
102        return options
103
    def get_items(self, query):
        """
        Use the DMI-TCAT tweet export to retrieve tweets

        :param query:
        :yield dict: mapped_tweet for any "basic" query else for "advanced" queries a dictionary with mysql result
        """
        # "bin" is formatted as "<bin name>@<instance name>" (see get_options)
        bin = self.parameters.get("bin")
        bin_name = bin.split("@")[0]
        # NOTE(review): .pop() takes the LAST "@"-separated segment; assumes
        # bin names themselves contain no "@" — confirm against TCAT naming
        tcat_name = bin.split("@").pop()

        # Look up the configured database credentials for this bin's instance
        # NOTE(review): raises IndexError if the instance has been removed
        # from the configuration since the query was created
        available_instances = self.config.get("dmi-tcatv2-search.database_instances", [])
        instance = [instance for instance in available_instances if instance.get('tcat_name') == tcat_name][0]

        db = MySQLDatabase(logger=self.log,
                           dbname=instance.get('db_name'),
                           user=instance.get('db_user'),
                           password=instance.get('db_password'),
                           host=instance.get('db_host'),
                           port=instance.get('db_port'))

        self.dataset.update_status("Searching for tweets on %s" % bin_name)
        if self.parameters.get("query_type") == 'advanced':
            # Advanced query should be simple from our perspective...
            # The user-supplied SQL is executed verbatim. A server-side
            # (unbuffered) cursor streams rows instead of loading the whole
            # result set into memory.
            self.dataset.log('Query: %s' % self.parameters.get("query"))
            unbuffered_cursor = db.connection.cursor(pymysql.cursors.SSCursor)
            try:
                unbuffered_cursor.execute(self.parameters.get("query"))
            except pymysql.err.ProgrammingError as e:
                # Invalid SQL from the user: surface the error and stop
                self.dataset.update_status("SQL query error: %s" % str(e), is_final=True)
                return
            # self.dataset.update_status("Retrieving %i results" % int(num_results)) # num_results is CLEARLY not what I thought
            # Get column names from cursor
            column_names = [description[0] for description in unbuffered_cursor.description]
            for result in unbuffered_cursor.fetchall_unbuffered():
                # Reformat result (which is a tuple with each column in the row) to dict
                new_result = {k: v for k, v in zip(column_names, result)}
                # 4CAT necessary fieldnames
                new_result['id'] = new_result.get('id', '')
                # thread_id: reply target if reply, else quoted tweet, else own id
                new_result['thread_id'] = new_result.get("in_reply_to_status_id") if new_result.get(
                    "in_reply_to_status_id") else new_result.get("quoted_status_id") if new_result.get(
                    "quoted_status_id") else new_result.get("id")
                new_result['body'] = new_result.get('text', '')
                new_result['timestamp'] = new_result.get('created_at', None)
                new_result['subject'] = ''
                new_result['author'] = new_result.get('from_user_name', '')
                yield new_result

        else:
            # "Basic" query
            text_query = self.parameters.get("query")

            # Build a WHERE clause by splitting the query text on literal
            # " AND " / " OR " tokens; each segment becomes a
            # `lower(text) LIKE %s` condition joined by the matched operator
            where = []
            replacements = []
            # Find AND and OR
            placeholder = 0      # index into text_query where the current segment starts
            start_of_match = 0   # operator offset relative to placeholder; -1 ends the loop
            while start_of_match >= 0:
                match = None
                and_match = text_query[placeholder:].find(' AND ')
                or_match = text_query[placeholder:].find(' OR ')
                if and_match != -1 and or_match != -1:
                    # both found; take whichever operator occurs first
                    if and_match < or_match:
                        # and match first
                        match = 'AND '
                        start_of_match = and_match
                    else:
                        # or match first
                        match ='OR '
                        start_of_match = or_match
                elif and_match != -1:
                    # and match only
                    match ='AND '
                    start_of_match = and_match
                elif or_match != -1:
                    # or match only
                    match = 'OR '
                    start_of_match = or_match
                else:
                    # neither
                    match = None
                    start_of_match = -1
                # Add partial query to where and replacements
                if match:
                    where.append('lower(text) LIKE %s ' + match)
                    replacements.append('%'+text_query[placeholder:placeholder+start_of_match].lower().strip()+'%')
                    # new start
                    # len(match) is one char short of the " AND "/" OR " token
                    # that was searched for; the leftover leading space in the
                    # next segment is removed by .strip() above
                    placeholder = placeholder + start_of_match + len(match)
                else:
                    # final (or only) segment
                    where.append('lower(text) LIKE %s')
                    replacements.append('%'+text_query[placeholder:].lower().strip()+'%')

            # Optional date bounds; min_date/max_date are unix timestamps set
            # by validate_query
            if query.get("min_date", None):
                try:
                    if int(query.get("min_date")) > 0:
                        where.append("AND created_at >= %s")
                        replacements.append(datetime.datetime.fromtimestamp(int(query.get("min_date"))))
                except ValueError:
                    pass

            if query.get("max_date", None):
                try:
                    if int(query.get("max_date")) > 0:
                        where.append("AND created_at < %s")
                        replacements.append(datetime.datetime.fromtimestamp(int(query.get("max_date"))))
                except ValueError:
                    pass

            where = " ".join(where)
            # NOTE(review): bin_name is interpolated directly into the table
            # name (placeholders cannot parameterize SQL identifiers); it
            # derives from the validated "bin" choice rather than free text
            query = 'SELECT * FROM ' + bin_name + '_tweets WHERE ' + where
            self.dataset.log('Query: %s' % query)
            self.dataset.log('Replacements: %s' % ', '.join([str(i) for i in replacements]))
            unbuffered_cursor = db.connection.cursor(pymysql.cursors.SSCursor)
            unbuffered_cursor.execute(query, replacements)
            # self.dataset.update_status("Retrieving %i results" % int(num_results)) # num_results is CLEARLY not what I thought
            column_names = [description[0] for description in unbuffered_cursor.description]
            for result in unbuffered_cursor.fetchall_unbuffered():
                new_result = {k: v for k, v in zip(column_names, result)}
                # Map tweet to 4CAT fields
                new_result = self.tweet_mapping(new_result)
                yield new_result
226
    @staticmethod
    def tweet_mapping(tweet):
        """
        Takes TCAT output from specific tables and maps them to 4CAT expected fields. The expected fields attempt to
        mirror that mapped_tweet from twitterv2 datasource.

        :param dict tweet: TCAT dict returned from query; expected to be from bin tweets table
        :return dict:
        """
        # NOTE(review): keys read with tweet[...] (rather than tweet.get())
        # raise KeyError when absent, so this assumes a complete row from the
        # bin's _tweets table — confirm before reusing with other tables
        mapped_tweet = {'id': tweet.get('id', ''),
                        # For thread_id, we use in_reply_to_status_id if tweet is reply, retweet_id if tweet is
                        # retweet, or its own ID
                        # Note: tweets can have BOTH in_reply_to_status_id and retweet_id as you can retweet a reply
                        # or reply to retweet.
                        # THIS IS DIFFERENT from Twitter APIv2 as there does not appear to be a quote ID (for retweets
                        # with added text)
                        'thread_id': tweet.get("in_reply_to_status_id") if tweet.get(
                                               "in_reply_to_status_id") else tweet.get("retweet_id") if tweet.get(
                                               "retweet_id") else tweet.get("id"),
                        'body': tweet.get('text', ''),
                        # 'created_at': tweet.get('created_at'),
                        # Unix timestamp if TCAT returned a datetime, else None
                        'timestamp': int(datetime.datetime.timestamp(tweet.get('created_at'))) if type(
                                                tweet.get('created_at')) is datetime.datetime else None,
                        'subject': '',
                        'author': tweet.get('from_user_name', ''),
                        "author_fullname": tweet["from_user_realname"],
                        "author_id": tweet["from_user_id"],
                        "source": tweet["source"],
                        "language_guess": tweet.get("lang"),

                        "retweet_count": tweet["retweet_count"],
                        "like_count": tweet["favorite_count"],
                        "is_retweet": "yes" if tweet.get('retweet_id', False) else "no",
                        "is_reply": "yes" if tweet["in_reply_to_status_id"] else "no",
                        "in_reply_to_status_id": tweet["in_reply_to_status_id"] if tweet["in_reply_to_status_id"] else None,
                        "reply_to": tweet["to_user_name"],
                        "reply_to_id": tweet.get('to_user_id') if tweet.get('to_user_id') else None,

                        # 4CAT specifics
                        # Hashtags/mentions are re-extracted from the tweet text
                        # rather than read from TCAT's hashtag tables
                        "hashtags": ",".join(re.findall(r"#([^\s!@#$%^&*()_+{}:\"|<>?\[\];'\,./`~]+)", tweet["text"])),
                        "urls": ",".join(ural.urls_from_text(tweet["text"])),
                        # NOTE(review): only matches a trailing t.co link, which is
                        # how Twitter appends media URLs — other URLs are ignored
                        "images": ",".join(re.findall(r"https://t\.co/[a-zA-Z0-9]+$", tweet["text"])),
                        "mentions": ",".join(re.findall(r"@([^\s!@#$%^&*()+{}:\"|<>?\[\];'\,./`~]+)", tweet["text"])),

                        # Additional TCAT data (compared to twitterv2 map_item function)
                        "filter_level": tweet['filter_level'],
                        'location': tweet['location'],
                        'latitude': tweet['geo_lat'] if tweet['geo_lat'] else None,
                        'longitude': tweet['geo_lng'] if tweet['geo_lng'] else None,
                        'author_verified': tweet['from_user_verified'] if tweet['from_user_verified'] else None,
                        'author_description': tweet['from_user_description'],
                        'author_url': tweet['from_user_url'],
                        'author_profile_image': tweet['from_user_profile_image_url'],
                        # TCAT stores the offset in seconds; convert to whole hours
                        'author_timezone_UTC_offset': int((int(tweet['from_user_utcoffset']) if
                                                           tweet['from_user_utcoffset'] else 0)/60/60),
                        'author_timezone_name': tweet['from_user_timezone'],
                        'author_language': tweet['from_user_lang'],
                        'author_tweet_count': tweet['from_user_tweetcount'],
                        'author_follower_count': tweet['from_user_followercount'],
                        'author_friend_following_count': tweet['from_user_friendcount'],
                        'author_favorite_count': tweet.get('from_user_favourites_count'), # NOT in tweets table?
                        'author_listed_count': tweet['from_user_listed'] if tweet['from_user_listed'] else None,
                        'author_withheld_scope': tweet.get('from_user_withheld_scope'), # NOT in tweets table?
                        'author_created_at': tweet.get('from_user_created_at'), # NOT in tweets table?

                        # TODO find in other TCAT tables or does not exist
                        # "possibly_sensitive": "yes" if tweet.get("possibly_sensitive") not in ("", "0") else "no",
                        # "is_quote_tweet": "yes" if tweet["quoted_status_id"] else "no",
                        # 'withheld_copyright': tweet['withheld_copyright'],  # TCAT may no collect this anymore
                        # 'withheld_scope': tweet['withheld_scope'], # TCAT may no collect this anymore
                        # 'truncated': tweet['truncated'], # Older tweets could be truncated meaning their text was cut off due to Twitter/TCAT db character limits

                        }

        # Ensure that any keys not specifically mapped to another field are added to the new mapped_tweet
        mapped_keys = ['id', 'text', 'created_at', 'from_user_name', 'from_user_realname', 'from_user_id',
                       'from_user_lang', 'from_user_tweetcount', 'from_user_followercount', 'from_user_friendcount',
                       'from_user_listed', 'from_user_utcoffset', 'from_user_timezone', 'from_user_description',
                       'from_user_url', 'from_user_verified', 'from_user_profile_image_url', 'source', 'lang',
                       'filter_level', 'location', 'to_user_name', 'to_user_id', 'geo_lat', 'geo_lng', 'retweet_count',
                       'in_reply_to_status_id']
        for key in tweet.keys():
            if key not in mapped_keys:
                index = ''
                # NOTE(review): on a name collision this appends "_1"
                # repeatedly ("_1", "_1_1", ...) rather than counting upwards
                while key + index in mapped_tweet.keys():
                    index += '_1'
                mapped_tweet[key + index] = tweet.get(key)

        return mapped_tweet
316
317
318    @classmethod
319    def collect_tcat_metadata(cls, config):
320        """
321        Collect specific metadata from TCAT instances listed in the configuration and return a dictionary containing
322        this data. To be used to infor the user of available TCAT bins and create the options from which a user will
323        select.
324
325        :return dict: All of the available bins from accessible TCAT instances
326        """
327
328        # todo: cache this somehow! and check for the cache
329        instances = config.get("dmi-tcatv2.instances", [])
330
331        all_bins = {}
332        for instance in instances:
333            # Query each instance for bins
334            db = MySQLDatabase(logger=None,
335                               dbname=instance.get('db_name'),
336                               user=instance.get('db_user'),
337                               password=instance.get('db_password'),
338                               host=instance.get('db_host'),
339                               port=instance.get('db_port'))
340            # try:
341            instance_bins = db.fetchall('SELECT id, querybin, type from tcat_query_bins')
342            # except:
343
344            instance_bins_metadata = {}
345            for instance_bin in instance_bins:
346                bin_data = {
347                    'instance': instance,
348                    'querybin': instance_bin['querybin'],
349                }
350
351                # Query for number of tweets
352                tweet_count = db.fetchone('SELECT COUNT(id) from ' + instance_bin['querybin'] + '_tweets')[
353                    'COUNT(id)']
354                bin_data['tweet_count'] = tweet_count
355
356                # Collect first and last tweet datetimes
357                first_tweet_datetime = db.fetchone('SELECT created_at from ' + instance_bin['querybin'] + '_tweets ORDER BY created_at ASC LIMIT 1')['created_at']
358                last_tweet_datetime = db.fetchone('SELECT created_at from ' + instance_bin['querybin'] + '_tweets ORDER BY created_at DESC LIMIT 1')['created_at']
359                bin_data['first_tweet_datetime'] = first_tweet_datetime
360                bin_data['last_tweet_datetime'] = last_tweet_datetime
361
362                # Could check if bin currently should be collecting
363                # db.fetchall('SELECT EXISTS ( SELECT endtime from tcat_query_bins_periods WHERE querybin_id = ' + str(instance_bin['id']) + ' and endtime = "0000-00-00 00:00:00" ) as active')
364
365                # Could collect phrases or usernames...
366                # if instance_bin['type'] in []:
367                # elif instance_bin['type'] in []:
368
369                # Could collect all periods for nuanced metadata...
370                #periods = db.fetchall('SELECT starttime, endtime from tcat_query_bins_periods WHERE query_bin_id = ' + instance_bin['id'])
371
372                # Add bin_data to instance collection
373                instance_bins_metadata[instance_bin['querybin']] = bin_data
374            # Add bins to instance
375            all_bins[instance['tcat_name']] = instance_bins_metadata
376
377        return all_bins
378
379    @staticmethod
380    def validate_query(query, request, config):
381        """
382        Validate DMI-TCAT query input
383
384        :param dict query:  Query parameters, from client-side.
385        :param request:  Flask request
386        :param ConfigManager|None config:  Configuration reader (context-aware)
387        :return dict:  Safe query parameters
388        """
389        # no query 4 u
390        if not query.get("bin", ""):
391            raise QueryParametersException("You must choose a query bin to get tweets from.")
392
393        # the dates need to make sense as a range to search within
394        # and a date range is needed, to not make it too easy to just get all tweets
395        after, before = query.get("daterange")
396        if (after and before) and not before <= after:
397            raise QueryParametersException("A date range must start before it ends")
398
399        query["min_date"], query["max_date"] = query.get("daterange")
400        query["bin"] = query.get("bin", "").strip()
401        del query["daterange"]
402
403        # simple!
404        return query
class SearchWithinTCATBinsV2(backend.lib.search.Search):
 18class SearchWithinTCATBinsV2(Search):
 19    """
 20    Get Tweets via DMI-TCAT
 21
 22    This allows subsetting an existing query bin, similar to the 'Data
 23    Selection' panel in the DMI-TCAT analysis interface
 24    """
 25    type = "dmi-tcatv2-search"  # job ID
 26    extension = "csv"
 27    title = "TCAT Search (SQL)"
 28
 29    config = {
 30        "dmi-tcatv2-search.database_instances": {
 31            "type": UserInput.OPTION_TEXT_JSON,
 32            "help": "DMI-TCAT instances",
 33            "tooltip": "List of DMI-TCAT instance metadata, e.g. [{'tcat_name': 'tcat2','db_name': 'twittercapture',"
 34                    "'db_user': 'username','db_password': 'password','db_host': '127.0.0.1','db_port': 3306}] All of "
 35                    "these values need to be provided for each instance. This needs to be formatted as a JSON list of "
 36                    "objects.",
 37            "default": {}
 38        }
 39    }
 40
 41    @classmethod
 42    def get_options(cls, parent_dataset=None, config=None):
 43        """
 44        Get data source options
 45
 46        This method takes the pre-defined options, but fills the 'bins' options
 47        with bins currently available from the configured TCAT instances.
 48
 49        :param config:
 50        :param DataSet parent_dataset:  An object representing the dataset that
 51        the processor would be run on can
 52        be used to show some options only to privileges users.
 53        """
 54        options = {
 55            "intro-1": {
 56                "type": UserInput.OPTION_INFO,
 57                "help": "This data source interfaces with a DMI-TCAT instance to allow subsetting of tweets from a tweet "
 58                        "bin in that instance."
 59            },
 60            "divider-1": {
 61                "type": UserInput.OPTION_DIVIDER
 62            },
 63            "bin": {
 64                "type": UserInput.OPTION_INFO,
 65                "help": "Query bin"
 66            },
 67            "query_type": {
 68                "type": UserInput.OPTION_CHOICE,
 69                "help": "Basic or Advanced Query",
 70                "options": {
 71                    "basic": "Basic query all bin tweets for specific text (Query) and date (Date range)",
 72                    "advanced": "Select queries on any TCAT twitter tables"
 73                },
 74                "default": "basic",
 75                "tooltip": "Advanced queries do not provide scaffolding, so understanding TCAT database structure is necessary"
 76            },
 77            "query": {
 78                "type": UserInput.OPTION_TEXT,
 79                "help": "Query text",
 80                "tooltip": "Match all tweets containing this text."
 81            },
 82            "daterange": {
 83                "type": UserInput.OPTION_DATERANGE,
 84                "help": "Date range"
 85            }
 86        }
 87
 88        # Collect Metadata from TCAT instances
 89        all_bins = cls.collect_tcat_metadata(config)
 90
 91        options["bin"] = {
 92            "type": UserInput.OPTION_CHOICE,
 93            "options": {},
 94            "help": "Query bin"
 95        }
 96
 97        for instance_name, bins in all_bins.items():
 98            for bin_name, bin in bins.items():
 99                bin_key = "%s@%s" % (bin_name, instance_name)
100                display_text = f"{bin_name}: {bin.get('tweet_count')} tweets from {bin.get('first_tweet_datetime').strftime('%Y-%m-%d %H:%M:%S')} to {bin.get('last_tweet_datetime').strftime('%Y-%m-%d %H:%M:%S')}"
101                options["bin"]["options"][bin_key] = display_text
102
103        return options
104
105    def get_items(self, query):
106        """
107        Use the DMI-TCAT tweet export to retrieve tweets
108
109        :param query:
110        :yield dict: mapped_tweet for any "basic" query else for "advanced" queries a dictionary with mysql result
111        """
112        bin = self.parameters.get("bin")
113        bin_name = bin.split("@")[0]
114        tcat_name = bin.split("@").pop()
115
116        available_instances = self.config.get("dmi-tcatv2-search.database_instances", [])
117        instance = [instance for instance in available_instances if instance.get('tcat_name') == tcat_name][0]
118
119        db = MySQLDatabase(logger=self.log,
120                           dbname=instance.get('db_name'),
121                           user=instance.get('db_user'),
122                           password=instance.get('db_password'),
123                           host=instance.get('db_host'),
124                           port=instance.get('db_port'))
125
126        self.dataset.update_status("Searching for tweets on %s" % bin_name)
127        if self.parameters.get("query_type") == 'advanced':
128            # Advanced query should be simple from our perspective...
129            self.dataset.log('Query: %s' % self.parameters.get("query"))
130            unbuffered_cursor = db.connection.cursor(pymysql.cursors.SSCursor)
131            try:
132                unbuffered_cursor.execute(self.parameters.get("query"))
133            except pymysql.err.ProgrammingError as e:
134                self.dataset.update_status("SQL query error: %s" % str(e), is_final=True)
135                return
136            # self.dataset.update_status("Retrieving %i results" % int(num_results)) # num_results is CLEARLY not what I thought
137            # Get column names from cursor
138            column_names = [description[0] for description in unbuffered_cursor.description]
139            for result in unbuffered_cursor.fetchall_unbuffered():
140                # Reformat result (which is a tuple with each column in the row) to dict
141                new_result = {k: v for k, v in zip(column_names, result)}
142                # 4CAT necessary fieldnames
143                new_result['id'] = new_result.get('id', '')
144                new_result['thread_id'] = new_result.get("in_reply_to_status_id") if new_result.get(
145                    "in_reply_to_status_id") else new_result.get("quoted_status_id") if new_result.get(
146                    "quoted_status_id") else new_result.get("id")
147                new_result['body'] = new_result.get('text', '')
148                new_result['timestamp'] = new_result.get('created_at', None)
149                new_result['subject'] = ''
150                new_result['author'] = new_result.get('from_user_name', '')
151                yield new_result
152
153        else:
154            # "Basic" query
155            text_query = self.parameters.get("query")
156
157            where = []
158            replacements = []
159            # Find AND and OR
160            placeholder = 0
161            start_of_match = 0
162            while start_of_match >= 0:
163                match = None
164                and_match = text_query[placeholder:].find(' AND ')
165                or_match = text_query[placeholder:].find(' OR ')
166                if and_match != -1 and or_match != -1:
167                    # both found
168                    if and_match < or_match:
169                        # and match first
170                        match = 'AND '
171                        start_of_match = and_match
172                    else:
173                        # or match first
174                        match ='OR '
175                        start_of_match = or_match
176                elif and_match != -1:
177                    # and match only
178                    match ='AND '
179                    start_of_match = and_match
180                elif or_match != -1:
181                    # or match only
182                    match = 'OR '
183                    start_of_match = or_match
184                else:
185                    # neither
186                    match = None
187                    start_of_match = -1
188                # Add partial query to where and replacements
189                if match:
190                    where.append('lower(text) LIKE %s ' + match)
191                    replacements.append('%'+text_query[placeholder:placeholder+start_of_match].lower().strip()+'%')
192                    # new start
193                    placeholder = placeholder + start_of_match + len(match)
194                else:
195                    where.append('lower(text) LIKE %s')
196                    replacements.append('%'+text_query[placeholder:].lower().strip()+'%')
197
198            if query.get("min_date", None):
199                try:
200                    if int(query.get("min_date")) > 0:
201                        where.append("AND created_at >= %s")
202                        replacements.append(datetime.datetime.fromtimestamp(int(query.get("min_date"))))
203                except ValueError:
204                    pass
205
206            if query.get("max_date", None):
207                try:
208                    if int(query.get("max_date")) > 0:
209                        where.append("AND created_at < %s")
210                        replacements.append(datetime.datetime.fromtimestamp(int(query.get("max_date"))))
211                except ValueError:
212                    pass
213
214            where = " ".join(where)
215            query = 'SELECT * FROM ' + bin_name + '_tweets WHERE ' + where
216            self.dataset.log('Query: %s' % query)
217            self.dataset.log('Replacements: %s' % ', '.join([str(i) for i in replacements]))
218            unbuffered_cursor = db.connection.cursor(pymysql.cursors.SSCursor)
219            unbuffered_cursor.execute(query, replacements)
220            # self.dataset.update_status("Retrieving %i results" % int(num_results)) # num_results is CLEARLY not what I thought
221            column_names = [description[0] for description in unbuffered_cursor.description]
222            for result in unbuffered_cursor.fetchall_unbuffered():
223                new_result = {k: v for k, v in zip(column_names, result)}
224                # Map tweet to 4CAT fields
225                new_result = self.tweet_mapping(new_result)
226                yield new_result
227
228    @staticmethod
229    def tweet_mapping(tweet):
230        """
231        Takes TCAT output from specific tables and maps them to 4CAT expected fields. The expected fields attempt to
232        mirror that mapped_tweet from twitterv2 datasource.
233
234        :param dict tweet: TCAT dict returned from query; expected to be from bin tweets table
235        :return dict:
236        """
237        mapped_tweet = {'id': tweet.get('id', ''),
238                        # For thread_id, we use in_reply_to_status_id if tweet is reply, retweet_id if tweet is
239                        # retweet, or its own ID
240                        # Note: tweets can have BOTH in_reply_to_status_id and retweet_id as you can retweet a reply
241                        # or reply to retweet.
242                        # THIS IS DIFFERENT from Twitter APIv2 as there does not appear to be a quote ID (for retweets
243                        # with added text)
244                        'thread_id': tweet.get("in_reply_to_status_id") if tweet.get(
245                                               "in_reply_to_status_id") else tweet.get("retweet_id") if tweet.get(
246                                               "retweet_id") else tweet.get("id"),
247                        'body': tweet.get('text', ''),
248                        # 'created_at': tweet.get('created_at'),
249                        'timestamp': int(datetime.datetime.timestamp(tweet.get('created_at'))) if type(
250                                                tweet.get('created_at')) is datetime.datetime else None,
251                        'subject': '',
252                        'author': tweet.get('from_user_name', ''),
253                        "author_fullname": tweet["from_user_realname"],
254                        "author_id": tweet["from_user_id"],
255                        "source": tweet["source"],
256                        "language_guess": tweet.get("lang"),
257
258                        "retweet_count": tweet["retweet_count"],
259                        "like_count": tweet["favorite_count"],
260                        "is_retweet": "yes" if tweet.get('retweet_id', False) else "no",
261                        "is_reply": "yes" if tweet["in_reply_to_status_id"] else "no",
262                        "in_reply_to_status_id": tweet["in_reply_to_status_id"] if tweet["in_reply_to_status_id"] else None,
263                        "reply_to": tweet["to_user_name"],
264                        "reply_to_id": tweet.get('to_user_id') if tweet.get('to_user_id') else None,
265
266                        # 4CAT specifics
267                        "hashtags": ",".join(re.findall(r"#([^\s!@#$%^&*()_+{}:\"|<>?\[\];'\,./`~]+)", tweet["text"])),
268                        "urls": ",".join(ural.urls_from_text(tweet["text"])),
269                        "images": ",".join(re.findall(r"https://t\.co/[a-zA-Z0-9]+$", tweet["text"])),
270                        "mentions": ",".join(re.findall(r"@([^\s!@#$%^&*()+{}:\"|<>?\[\];'\,./`~]+)", tweet["text"])),
271
272                        # Additional TCAT data (compared to twitterv2 map_item function)
273                        "filter_level": tweet['filter_level'],
274                        'location': tweet['location'],
275                        'latitude': tweet['geo_lat'] if tweet['geo_lat'] else None,
276                        'longitude': tweet['geo_lng'] if tweet['geo_lng'] else None,
277                        'author_verified': tweet['from_user_verified'] if tweet['from_user_verified'] else None,
278                        'author_description': tweet['from_user_description'],
279                        'author_url': tweet['from_user_url'],
280                        'author_profile_image': tweet['from_user_profile_image_url'],
281                        'author_timezone_UTC_offset': int((int(tweet['from_user_utcoffset']) if
282                                                           tweet['from_user_utcoffset'] else 0)/60/60),
283                        'author_timezone_name': tweet['from_user_timezone'],
284                        'author_language': tweet['from_user_lang'],
285                        'author_tweet_count': tweet['from_user_tweetcount'],
286                        'author_follower_count': tweet['from_user_followercount'],
287                        'author_friend_following_count': tweet['from_user_friendcount'],
288                        'author_favorite_count': tweet.get('from_user_favourites_count'), # NOT in tweets table?
289                        'author_listed_count': tweet['from_user_listed'] if tweet['from_user_listed'] else None,
290                        'author_withheld_scope': tweet.get('from_user_withheld_scope'), # NOT in tweets table?
291                        'author_created_at': tweet.get('from_user_created_at'), # NOT in tweets table?
292
293                        # TODO find in other TCAT tables or does not exist
294                        # "possibly_sensitive": "yes" if tweet.get("possibly_sensitive") not in ("", "0") else "no",
295                        # "is_quote_tweet": "yes" if tweet["quoted_status_id"] else "no",
296                        # 'withheld_copyright': tweet['withheld_copyright'],  # TCAT may no collect this anymore
297                        # 'withheld_scope': tweet['withheld_scope'], # TCAT may no collect this anymore
298                        # 'truncated': tweet['truncated'], # Older tweets could be truncated meaning their text was cut off due to Twitter/TCAT db character limits
299
300                        }
301
302        # Ensure that any keys not specifically mapped to another field are added to the new mapped_tweet
303        mapped_keys = ['id', 'text', 'created_at', 'from_user_name', 'from_user_realname', 'from_user_id',
304                       'from_user_lang', 'from_user_tweetcount', 'from_user_followercount', 'from_user_friendcount',
305                       'from_user_listed', 'from_user_utcoffset', 'from_user_timezone', 'from_user_description',
306                       'from_user_url', 'from_user_verified', 'from_user_profile_image_url', 'source', 'lang',
307                       'filter_level', 'location', 'to_user_name', 'to_user_id', 'geo_lat', 'geo_lng', 'retweet_count',
308                       'in_reply_to_status_id']
309        for key in tweet.keys():
310            if key not in mapped_keys:
311                index = ''
312                while key + index in mapped_tweet.keys():
313                    index += '_1'
314                mapped_tweet[key + index] = tweet.get(key)
315
316        return mapped_tweet
317
318
319    @classmethod
320    def collect_tcat_metadata(cls, config):
321        """
322        Collect specific metadata from TCAT instances listed in the configuration and return a dictionary containing
323        this data. To be used to infor the user of available TCAT bins and create the options from which a user will
324        select.
325
326        :return dict: All of the available bins from accessible TCAT instances
327        """
328
329        # todo: cache this somehow! and check for the cache
330        instances = config.get("dmi-tcatv2.instances", [])
331
332        all_bins = {}
333        for instance in instances:
334            # Query each instance for bins
335            db = MySQLDatabase(logger=None,
336                               dbname=instance.get('db_name'),
337                               user=instance.get('db_user'),
338                               password=instance.get('db_password'),
339                               host=instance.get('db_host'),
340                               port=instance.get('db_port'))
341            # try:
342            instance_bins = db.fetchall('SELECT id, querybin, type from tcat_query_bins')
343            # except:
344
345            instance_bins_metadata = {}
346            for instance_bin in instance_bins:
347                bin_data = {
348                    'instance': instance,
349                    'querybin': instance_bin['querybin'],
350                }
351
352                # Query for number of tweets
353                tweet_count = db.fetchone('SELECT COUNT(id) from ' + instance_bin['querybin'] + '_tweets')[
354                    'COUNT(id)']
355                bin_data['tweet_count'] = tweet_count
356
357                # Collect first and last tweet datetimes
358                first_tweet_datetime = db.fetchone('SELECT created_at from ' + instance_bin['querybin'] + '_tweets ORDER BY created_at ASC LIMIT 1')['created_at']
359                last_tweet_datetime = db.fetchone('SELECT created_at from ' + instance_bin['querybin'] + '_tweets ORDER BY created_at DESC LIMIT 1')['created_at']
360                bin_data['first_tweet_datetime'] = first_tweet_datetime
361                bin_data['last_tweet_datetime'] = last_tweet_datetime
362
363                # Could check if bin currently should be collecting
364                # db.fetchall('SELECT EXISTS ( SELECT endtime from tcat_query_bins_periods WHERE querybin_id = ' + str(instance_bin['id']) + ' and endtime = "0000-00-00 00:00:00" ) as active')
365
366                # Could collect phrases or usernames...
367                # if instance_bin['type'] in []:
368                # elif instance_bin['type'] in []:
369
370                # Could collect all periods for nuanced metadata...
371                #periods = db.fetchall('SELECT starttime, endtime from tcat_query_bins_periods WHERE query_bin_id = ' + instance_bin['id'])
372
373                # Add bin_data to instance collection
374                instance_bins_metadata[instance_bin['querybin']] = bin_data
375            # Add bins to instance
376            all_bins[instance['tcat_name']] = instance_bins_metadata
377
378        return all_bins
379
380    @staticmethod
381    def validate_query(query, request, config):
382        """
383        Validate DMI-TCAT query input
384
385        :param dict query:  Query parameters, from client-side.
386        :param request:  Flask request
387        :param ConfigManager|None config:  Configuration reader (context-aware)
388        :return dict:  Safe query parameters
389        """
390        # no query 4 u
391        if not query.get("bin", ""):
392            raise QueryParametersException("You must choose a query bin to get tweets from.")
393
394        # the dates need to make sense as a range to search within
395        # and a date range is needed, to not make it too easy to just get all tweets
396        after, before = query.get("daterange")
397        if (after and before) and not before <= after:
398            raise QueryParametersException("A date range must start before it ends")
399
400        query["min_date"], query["max_date"] = query.get("daterange")
401        query["bin"] = query.get("bin", "").strip()
402        del query["daterange"]
403
404        # simple!
405        return query

Get Tweets via DMI-TCAT

This allows subsetting an existing query bin, similar to the 'Data Selection' panel in the DMI-TCAT analysis interface

type = 'dmi-tcatv2-search'
extension = 'csv'
title = 'TCAT Search (SQL)'
config = {'dmi-tcatv2-search.database_instances': {'type': 'json', 'help': 'DMI-TCAT instances', 'tooltip': "List of DMI-TCAT instance metadata, e.g. [{'tcat_name': 'tcat2','db_name': 'twittercapture','db_user': 'username','db_password': 'password','db_host': '127.0.0.1','db_port': 3306}] All of these values need to be provided for each instance. This needs to be formatted as a JSON list of objects.", 'default': {}}}
@classmethod
def get_options(cls, parent_dataset=None, config=None):
 41    @classmethod
 42    def get_options(cls, parent_dataset=None, config=None):
 43        """
 44        Get data source options
 45
 46        This method takes the pre-defined options, but fills the 'bins' options
 47        with bins currently available from the configured TCAT instances.
 48
 49        :param config:
 50        :param DataSet parent_dataset:  An object representing the dataset that
 51        the processor would be run on can
 52        be used to show some options only to privileges users.
 53        """
 54        options = {
 55            "intro-1": {
 56                "type": UserInput.OPTION_INFO,
 57                "help": "This data source interfaces with a DMI-TCAT instance to allow subsetting of tweets from a tweet "
 58                        "bin in that instance."
 59            },
 60            "divider-1": {
 61                "type": UserInput.OPTION_DIVIDER
 62            },
 63            "bin": {
 64                "type": UserInput.OPTION_INFO,
 65                "help": "Query bin"
 66            },
 67            "query_type": {
 68                "type": UserInput.OPTION_CHOICE,
 69                "help": "Basic or Advanced Query",
 70                "options": {
 71                    "basic": "Basic query all bin tweets for specific text (Query) and date (Date range)",
 72                    "advanced": "Select queries on any TCAT twitter tables"
 73                },
 74                "default": "basic",
 75                "tooltip": "Advanced queries do not provide scaffolding, so understanding TCAT database structure is necessary"
 76            },
 77            "query": {
 78                "type": UserInput.OPTION_TEXT,
 79                "help": "Query text",
 80                "tooltip": "Match all tweets containing this text."
 81            },
 82            "daterange": {
 83                "type": UserInput.OPTION_DATERANGE,
 84                "help": "Date range"
 85            }
 86        }
 87
 88        # Collect Metadata from TCAT instances
 89        all_bins = cls.collect_tcat_metadata(config)
 90
 91        options["bin"] = {
 92            "type": UserInput.OPTION_CHOICE,
 93            "options": {},
 94            "help": "Query bin"
 95        }
 96
 97        for instance_name, bins in all_bins.items():
 98            for bin_name, bin in bins.items():
 99                bin_key = "%s@%s" % (bin_name, instance_name)
100                display_text = f"{bin_name}: {bin.get('tweet_count')} tweets from {bin.get('first_tweet_datetime').strftime('%Y-%m-%d %H:%M:%S')} to {bin.get('last_tweet_datetime').strftime('%Y-%m-%d %H:%M:%S')}"
101                options["bin"]["options"][bin_key] = display_text
102
103        return options

Get data source options

This method takes the pre-defined options, but fills the 'bins' options with bins currently available from the configured TCAT instances.

Parameters
  • config:
  • DataSet parent_dataset: An object representing the dataset that the processor would be run on; can be used to show some options only to privileged users.
def get_items(self, query):
    def get_items(self, query):
        """
        Use the DMI-TCAT tweet export to retrieve tweets

        Connects directly to the MySQL database of the TCAT instance holding
        the selected bin and streams results through an unbuffered cursor.

        :param query:  Query parameters, as produced by validate_query()
        :yield dict: mapped_tweet for any "basic" query else for "advanced" queries a dictionary with mysql result
        """
        # the chosen bin is encoded as "<bin name>@<instance name>"
        bin = self.parameters.get("bin")
        bin_name = bin.split("@")[0]
        tcat_name = bin.split("@").pop()

        # look up connection details for the instance this bin lives on;
        # assumes exactly one configured instance matches tcat_name — an
        # unknown name raises IndexError here
        available_instances = self.config.get("dmi-tcatv2-search.database_instances", [])
        instance = [instance for instance in available_instances if instance.get('tcat_name') == tcat_name][0]

        db = MySQLDatabase(logger=self.log,
                           dbname=instance.get('db_name'),
                           user=instance.get('db_user'),
                           password=instance.get('db_password'),
                           host=instance.get('db_host'),
                           port=instance.get('db_port'))

        self.dataset.update_status("Searching for tweets on %s" % bin_name)
        if self.parameters.get("query_type") == 'advanced':
            # Advanced query should be simple from our perspective: run the
            # user-supplied SQL verbatim
            self.dataset.log('Query: %s' % self.parameters.get("query"))
            # SSCursor streams rows instead of buffering the whole result set
            unbuffered_cursor = db.connection.cursor(pymysql.cursors.SSCursor)
            try:
                unbuffered_cursor.execute(self.parameters.get("query"))
            except pymysql.err.ProgrammingError as e:
                # surface malformed SQL to the user and stop
                self.dataset.update_status("SQL query error: %s" % str(e), is_final=True)
                return
            # self.dataset.update_status("Retrieving %i results" % int(num_results)) # num_results is CLEARLY not what I thought
            # Get column names from cursor
            column_names = [description[0] for description in unbuffered_cursor.description]
            for result in unbuffered_cursor.fetchall_unbuffered():
                # Reformat result (which is a tuple with each column in the row) to dict
                new_result = {k: v for k, v in zip(column_names, result)}
                # 4CAT necessary fieldnames
                new_result['id'] = new_result.get('id', '')
                # NOTE(review): this branch falls back on quoted_status_id for
                # thread_id whereas tweet_mapping (basic branch) uses
                # retweet_id — confirm the difference is intended
                new_result['thread_id'] = new_result.get("in_reply_to_status_id") if new_result.get(
                    "in_reply_to_status_id") else new_result.get("quoted_status_id") if new_result.get(
                    "quoted_status_id") else new_result.get("id")
                new_result['body'] = new_result.get('text', '')
                new_result['timestamp'] = new_result.get('created_at', None)
                new_result['subject'] = ''
                new_result['author'] = new_result.get('from_user_name', '')
                yield new_result

        else:
            # "Basic" query: split the query text on ' AND ' / ' OR '
            # separators and turn each fragment into a LIKE clause
            text_query = self.parameters.get("query")

            where = []
            replacements = []
            # Find AND and OR
            placeholder = 0  # absolute offset of the not-yet-parsed remainder
            start_of_match = 0  # offset of next separator, relative to placeholder
            while start_of_match >= 0:
                match = None
                and_match = text_query[placeholder:].find(' AND ')
                or_match = text_query[placeholder:].find(' OR ')
                if and_match != -1 and or_match != -1:
                    # both found; take whichever occurs first
                    if and_match < or_match:
                        # and match first
                        match = 'AND '
                        start_of_match = and_match
                    else:
                        # or match first
                        match ='OR '
                        start_of_match = or_match
                elif and_match != -1:
                    # and match only
                    match ='AND '
                    start_of_match = and_match
                elif or_match != -1:
                    # or match only
                    match = 'OR '
                    start_of_match = or_match
                else:
                    # neither found; this is the final fragment and the loop
                    # ends after appending it below
                    match = None
                    start_of_match = -1
                # Add partial query to where and replacements
                if match:
                    where.append('lower(text) LIKE %s ' + match)
                    replacements.append('%'+text_query[placeholder:placeholder+start_of_match].lower().strip()+'%')
                    # advance past the separator; the leftover leading space is
                    # removed by .strip() on the next fragment
                    placeholder = placeholder + start_of_match + len(match)
                else:
                    where.append('lower(text) LIKE %s')
                    replacements.append('%'+text_query[placeholder:].lower().strip()+'%')

            # optional date bounds; these clauses carry their own leading
            # "AND" and rely on the LIKE clause(s) above always existing
            if query.get("min_date", None):
                try:
                    if int(query.get("min_date")) > 0:
                        where.append("AND created_at >= %s")
                        replacements.append(datetime.datetime.fromtimestamp(int(query.get("min_date"))))
                except ValueError:
                    pass

            if query.get("max_date", None):
                try:
                    if int(query.get("max_date")) > 0:
                        where.append("AND created_at < %s")
                        replacements.append(datetime.datetime.fromtimestamp(int(query.get("max_date"))))
                except ValueError:
                    pass

            where = " ".join(where)
            # NOTE(review): bin_name is interpolated into the SQL directly; it
            # comes from the user-picked "bin" parameter — confirm it can only
            # take values offered by get_options
            query = 'SELECT * FROM ' + bin_name + '_tweets WHERE ' + where
            self.dataset.log('Query: %s' % query)
            self.dataset.log('Replacements: %s' % ', '.join([str(i) for i in replacements]))
            unbuffered_cursor = db.connection.cursor(pymysql.cursors.SSCursor)
            unbuffered_cursor.execute(query, replacements)
            # self.dataset.update_status("Retrieving %i results" % int(num_results)) # num_results is CLEARLY not what I thought
            column_names = [description[0] for description in unbuffered_cursor.description]
            for result in unbuffered_cursor.fetchall_unbuffered():
                new_result = {k: v for k, v in zip(column_names, result)}
                # Map tweet to 4CAT fields
                new_result = self.tweet_mapping(new_result)
                yield new_result

Use the DMI-TCAT tweet export to retrieve tweets

Parameters
  • query: Query parameters.
Yields
  dict: mapped_tweet for any "basic" query; for "advanced" queries, a dictionary with the mysql result.
@staticmethod
def tweet_mapping(tweet):
228    @staticmethod
229    def tweet_mapping(tweet):
230        """
231        Takes TCAT output from specific tables and maps them to 4CAT expected fields. The expected fields attempt to
232        mirror that mapped_tweet from twitterv2 datasource.
233
234        :param dict tweet: TCAT dict returned from query; expected to be from bin tweets table
235        :return dict:
236        """
237        mapped_tweet = {'id': tweet.get('id', ''),
238                        # For thread_id, we use in_reply_to_status_id if tweet is reply, retweet_id if tweet is
239                        # retweet, or its own ID
240                        # Note: tweets can have BOTH in_reply_to_status_id and retweet_id as you can retweet a reply
241                        # or reply to retweet.
242                        # THIS IS DIFFERENT from Twitter APIv2 as there does not appear to be a quote ID (for retweets
243                        # with added text)
244                        'thread_id': tweet.get("in_reply_to_status_id") if tweet.get(
245                                               "in_reply_to_status_id") else tweet.get("retweet_id") if tweet.get(
246                                               "retweet_id") else tweet.get("id"),
247                        'body': tweet.get('text', ''),
248                        # 'created_at': tweet.get('created_at'),
249                        'timestamp': int(datetime.datetime.timestamp(tweet.get('created_at'))) if type(
250                                                tweet.get('created_at')) is datetime.datetime else None,
251                        'subject': '',
252                        'author': tweet.get('from_user_name', ''),
253                        "author_fullname": tweet["from_user_realname"],
254                        "author_id": tweet["from_user_id"],
255                        "source": tweet["source"],
256                        "language_guess": tweet.get("lang"),
257
258                        "retweet_count": tweet["retweet_count"],
259                        "like_count": tweet["favorite_count"],
260                        "is_retweet": "yes" if tweet.get('retweet_id', False) else "no",
261                        "is_reply": "yes" if tweet["in_reply_to_status_id"] else "no",
262                        "in_reply_to_status_id": tweet["in_reply_to_status_id"] if tweet["in_reply_to_status_id"] else None,
263                        "reply_to": tweet["to_user_name"],
264                        "reply_to_id": tweet.get('to_user_id') if tweet.get('to_user_id') else None,
265
266                        # 4CAT specifics
267                        "hashtags": ",".join(re.findall(r"#([^\s!@#$%^&*()_+{}:\"|<>?\[\];'\,./`~]+)", tweet["text"])),
268                        "urls": ",".join(ural.urls_from_text(tweet["text"])),
269                        "images": ",".join(re.findall(r"https://t\.co/[a-zA-Z0-9]+$", tweet["text"])),
270                        "mentions": ",".join(re.findall(r"@([^\s!@#$%^&*()+{}:\"|<>?\[\];'\,./`~]+)", tweet["text"])),
271
272                        # Additional TCAT data (compared to twitterv2 map_item function)
273                        "filter_level": tweet['filter_level'],
274                        'location': tweet['location'],
275                        'latitude': tweet['geo_lat'] if tweet['geo_lat'] else None,
276                        'longitude': tweet['geo_lng'] if tweet['geo_lng'] else None,
277                        'author_verified': tweet['from_user_verified'] if tweet['from_user_verified'] else None,
278                        'author_description': tweet['from_user_description'],
279                        'author_url': tweet['from_user_url'],
280                        'author_profile_image': tweet['from_user_profile_image_url'],
281                        'author_timezone_UTC_offset': int((int(tweet['from_user_utcoffset']) if
282                                                           tweet['from_user_utcoffset'] else 0)/60/60),
283                        'author_timezone_name': tweet['from_user_timezone'],
284                        'author_language': tweet['from_user_lang'],
285                        'author_tweet_count': tweet['from_user_tweetcount'],
286                        'author_follower_count': tweet['from_user_followercount'],
287                        'author_friend_following_count': tweet['from_user_friendcount'],
288                        'author_favorite_count': tweet.get('from_user_favourites_count'), # NOT in tweets table?
289                        'author_listed_count': tweet['from_user_listed'] if tweet['from_user_listed'] else None,
290                        'author_withheld_scope': tweet.get('from_user_withheld_scope'), # NOT in tweets table?
291                        'author_created_at': tweet.get('from_user_created_at'), # NOT in tweets table?
292
293                        # TODO find in other TCAT tables or does not exist
294                        # "possibly_sensitive": "yes" if tweet.get("possibly_sensitive") not in ("", "0") else "no",
295                        # "is_quote_tweet": "yes" if tweet["quoted_status_id"] else "no",
296                        # 'withheld_copyright': tweet['withheld_copyright'],  # TCAT may no collect this anymore
297                        # 'withheld_scope': tweet['withheld_scope'], # TCAT may no collect this anymore
298                        # 'truncated': tweet['truncated'], # Older tweets could be truncated meaning their text was cut off due to Twitter/TCAT db character limits
299
300                        }
301
302        # Ensure that any keys not specifically mapped to another field are added to the new mapped_tweet
303        mapped_keys = ['id', 'text', 'created_at', 'from_user_name', 'from_user_realname', 'from_user_id',
304                       'from_user_lang', 'from_user_tweetcount', 'from_user_followercount', 'from_user_friendcount',
305                       'from_user_listed', 'from_user_utcoffset', 'from_user_timezone', 'from_user_description',
306                       'from_user_url', 'from_user_verified', 'from_user_profile_image_url', 'source', 'lang',
307                       'filter_level', 'location', 'to_user_name', 'to_user_id', 'geo_lat', 'geo_lng', 'retweet_count',
308                       'in_reply_to_status_id']
309        for key in tweet.keys():
310            if key not in mapped_keys:
311                index = ''
312                while key + index in mapped_tweet.keys():
313                    index += '_1'
314                mapped_tweet[key + index] = tweet.get(key)
315
316        return mapped_tweet

Takes TCAT output from specific tables and maps them to 4CAT expected fields. The expected fields attempt to mirror that mapped_tweet from twitterv2 datasource.

Parameters
  • dict tweet: TCAT dict returned from query; expected to be from bin tweets table
Returns
@classmethod
def collect_tcat_metadata(cls, config):
319    @classmethod
320    def collect_tcat_metadata(cls, config):
321        """
322        Collect specific metadata from TCAT instances listed in the configuration and return a dictionary containing
323        this data. To be used to infor the user of available TCAT bins and create the options from which a user will
324        select.
325
326        :return dict: All of the available bins from accessible TCAT instances
327        """
328
329        # todo: cache this somehow! and check for the cache
330        instances = config.get("dmi-tcatv2.instances", [])
331
332        all_bins = {}
333        for instance in instances:
334            # Query each instance for bins
335            db = MySQLDatabase(logger=None,
336                               dbname=instance.get('db_name'),
337                               user=instance.get('db_user'),
338                               password=instance.get('db_password'),
339                               host=instance.get('db_host'),
340                               port=instance.get('db_port'))
341            # try:
342            instance_bins = db.fetchall('SELECT id, querybin, type from tcat_query_bins')
343            # except:
344
345            instance_bins_metadata = {}
346            for instance_bin in instance_bins:
347                bin_data = {
348                    'instance': instance,
349                    'querybin': instance_bin['querybin'],
350                }
351
352                # Query for number of tweets
353                tweet_count = db.fetchone('SELECT COUNT(id) from ' + instance_bin['querybin'] + '_tweets')[
354                    'COUNT(id)']
355                bin_data['tweet_count'] = tweet_count
356
357                # Collect first and last tweet datetimes
358                first_tweet_datetime = db.fetchone('SELECT created_at from ' + instance_bin['querybin'] + '_tweets ORDER BY created_at ASC LIMIT 1')['created_at']
359                last_tweet_datetime = db.fetchone('SELECT created_at from ' + instance_bin['querybin'] + '_tweets ORDER BY created_at DESC LIMIT 1')['created_at']
360                bin_data['first_tweet_datetime'] = first_tweet_datetime
361                bin_data['last_tweet_datetime'] = last_tweet_datetime
362
363                # Could check if bin currently should be collecting
364                # db.fetchall('SELECT EXISTS ( SELECT endtime from tcat_query_bins_periods WHERE querybin_id = ' + str(instance_bin['id']) + ' and endtime = "0000-00-00 00:00:00" ) as active')
365
366                # Could collect phrases or usernames...
367                # if instance_bin['type'] in []:
368                # elif instance_bin['type'] in []:
369
370                # Could collect all periods for nuanced metadata...
371                #periods = db.fetchall('SELECT starttime, endtime from tcat_query_bins_periods WHERE query_bin_id = ' + instance_bin['id'])
372
373                # Add bin_data to instance collection
374                instance_bins_metadata[instance_bin['querybin']] = bin_data
375            # Add bins to instance
376            all_bins[instance['tcat_name']] = instance_bins_metadata
377
378        return all_bins

Collect specific metadata from TCAT instances listed in the configuration and return a dictionary containing this data. To be used to inform the user of available TCAT bins and create the options from which a user will select.

Returns

All of the available bins from accessible TCAT instances

@staticmethod
def validate_query(query, request, config):
380    @staticmethod
381    def validate_query(query, request, config):
382        """
383        Validate DMI-TCAT query input
384
385        :param dict query:  Query parameters, from client-side.
386        :param request:  Flask request
387        :param ConfigManager|None config:  Configuration reader (context-aware)
388        :return dict:  Safe query parameters
389        """
390        # no query 4 u
391        if not query.get("bin", ""):
392            raise QueryParametersException("You must choose a query bin to get tweets from.")
393
394        # the dates need to make sense as a range to search within
395        # and a date range is needed, to not make it too easy to just get all tweets
396        after, before = query.get("daterange")
397        if (after and before) and not before <= after:
398            raise QueryParametersException("A date range must start before it ends")
399
400        query["min_date"], query["max_date"] = query.get("daterange")
401        query["bin"] = query.get("bin", "").strip()
402        del query["daterange"]
403
404        # simple!
405        return query

Validate DMI-TCAT query input

Parameters
  • dict query: Query parameters, from client-side.
  • request: Flask request
  • ConfigManager|None config: Configuration reader (context-aware)
Returns

Safe query parameters