Edit on GitHub

datasources.dmi-tcatv2.search_tcat_v2

Twitter search within a DMI-TCAT bin v2 Direct database connection

  1"""
  2Twitter search within a DMI-TCAT bin v2
  3Direct database connection
  4"""
  5import datetime
  6import re
  7import ural
  8import pymysql
  9
 10
 11from backend.lib.search import Search
 12from common.lib.exceptions import QueryParametersException
 13from common.lib.user_input import UserInput
 14from backend.lib.database_mysql import MySQLDatabase
 15from common.config_manager import config
 16
 17
 18class SearchWithinTCATBinsV2(Search):
 19    """
 20    Get Tweets via DMI-TCAT
 21
 22    This allows subsetting an existing query bin, similar to the 'Data
 23    Selection' panel in the DMI-TCAT analysis interface
 24    """
 25    type = "dmi-tcatv2-search"  # job ID
 26    extension = "csv"
 27    title = "TCAT Search (SQL)"
 28
 29    options = {
 30        "intro-1": {
 31            "type": UserInput.OPTION_INFO,
 32            "help": "This data source interfaces with a DMI-TCAT instance to allow subsetting of tweets from a tweet "
 33                    "bin in that instance."
 34        },
 35        "divider-1": {
 36            "type": UserInput.OPTION_DIVIDER
 37        },
 38        "bin": {
 39            "type": UserInput.OPTION_INFO,
 40            "help": "Query bin"
 41        },
 42        "query_type": {
 43            "type": UserInput.OPTION_CHOICE,
 44            "help": "Basic or Advanced Query",
 45            "options": {
 46                "basic": "Basic query all bin tweets for specific text (Query) and date (Date range)",
 47                "advanced": "Select queries on any TCAT twitter tables"
 48            },
 49            "default": "basic",
 50            "tooltip": "Advanced queries do not provide scaffolding, so understanding TCAT database structure is necessary"
 51        },
 52        "query": {
 53            "type": UserInput.OPTION_TEXT,
 54            "help": "Query text",
 55            "tooltip": "Match all tweets containing this text."
 56        },
 57        "daterange": {
 58            "type": UserInput.OPTION_DATERANGE,
 59            "help": "Date range"
 60        }
 61    }
 62
 63    config = {
 64        "dmi-tcatv2-search.database_instances": {
 65            "type": UserInput.OPTION_TEXT_JSON,
 66            "help": "DMI-TCAT instances",
 67            "tooltip": "List of DMI-TCAT instance metadata, e.g. [{'tcat_name': 'tcat2','db_name': 'twittercapture',"
 68                    "'db_user': 'username','db_password': 'password','db_host': '127.0.0.1','db_port': 3306}] All of "
 69                    "these values need to be provided for each instance. This needs to be formatted as a JSON list of "
 70                    "objects.",
 71            "default": {}
 72        }
 73    }
 74
 75    @classmethod
 76    def get_options(cls, parent_dataset=None, user=None):
 77        """
 78        Get data source options
 79
 80        This method takes the pre-defined options, but fills the 'bins' options
 81        with bins currently available from the configured TCAT instances.
 82
 83        :param DataSet parent_dataset:  An object representing the dataset that
 84        the processor would be run on
 85        :param User user:  Flask user the options will be displayed for, in
 86        case they are requested for display in the 4CAT web interface. This can
 87        be used to show some options only to privileges users.
 88        """
 89        options = cls.options
 90
 91        # Collect Metadata from TCAT instances
 92        all_bins = cls.collect_tcat_metadata()
 93
 94        options["bin"] = {
 95            "type": UserInput.OPTION_CHOICE,
 96            "options": {},
 97            "help": "Query bin"
 98        }
 99
100        for instance_name, bins in all_bins.items():
101            for bin_name, bin in bins.items():
102                bin_key = "%s@%s" % (bin_name, instance_name)
103                display_text = f"{bin_name}: {bin.get('tweet_count')} tweets from {bin.get('first_tweet_datetime').strftime('%Y-%m-%d %H:%M:%S')} to {bin.get('last_tweet_datetime').strftime('%Y-%m-%d %H:%M:%S')}"
104                options["bin"]["options"][bin_key] = display_text
105
106        return options
107
108    def get_items(self, query):
109        """
110        Use the DMI-TCAT tweet export to retrieve tweets
111
112        :param query:
113        :yield dict: mapped_tweet for any "basic" query else for "advanced" queries a dictionary with mysql result
114        """
115        bin = self.parameters.get("bin")
116        bin_name = bin.split("@")[0]
117        tcat_name = bin.split("@").pop()
118
119        available_instances = config.get("dmi-tcatv2-search.database_instances", [])
120        instance = [instance for instance in available_instances if instance.get('tcat_name') == tcat_name][0]
121
122        db = MySQLDatabase(logger=self.log,
123                           dbname=instance.get('db_name'),
124                           user=instance.get('db_user'),
125                           password=instance.get('db_password'),
126                           host=instance.get('db_host'),
127                           port=instance.get('db_port'))
128
129        self.dataset.update_status("Searching for tweets on %s" % bin_name)
130        if self.parameters.get("query_type") == 'advanced':
131            # Advanced query should be simple from our perspective...
132            self.dataset.log('Query: %s' % self.parameters.get("query"))
133            unbuffered_cursor = db.connection.cursor(pymysql.cursors.SSCursor)
134            try:
135                num_results = unbuffered_cursor.execute(self.parameters.get("query"))
136            except pymysql.err.ProgrammingError as e:
137                self.dataset.update_status("SQL query error: %s" % str(e), is_final=True)
138                return
139            # self.dataset.update_status("Retrieving %i results" % int(num_results)) # num_results is CLEARLY not what I thought
140            # Get column names from cursor
141            column_names = [description[0] for description in unbuffered_cursor.description]
142            for result in unbuffered_cursor.fetchall_unbuffered():
143                # Reformat result (which is a tuple with each column in the row) to dict
144                new_result = {k: v for k, v in zip(column_names, result)}
145                # 4CAT necessary fieldnames
146                new_result['id'] = new_result.get('id', '')
147                new_result['thread_id'] = new_result.get("in_reply_to_status_id") if new_result.get(
148                    "in_reply_to_status_id") else new_result.get("quoted_status_id") if new_result.get(
149                    "quoted_status_id") else new_result.get("id")
150                new_result['body'] = new_result.get('text', '')
151                new_result['timestamp'] = new_result.get('created_at', None)
152                new_result['subject'] = ''
153                new_result['author'] = new_result.get('from_user_name', '')
154                yield new_result
155
156        else:
157            # "Basic" query
158            text_query = self.parameters.get("query")
159
160            where = []
161            replacements = []
162            # Find AND and OR
163            placeholder = 0
164            start_of_match = 0
165            while start_of_match >= 0:
166                match = None
167                and_match = text_query[placeholder:].find(' AND ')
168                or_match = text_query[placeholder:].find(' OR ')
169                if and_match != -1 and or_match != -1:
170                    # both found
171                    if and_match < or_match:
172                        # and match first
173                        match = 'AND '
174                        start_of_match = and_match
175                    else:
176                        # or match first
177                        match ='OR '
178                        start_of_match = or_match
179                elif and_match != -1:
180                    # and match only
181                    match ='AND '
182                    start_of_match = and_match
183                elif or_match != -1:
184                    # or match only
185                    match = 'OR '
186                    start_of_match = or_match
187                else:
188                    # neither
189                    match = None
190                    start_of_match = -1
191                # Add partial query to where and replacements
192                if match:
193                    where.append('lower(text) LIKE %s ' + match)
194                    replacements.append('%'+text_query[placeholder:placeholder+start_of_match].lower().strip()+'%')
195                    # new start
196                    placeholder = placeholder + start_of_match + len(match)
197                else:
198                    where.append('lower(text) LIKE %s')
199                    replacements.append('%'+text_query[placeholder:].lower().strip()+'%')
200
201            if query.get("min_date", None):
202                try:
203                    if int(query.get("min_date")) > 0:
204                        where.append("AND created_at >= %s")
205                        replacements.append(datetime.datetime.fromtimestamp(int(query.get("min_date"))))
206                except ValueError:
207                    pass
208
209            if query.get("max_date", None):
210                try:
211                    if int(query.get("max_date")) > 0:
212                        where.append("AND created_at < %s")
213                        replacements.append(datetime.datetime.fromtimestamp(int(query.get("max_date"))))
214                except ValueError:
215                    pass
216
217            where = " ".join(where)
218            query = 'SELECT * FROM ' + bin_name + '_tweets WHERE ' + where
219            self.dataset.log('Query: %s' % query)
220            self.dataset.log('Replacements: %s' % ', '.join([str(i) for i in replacements]))
221            unbuffered_cursor = db.connection.cursor(pymysql.cursors.SSCursor)
222            num_results = unbuffered_cursor.execute(query, replacements)
223            # self.dataset.update_status("Retrieving %i results" % int(num_results)) # num_results is CLEARLY not what I thought
224            column_names = [description[0] for description in unbuffered_cursor.description]
225            for result in unbuffered_cursor.fetchall_unbuffered():
226                new_result = {k: v for k, v in zip(column_names, result)}
227                # Map tweet to 4CAT fields
228                new_result = self.tweet_mapping(new_result)
229                yield new_result
230
231    @staticmethod
232    def tweet_mapping(tweet):
233        """
234        Takes TCAT output from specific tables and maps them to 4CAT expected fields. The expected fields attempt to
235        mirror that mapped_tweet from twitterv2 datasource.
236
237        :param dict tweet: TCAT dict returned from query; expected to be from bin tweets table
238        :return dict:
239        """
240        mapped_tweet = {'id': tweet.get('id', ''),
241                        # For thread_id, we use in_reply_to_status_id if tweet is reply, retweet_id if tweet is
242                        # retweet, or its own ID
243                        # Note: tweets can have BOTH in_reply_to_status_id and retweet_id as you can retweet a reply
244                        # or reply to retweet.
245                        # THIS IS DIFFERENT from Twitter APIv2 as there does not appear to be a quote ID (for retweets
246                        # with added text)
247                        'thread_id': tweet.get("in_reply_to_status_id") if tweet.get(
248                                               "in_reply_to_status_id") else tweet.get("retweet_id") if tweet.get(
249                                               "retweet_id") else tweet.get("id"),
250                        'body': tweet.get('text', ''),
251                        # 'created_at': tweet.get('created_at'),
252                        'timestamp': int(datetime.datetime.timestamp(tweet.get('created_at'))) if type(
253                                                tweet.get('created_at')) == datetime.datetime else None,
254                        'subject': '',
255                        'author': tweet.get('from_user_name', ''),
256                        "author_fullname": tweet["from_user_realname"],
257                        "author_id": tweet["from_user_id"],
258                        "source": tweet["source"],
259                        "language_guess": tweet.get("lang"),
260
261                        "retweet_count": tweet["retweet_count"],
262                        "like_count": tweet["favorite_count"],
263                        "is_retweet": "yes" if tweet.get('retweet_id', False) else "no",
264                        "is_reply": "yes" if tweet["in_reply_to_status_id"] else "no",
265                        "in_reply_to_status_id": tweet["in_reply_to_status_id"] if tweet["in_reply_to_status_id"] else None,
266                        "reply_to": tweet["to_user_name"],
267                        "reply_to_id": tweet.get('to_user_id') if tweet.get('to_user_id') else None,
268
269                        # 4CAT specifics
270                        "hashtags": ",".join(re.findall(r"#([^\s!@#$%^&*()_+{}:\"|<>?\[\];'\,./`~]+)", tweet["text"])),
271                        "urls": ",".join(ural.urls_from_text(tweet["text"])),
272                        "images": ",".join(re.findall(r"https://t\.co/[a-zA-Z0-9]+$", tweet["text"])),
273                        "mentions": ",".join(re.findall(r"@([^\s!@#$%^&*()+{}:\"|<>?\[\];'\,./`~]+)", tweet["text"])),
274
275                        # Additional TCAT data (compared to twitterv2 map_item function)
276                        "filter_level": tweet['filter_level'],
277                        'location': tweet['location'],
278                        'latitude': tweet['geo_lat'] if tweet['geo_lat'] else None,
279                        'longitude': tweet['geo_lng'] if tweet['geo_lng'] else None,
280                        'author_verified': tweet['from_user_verified'] if tweet['from_user_verified'] else None,
281                        'author_description': tweet['from_user_description'],
282                        'author_url': tweet['from_user_url'],
283                        'author_profile_image': tweet['from_user_profile_image_url'],
284                        'author_timezone_UTC_offset': int((int(tweet['from_user_utcoffset']) if
285                                                           tweet['from_user_utcoffset'] else 0)/60/60),
286                        'author_timezone_name': tweet['from_user_timezone'],
287                        'author_language': tweet['from_user_lang'],
288                        'author_tweet_count': tweet['from_user_tweetcount'],
289                        'author_follower_count': tweet['from_user_followercount'],
290                        'author_friend_following_count': tweet['from_user_friendcount'],
291                        'author_favorite_count': tweet.get('from_user_favourites_count'), # NOT in tweets table?
292                        'author_listed_count': tweet['from_user_listed'] if tweet['from_user_listed'] else None,
293                        'author_withheld_scope': tweet.get('from_user_withheld_scope'), # NOT in tweets table?
294                        'author_created_at': tweet.get('from_user_created_at'), # NOT in tweets table?
295
296                        # TODO find in other TCAT tables or does not exist
297                        # "possibly_sensitive": "yes" if tweet.get("possibly_sensitive") not in ("", "0") else "no",
298                        # "is_quote_tweet": "yes" if tweet["quoted_status_id"] else "no",
299                        # 'withheld_copyright': tweet['withheld_copyright'],  # TCAT may no collect this anymore
300                        # 'withheld_scope': tweet['withheld_scope'], # TCAT may no collect this anymore
301                        # 'truncated': tweet['truncated'], # Older tweets could be truncated meaning their text was cut off due to Twitter/TCAT db character limits
302
303                        }
304
305        # Ensure that any keys not specifically mapped to another field are added to the new mapped_tweet
306        mapped_keys = ['id', 'text', 'created_at', 'from_user_name', 'from_user_realname', 'from_user_id',
307                       'from_user_lang', 'from_user_tweetcount', 'from_user_followercount', 'from_user_friendcount',
308                       'from_user_listed', 'from_user_utcoffset', 'from_user_timezone', 'from_user_description',
309                       'from_user_url', 'from_user_verified', 'from_user_profile_image_url', 'source', 'lang',
310                       'filter_level', 'location', 'to_user_name', 'to_user_id', 'geo_lat', 'geo_lng', 'retweet_count',
311                       'in_reply_to_status_id']
312        for key in tweet.keys():
313            if key not in mapped_keys:
314                index = ''
315                while key + index in mapped_tweet.keys():
316                    index += '_1'
317                mapped_tweet[key + index] = tweet.get(key)
318
319        return mapped_tweet
320
321
322    @classmethod
323    def collect_tcat_metadata(cls):
324        """
325        Collect specific metadata from TCAT instances listed in the configuration and return a dictionary containing
326        this data. To be used to infor the user of available TCAT bins and create the options from which a user will
327        select.
328
329        :return dict: All of the available bins from accessible TCAT instances
330        """
331
332        # todo: cache this somehow! and check for the cache
333        instances = config.get("dmi-tcatv2.instances", [])
334
335        all_bins = {}
336        for instance in instances:
337            # Query each instance for bins
338            db = MySQLDatabase(logger=None,
339                               dbname=instance.get('db_name'),
340                               user=instance.get('db_user'),
341                               password=instance.get('db_password'),
342                               host=instance.get('db_host'),
343                               port=instance.get('db_port'))
344            # try:
345            instance_bins = db.fetchall('SELECT id, querybin, type from tcat_query_bins')
346            # except:
347
348            instance_bins_metadata = {}
349            for instance_bin in instance_bins:
350                bin_data = {
351                    'instance': instance,
352                    'querybin': instance_bin['querybin'],
353                }
354
355                # Query for number of tweets
356                tweet_count = db.fetchone('SELECT COUNT(id) from ' + instance_bin['querybin'] + '_tweets')[
357                    'COUNT(id)']
358                bin_data['tweet_count'] = tweet_count
359
360                # Collect first and last tweet datetimes
361                first_tweet_datetime = db.fetchone('SELECT created_at from ' + instance_bin['querybin'] + '_tweets ORDER BY created_at ASC LIMIT 1')['created_at']
362                last_tweet_datetime = db.fetchone('SELECT created_at from ' + instance_bin['querybin'] + '_tweets ORDER BY created_at DESC LIMIT 1')['created_at']
363                bin_data['first_tweet_datetime'] = first_tweet_datetime
364                bin_data['last_tweet_datetime'] = last_tweet_datetime
365
366                # Could check if bin currently should be collecting
367                # db.fetchall('SELECT EXISTS ( SELECT endtime from tcat_query_bins_periods WHERE querybin_id = ' + str(instance_bin['id']) + ' and endtime = "0000-00-00 00:00:00" ) as active')
368
369                # Could collect phrases or usernames...
370                # if instance_bin['type'] in []:
371                # elif instance_bin['type'] in []:
372
373                # Could collect all periods for nuanced metadata...
374                #periods = db.fetchall('SELECT starttime, endtime from tcat_query_bins_periods WHERE query_bin_id = ' + instance_bin['id'])
375
376                # Add bin_data to instance collection
377                instance_bins_metadata[instance_bin['querybin']] = bin_data
378            # Add bins to instance
379            all_bins[instance['tcat_name']] = instance_bins_metadata
380
381        return all_bins
382
383    @staticmethod
384    def validate_query(query, request, user):
385        """
386        Validate DMI-TCAT query input
387
388        :param dict query:  Query parameters, from client-side.
389        :param request:  Flask request
390        :param User user:  User object of user who has submitted the query
391        :return dict:  Safe query parameters
392        """
393        # no query 4 u
394        if not query.get("bin", "").strip():
395            raise QueryParametersException("You must choose a query bin to get tweets from.")
396
397        # the dates need to make sense as a range to search within
398        # and a date range is needed, to not make it too easy to just get all tweets
399        after, before = query.get("daterange")
400        if (after and before) and not before <= after:
401            raise QueryParametersException("A date range must start before it ends")
402
403        query["min_date"], query["max_date"] = query.get("daterange")
404        del query["daterange"]
405
406        # simple!
407        return query
class SearchWithinTCATBinsV2(backend.lib.search.Search):
 19class SearchWithinTCATBinsV2(Search):
 20    """
 21    Get Tweets via DMI-TCAT
 22
 23    This allows subsetting an existing query bin, similar to the 'Data
 24    Selection' panel in the DMI-TCAT analysis interface
 25    """
 26    type = "dmi-tcatv2-search"  # job ID
 27    extension = "csv"
 28    title = "TCAT Search (SQL)"
 29
 30    options = {
 31        "intro-1": {
 32            "type": UserInput.OPTION_INFO,
 33            "help": "This data source interfaces with a DMI-TCAT instance to allow subsetting of tweets from a tweet "
 34                    "bin in that instance."
 35        },
 36        "divider-1": {
 37            "type": UserInput.OPTION_DIVIDER
 38        },
 39        "bin": {
 40            "type": UserInput.OPTION_INFO,
 41            "help": "Query bin"
 42        },
 43        "query_type": {
 44            "type": UserInput.OPTION_CHOICE,
 45            "help": "Basic or Advanced Query",
 46            "options": {
 47                "basic": "Basic query all bin tweets for specific text (Query) and date (Date range)",
 48                "advanced": "Select queries on any TCAT twitter tables"
 49            },
 50            "default": "basic",
 51            "tooltip": "Advanced queries do not provide scaffolding, so understanding TCAT database structure is necessary"
 52        },
 53        "query": {
 54            "type": UserInput.OPTION_TEXT,
 55            "help": "Query text",
 56            "tooltip": "Match all tweets containing this text."
 57        },
 58        "daterange": {
 59            "type": UserInput.OPTION_DATERANGE,
 60            "help": "Date range"
 61        }
 62    }
 63
 64    config = {
 65        "dmi-tcatv2-search.database_instances": {
 66            "type": UserInput.OPTION_TEXT_JSON,
 67            "help": "DMI-TCAT instances",
 68            "tooltip": "List of DMI-TCAT instance metadata, e.g. [{'tcat_name': 'tcat2','db_name': 'twittercapture',"
 69                    "'db_user': 'username','db_password': 'password','db_host': '127.0.0.1','db_port': 3306}] All of "
 70                    "these values need to be provided for each instance. This needs to be formatted as a JSON list of "
 71                    "objects.",
 72            "default": {}
 73        }
 74    }
 75
 76    @classmethod
 77    def get_options(cls, parent_dataset=None, user=None):
 78        """
 79        Get data source options
 80
 81        This method takes the pre-defined options, but fills the 'bins' options
 82        with bins currently available from the configured TCAT instances.
 83
 84        :param DataSet parent_dataset:  An object representing the dataset that
 85        the processor would be run on
 86        :param User user:  Flask user the options will be displayed for, in
 87        case they are requested for display in the 4CAT web interface. This can
 88        be used to show some options only to privileges users.
 89        """
 90        options = cls.options
 91
 92        # Collect Metadata from TCAT instances
 93        all_bins = cls.collect_tcat_metadata()
 94
 95        options["bin"] = {
 96            "type": UserInput.OPTION_CHOICE,
 97            "options": {},
 98            "help": "Query bin"
 99        }
100
101        for instance_name, bins in all_bins.items():
102            for bin_name, bin in bins.items():
103                bin_key = "%s@%s" % (bin_name, instance_name)
104                display_text = f"{bin_name}: {bin.get('tweet_count')} tweets from {bin.get('first_tweet_datetime').strftime('%Y-%m-%d %H:%M:%S')} to {bin.get('last_tweet_datetime').strftime('%Y-%m-%d %H:%M:%S')}"
105                options["bin"]["options"][bin_key] = display_text
106
107        return options
108
109    def get_items(self, query):
110        """
111        Use the DMI-TCAT tweet export to retrieve tweets
112
113        :param query:
114        :yield dict: mapped_tweet for any "basic" query else for "advanced" queries a dictionary with mysql result
115        """
116        bin = self.parameters.get("bin")
117        bin_name = bin.split("@")[0]
118        tcat_name = bin.split("@").pop()
119
120        available_instances = config.get("dmi-tcatv2-search.database_instances", [])
121        instance = [instance for instance in available_instances if instance.get('tcat_name') == tcat_name][0]
122
123        db = MySQLDatabase(logger=self.log,
124                           dbname=instance.get('db_name'),
125                           user=instance.get('db_user'),
126                           password=instance.get('db_password'),
127                           host=instance.get('db_host'),
128                           port=instance.get('db_port'))
129
130        self.dataset.update_status("Searching for tweets on %s" % bin_name)
131        if self.parameters.get("query_type") == 'advanced':
132            # Advanced query should be simple from our perspective...
133            self.dataset.log('Query: %s' % self.parameters.get("query"))
134            unbuffered_cursor = db.connection.cursor(pymysql.cursors.SSCursor)
135            try:
136                num_results = unbuffered_cursor.execute(self.parameters.get("query"))
137            except pymysql.err.ProgrammingError as e:
138                self.dataset.update_status("SQL query error: %s" % str(e), is_final=True)
139                return
140            # self.dataset.update_status("Retrieving %i results" % int(num_results)) # num_results is CLEARLY not what I thought
141            # Get column names from cursor
142            column_names = [description[0] for description in unbuffered_cursor.description]
143            for result in unbuffered_cursor.fetchall_unbuffered():
144                # Reformat result (which is a tuple with each column in the row) to dict
145                new_result = {k: v for k, v in zip(column_names, result)}
146                # 4CAT necessary fieldnames
147                new_result['id'] = new_result.get('id', '')
148                new_result['thread_id'] = new_result.get("in_reply_to_status_id") if new_result.get(
149                    "in_reply_to_status_id") else new_result.get("quoted_status_id") if new_result.get(
150                    "quoted_status_id") else new_result.get("id")
151                new_result['body'] = new_result.get('text', '')
152                new_result['timestamp'] = new_result.get('created_at', None)
153                new_result['subject'] = ''
154                new_result['author'] = new_result.get('from_user_name', '')
155                yield new_result
156
157        else:
158            # "Basic" query
159            text_query = self.parameters.get("query")
160
161            where = []
162            replacements = []
163            # Find AND and OR
164            placeholder = 0
165            start_of_match = 0
166            while start_of_match >= 0:
167                match = None
168                and_match = text_query[placeholder:].find(' AND ')
169                or_match = text_query[placeholder:].find(' OR ')
170                if and_match != -1 and or_match != -1:
171                    # both found
172                    if and_match < or_match:
173                        # and match first
174                        match = 'AND '
175                        start_of_match = and_match
176                    else:
177                        # or match first
178                        match ='OR '
179                        start_of_match = or_match
180                elif and_match != -1:
181                    # and match only
182                    match ='AND '
183                    start_of_match = and_match
184                elif or_match != -1:
185                    # or match only
186                    match = 'OR '
187                    start_of_match = or_match
188                else:
189                    # neither
190                    match = None
191                    start_of_match = -1
192                # Add partial query to where and replacements
193                if match:
194                    where.append('lower(text) LIKE %s ' + match)
195                    replacements.append('%'+text_query[placeholder:placeholder+start_of_match].lower().strip()+'%')
196                    # new start
197                    placeholder = placeholder + start_of_match + len(match)
198                else:
199                    where.append('lower(text) LIKE %s')
200                    replacements.append('%'+text_query[placeholder:].lower().strip()+'%')
201
202            if query.get("min_date", None):
203                try:
204                    if int(query.get("min_date")) > 0:
205                        where.append("AND created_at >= %s")
206                        replacements.append(datetime.datetime.fromtimestamp(int(query.get("min_date"))))
207                except ValueError:
208                    pass
209
210            if query.get("max_date", None):
211                try:
212                    if int(query.get("max_date")) > 0:
213                        where.append("AND created_at < %s")
214                        replacements.append(datetime.datetime.fromtimestamp(int(query.get("max_date"))))
215                except ValueError:
216                    pass
217
218            where = " ".join(where)
219            query = 'SELECT * FROM ' + bin_name + '_tweets WHERE ' + where
220            self.dataset.log('Query: %s' % query)
221            self.dataset.log('Replacements: %s' % ', '.join([str(i) for i in replacements]))
222            unbuffered_cursor = db.connection.cursor(pymysql.cursors.SSCursor)
223            num_results = unbuffered_cursor.execute(query, replacements)
224            # self.dataset.update_status("Retrieving %i results" % int(num_results)) # num_results is CLEARLY not what I thought
225            column_names = [description[0] for description in unbuffered_cursor.description]
226            for result in unbuffered_cursor.fetchall_unbuffered():
227                new_result = {k: v for k, v in zip(column_names, result)}
228                # Map tweet to 4CAT fields
229                new_result = self.tweet_mapping(new_result)
230                yield new_result
231
    @staticmethod
    def tweet_mapping(tweet: dict) -> dict:
        """
        Map a TCAT tweets-table row to the field names 4CAT expects.

        Takes TCAT output from specific tables and maps them to 4CAT expected fields. The expected fields attempt to
        mirror the mapped_tweet from the twitterv2 datasource.

        NOTE: most fields use direct indexing (tweet['...']) and will raise
        KeyError if that column is missing from the row; only the fields
        flagged "NOT in tweets table?" fall back to None via .get().

        :param dict tweet: TCAT dict returned from query; expected to be from bin tweets table
        :return dict: tweet with 4CAT-compatible keys plus any unmapped TCAT columns
        """
        mapped_tweet = {'id': tweet.get('id', ''),
                        # For thread_id, we use in_reply_to_status_id if tweet is reply, retweet_id if tweet is
                        # retweet, or its own ID
                        # Note: tweets can have BOTH in_reply_to_status_id and retweet_id as you can retweet a reply
                        # or reply to retweet.
                        # THIS IS DIFFERENT from Twitter APIv2 as there does not appear to be a quote ID (for retweets
                        # with added text)
                        'thread_id': tweet.get("in_reply_to_status_id") if tweet.get(
                                               "in_reply_to_status_id") else tweet.get("retweet_id") if tweet.get(
                                               "retweet_id") else tweet.get("id"),
                        'body': tweet.get('text', ''),
                        # 'created_at': tweet.get('created_at'),
                        # created_at arrives as a datetime from MySQL; convert to a Unix
                        # timestamp, or None if it is anything else (e.g. missing)
                        'timestamp': int(datetime.datetime.timestamp(tweet.get('created_at'))) if type(
                                                tweet.get('created_at')) == datetime.datetime else None,
                        'subject': '',
                        'author': tweet.get('from_user_name', ''),
                        "author_fullname": tweet["from_user_realname"],
                        "author_id": tweet["from_user_id"],
                        "source": tweet["source"],
                        "language_guess": tweet.get("lang"),

                        "retweet_count": tweet["retweet_count"],
                        "like_count": tweet["favorite_count"],
                        "is_retweet": "yes" if tweet.get('retweet_id', False) else "no",
                        "is_reply": "yes" if tweet["in_reply_to_status_id"] else "no",
                        "in_reply_to_status_id": tweet["in_reply_to_status_id"] if tweet["in_reply_to_status_id"] else None,
                        "reply_to": tweet["to_user_name"],
                        "reply_to_id": tweet.get('to_user_id') if tweet.get('to_user_id') else None,

                        # 4CAT specifics: hashtags/mentions are extracted from the raw text
                        # with regexes; urls via ural; "images" only matches a trailing t.co link
                        "hashtags": ",".join(re.findall(r"#([^\s!@#$%^&*()_+{}:\"|<>?\[\];'\,./`~]+)", tweet["text"])),
                        "urls": ",".join(ural.urls_from_text(tweet["text"])),
                        "images": ",".join(re.findall(r"https://t\.co/[a-zA-Z0-9]+$", tweet["text"])),
                        "mentions": ",".join(re.findall(r"@([^\s!@#$%^&*()+{}:\"|<>?\[\];'\,./`~]+)", tweet["text"])),

                        # Additional TCAT data (compared to twitterv2 map_item function)
                        "filter_level": tweet['filter_level'],
                        'location': tweet['location'],
                        'latitude': tweet['geo_lat'] if tweet['geo_lat'] else None,
                        'longitude': tweet['geo_lng'] if tweet['geo_lng'] else None,
                        'author_verified': tweet['from_user_verified'] if tweet['from_user_verified'] else None,
                        'author_description': tweet['from_user_description'],
                        'author_url': tweet['from_user_url'],
                        'author_profile_image': tweet['from_user_profile_image_url'],
                        # from_user_utcoffset appears to be in seconds (divided by 60*60
                        # to get hours) — TODO confirm against TCAT schema
                        'author_timezone_UTC_offset': int((int(tweet['from_user_utcoffset']) if
                                                           tweet['from_user_utcoffset'] else 0)/60/60),
                        'author_timezone_name': tweet['from_user_timezone'],
                        'author_language': tweet['from_user_lang'],
                        'author_tweet_count': tweet['from_user_tweetcount'],
                        'author_follower_count': tweet['from_user_followercount'],
                        'author_friend_following_count': tweet['from_user_friendcount'],
                        'author_favorite_count': tweet.get('from_user_favourites_count'), # NOT in tweets table?
                        'author_listed_count': tweet['from_user_listed'] if tweet['from_user_listed'] else None,
                        'author_withheld_scope': tweet.get('from_user_withheld_scope'), # NOT in tweets table?
                        'author_created_at': tweet.get('from_user_created_at'), # NOT in tweets table?

                        # TODO find in other TCAT tables or does not exist
                        # "possibly_sensitive": "yes" if tweet.get("possibly_sensitive") not in ("", "0") else "no",
                        # "is_quote_tweet": "yes" if tweet["quoted_status_id"] else "no",
                        # 'withheld_copyright': tweet['withheld_copyright'],  # TCAT may no collect this anymore
                        # 'withheld_scope': tweet['withheld_scope'], # TCAT may no collect this anymore
                        # 'truncated': tweet['truncated'], # Older tweets could be truncated meaning their text was cut off due to Twitter/TCAT db character limits

                        }

        # Ensure that any keys not specifically mapped to another field are added to the new mapped_tweet
        # (columns in mapped_keys were already consumed by the mapping above)
        mapped_keys = ['id', 'text', 'created_at', 'from_user_name', 'from_user_realname', 'from_user_id',
                       'from_user_lang', 'from_user_tweetcount', 'from_user_followercount', 'from_user_friendcount',
                       'from_user_listed', 'from_user_utcoffset', 'from_user_timezone', 'from_user_description',
                       'from_user_url', 'from_user_verified', 'from_user_profile_image_url', 'source', 'lang',
                       'filter_level', 'location', 'to_user_name', 'to_user_id', 'geo_lat', 'geo_lng', 'retweet_count',
                       'in_reply_to_status_id']
        for key in tweet.keys():
            if key not in mapped_keys:
                index = ''
                # avoid clobbering an existing mapped key by appending '_1'
                # (repeatedly: key_1, key_1_1, ... until unique)
                while key + index in mapped_tweet.keys():
                    index += '_1'
                mapped_tweet[key + index] = tweet.get(key)

        return mapped_tweet
321
322
    @classmethod
    def collect_tcat_metadata(cls) -> dict:
        """
        Collect specific metadata from TCAT instances listed in the configuration and return a dictionary containing
        this data. To be used to inform the user of available TCAT bins and create the options from which a user will
        select.

        :return dict: All of the available bins from accessible TCAT instances,
        keyed by instance 'tcat_name', then by bin name; each bin entry holds
        its instance config, name, tweet count and first/last tweet datetimes
        """

        # todo: cache this somehow! and check for the cache
        # NOTE(review): this reads config key "dmi-tcatv2.instances" while
        # get_items reads "dmi-tcatv2-search.database_instances" — these look
        # like they should be the same key; confirm which one is canonical.
        instances = config.get("dmi-tcatv2.instances", [])

        all_bins = {}
        for instance in instances:
            # Query each instance for bins; a fresh connection per instance
            db = MySQLDatabase(logger=None,
                               dbname=instance.get('db_name'),
                               user=instance.get('db_user'),
                               password=instance.get('db_password'),
                               host=instance.get('db_host'),
                               port=instance.get('db_port'))
            # try:
            instance_bins = db.fetchall('SELECT id, querybin, type from tcat_query_bins')
            # except:

            instance_bins_metadata = {}
            for instance_bin in instance_bins:
                bin_data = {
                    'instance': instance,
                    'querybin': instance_bin['querybin'],
                }

                # Query for number of tweets
                # NOTE(review): bin names are interpolated into SQL directly; they come
                # from the TCAT database itself rather than user input, but verify they
                # are trusted before reusing this pattern elsewhere.
                tweet_count = db.fetchone('SELECT COUNT(id) from ' + instance_bin['querybin'] + '_tweets')[
                    'COUNT(id)']
                bin_data['tweet_count'] = tweet_count

                # Collect first and last tweet datetimes
                first_tweet_datetime = db.fetchone('SELECT created_at from ' + instance_bin['querybin'] + '_tweets ORDER BY created_at ASC LIMIT 1')['created_at']
                last_tweet_datetime = db.fetchone('SELECT created_at from ' + instance_bin['querybin'] + '_tweets ORDER BY created_at DESC LIMIT 1')['created_at']
                bin_data['first_tweet_datetime'] = first_tweet_datetime
                bin_data['last_tweet_datetime'] = last_tweet_datetime

                # Could check if bin currently should be collecting
                # db.fetchall('SELECT EXISTS ( SELECT endtime from tcat_query_bins_periods WHERE querybin_id = ' + str(instance_bin['id']) + ' and endtime = "0000-00-00 00:00:00" ) as active')

                # Could collect phrases or usernames...
                # if instance_bin['type'] in []:
                # elif instance_bin['type'] in []:

                # Could collect all periods for nuanced metadata...
                #periods = db.fetchall('SELECT starttime, endtime from tcat_query_bins_periods WHERE query_bin_id = ' + instance_bin['id'])

                # Add bin_data to instance collection
                instance_bins_metadata[instance_bin['querybin']] = bin_data
            # Add bins to instance
            all_bins[instance['tcat_name']] = instance_bins_metadata

        return all_bins
383
384    @staticmethod
385    def validate_query(query, request, user):
386        """
387        Validate DMI-TCAT query input
388
389        :param dict query:  Query parameters, from client-side.
390        :param request:  Flask request
391        :param User user:  User object of user who has submitted the query
392        :return dict:  Safe query parameters
393        """
394        # no query 4 u
395        if not query.get("bin", "").strip():
396            raise QueryParametersException("You must choose a query bin to get tweets from.")
397
398        # the dates need to make sense as a range to search within
399        # and a date range is needed, to not make it too easy to just get all tweets
400        after, before = query.get("daterange")
401        if (after and before) and not before <= after:
402            raise QueryParametersException("A date range must start before it ends")
403
404        query["min_date"], query["max_date"] = query.get("daterange")
405        del query["daterange"]
406
407        # simple!
408        return query

Get Tweets via DMI-TCAT

This allows subsetting an existing query bin, similar to the 'Data Selection' panel in the DMI-TCAT analysis interface

type = 'dmi-tcatv2-search'
extension = 'csv'
title = 'TCAT Search (SQL)'
options = {'intro-1': {'type': 'info', 'help': 'This data source interfaces with a DMI-TCAT instance to allow subsetting of tweets from a tweet bin in that instance.'}, 'divider-1': {'type': 'divider'}, 'bin': {'type': 'info', 'help': 'Query bin'}, 'query_type': {'type': 'choice', 'help': 'Basic or Advanced Query', 'options': {'basic': 'Basic query all bin tweets for specific text (Query) and date (Date range)', 'advanced': 'Select queries on any TCAT twitter tables'}, 'default': 'basic', 'tooltip': 'Advanced queries do not provide scaffolding, so understanding TCAT database structure is necessary'}, 'query': {'type': 'string', 'help': 'Query text', 'tooltip': 'Match all tweets containing this text.'}, 'daterange': {'type': 'daterange', 'help': 'Date range'}}
config = {'dmi-tcatv2-search.database_instances': {'type': 'json', 'help': 'DMI-TCAT instances', 'tooltip': "List of DMI-TCAT instance metadata, e.g. [{'tcat_name': 'tcat2','db_name': 'twittercapture','db_user': 'username','db_password': 'password','db_host': '127.0.0.1','db_port': 3306}] All of these values need to be provided for each instance. This needs to be formatted as a JSON list of objects.", 'default': {}}}
@classmethod
def get_options(cls, parent_dataset=None, user=None):
 76    @classmethod
 77    def get_options(cls, parent_dataset=None, user=None):
 78        """
 79        Get data source options
 80
 81        This method takes the pre-defined options, but fills the 'bins' options
 82        with bins currently available from the configured TCAT instances.
 83
 84        :param DataSet parent_dataset:  An object representing the dataset that
 85        the processor would be run on
 86        :param User user:  Flask user the options will be displayed for, in
 87        case they are requested for display in the 4CAT web interface. This can
 88        be used to show some options only to privileges users.
 89        """
 90        options = cls.options
 91
 92        # Collect Metadata from TCAT instances
 93        all_bins = cls.collect_tcat_metadata()
 94
 95        options["bin"] = {
 96            "type": UserInput.OPTION_CHOICE,
 97            "options": {},
 98            "help": "Query bin"
 99        }
100
101        for instance_name, bins in all_bins.items():
102            for bin_name, bin in bins.items():
103                bin_key = "%s@%s" % (bin_name, instance_name)
104                display_text = f"{bin_name}: {bin.get('tweet_count')} tweets from {bin.get('first_tweet_datetime').strftime('%Y-%m-%d %H:%M:%S')} to {bin.get('last_tweet_datetime').strftime('%Y-%m-%d %H:%M:%S')}"
105                options["bin"]["options"][bin_key] = display_text
106
107        return options

Get data source options

This method takes the pre-defined options, but fills the 'bins' options with bins currently available from the configured TCAT instances.

Parameters
  • DataSet parent_dataset: An object representing the dataset that the processor would be run on
  • User user: Flask user the options will be displayed for, in case they are requested for display in the 4CAT web interface. This can be used to show some options only to privileged users.
    def get_items(self, query):
        """
        Use the DMI-TCAT tweet export to retrieve tweets

        "Advanced" queries are executed verbatim against the TCAT database;
        "basic" queries are assembled from the query text (split on ' AND ' /
        ' OR ') plus the optional min/max date range set by validate_query.

        :param query:  Query parameters (including min_date/max_date)
        :yield dict: mapped_tweet for any "basic" query else for "advanced" queries a dictionary with mysql result
        """
        # "bin" is formatted as "<bin name>@<instance name>" by get_options
        bin = self.parameters.get("bin")
        bin_name = bin.split("@")[0]
        tcat_name = bin.split("@").pop()

        # NOTE(review): collect_tcat_metadata reads "dmi-tcatv2.instances";
        # confirm which of the two config keys is canonical — they look like
        # they should match.
        available_instances = config.get("dmi-tcatv2-search.database_instances", [])
        # assumes exactly one configured instance matches tcat_name;
        # raises IndexError if the instance is no longer configured
        instance = [instance for instance in available_instances if instance.get('tcat_name') == tcat_name][0]

        db = MySQLDatabase(logger=self.log,
                           dbname=instance.get('db_name'),
                           user=instance.get('db_user'),
                           password=instance.get('db_password'),
                           host=instance.get('db_host'),
                           port=instance.get('db_port'))

        self.dataset.update_status("Searching for tweets on %s" % bin_name)
        if self.parameters.get("query_type") == 'advanced':
            # Advanced query should be simple from our perspective...
            self.dataset.log('Query: %s' % self.parameters.get("query"))
            # server-side (unbuffered) cursor so large result sets are streamed
            # rather than held in memory all at once
            unbuffered_cursor = db.connection.cursor(pymysql.cursors.SSCursor)
            try:
                num_results = unbuffered_cursor.execute(self.parameters.get("query"))
            except pymysql.err.ProgrammingError as e:
                # surface SQL syntax/reference errors to the user instead of crashing
                self.dataset.update_status("SQL query error: %s" % str(e), is_final=True)
                return
            # self.dataset.update_status("Retrieving %i results" % int(num_results)) # num_results is CLEARLY not what I thought
            # Get column names from cursor
            column_names = [description[0] for description in unbuffered_cursor.description]
            for result in unbuffered_cursor.fetchall_unbuffered():
                # Reformat result (which is a tuple with each column in the row) to dict
                new_result = {k: v for k, v in zip(column_names, result)}
                # 4CAT necessary fieldnames; filled best-effort since an advanced
                # query may select from any table and lack these columns
                new_result['id'] = new_result.get('id', '')
                new_result['thread_id'] = new_result.get("in_reply_to_status_id") if new_result.get(
                    "in_reply_to_status_id") else new_result.get("quoted_status_id") if new_result.get(
                    "quoted_status_id") else new_result.get("id")
                new_result['body'] = new_result.get('text', '')
                new_result['timestamp'] = new_result.get('created_at', None)
                new_result['subject'] = ''
                new_result['author'] = new_result.get('from_user_name', '')
                yield new_result

        else:
            # "Basic" query
            text_query = self.parameters.get("query")

            where = []
            replacements = []
            # Split the query text on literal ' AND ' / ' OR ' connectives,
            # turning each fragment into a parameterised lower(text) LIKE clause;
            # the connective is appended to the preceding clause so the final
            # " ".join produces valid SQL.
            placeholder = 0
            start_of_match = 0
            while start_of_match >= 0:
                match = None
                and_match = text_query[placeholder:].find(' AND ')
                or_match = text_query[placeholder:].find(' OR ')
                if and_match != -1 and or_match != -1:
                    # both found
                    if and_match < or_match:
                        # and match first
                        match = 'AND '
                        start_of_match = and_match
                    else:
                        # or match first
                        match ='OR '
                        start_of_match = or_match
                elif and_match != -1:
                    # and match only
                    match ='AND '
                    start_of_match = and_match
                elif or_match != -1:
                    # or match only
                    match = 'OR '
                    start_of_match = or_match
                else:
                    # neither; this is the last (or only) fragment
                    match = None
                    start_of_match = -1
                # Add partial query to where and replacements
                if match:
                    where.append('lower(text) LIKE %s ' + match)
                    replacements.append('%'+text_query[placeholder:placeholder+start_of_match].lower().strip()+'%')
                    # new start
                    placeholder = placeholder + start_of_match + len(match)
                else:
                    where.append('lower(text) LIKE %s')
                    replacements.append('%'+text_query[placeholder:].lower().strip()+'%')

            # Date clauses start with "AND" so they chain onto the final LIKE
            # clause above when joined with spaces
            if query.get("min_date", None):
                try:
                    if int(query.get("min_date")) > 0:
                        where.append("AND created_at >= %s")
                        replacements.append(datetime.datetime.fromtimestamp(int(query.get("min_date"))))
                except ValueError:
                    pass

            if query.get("max_date", None):
                try:
                    if int(query.get("max_date")) > 0:
                        where.append("AND created_at < %s")
                        replacements.append(datetime.datetime.fromtimestamp(int(query.get("max_date"))))
                except ValueError:
                    pass

            where = " ".join(where)
            # NOTE(review): bin_name is interpolated into the table name (MySQL
            # placeholders cannot parameterise identifiers); it originates from
            # the get_options choice list, but should be validated if that ever changes
            query = 'SELECT * FROM ' + bin_name + '_tweets WHERE ' + where
            self.dataset.log('Query: %s' % query)
            self.dataset.log('Replacements: %s' % ', '.join([str(i) for i in replacements]))
            # stream results with a server-side cursor, as above
            unbuffered_cursor = db.connection.cursor(pymysql.cursors.SSCursor)
            num_results = unbuffered_cursor.execute(query, replacements)
            # self.dataset.update_status("Retrieving %i results" % int(num_results)) # num_results is CLEARLY not what I thought
            column_names = [description[0] for description in unbuffered_cursor.description]
            for result in unbuffered_cursor.fetchall_unbuffered():
                new_result = {k: v for k, v in zip(column_names, result)}
                # Map tweet to 4CAT fields
                new_result = self.tweet_mapping(new_result)
                yield new_result

Use the DMI-TCAT tweet export to retrieve tweets

Parameters
  • query: query parameters, as produced by validate_query
Yields
  • dict: a mapped_tweet for "basic" queries; for "advanced" queries, a dictionary per MySQL result row
@staticmethod
def tweet_mapping(tweet):
232    @staticmethod
233    def tweet_mapping(tweet):
234        """
235        Takes TCAT output from specific tables and maps them to 4CAT expected fields. The expected fields attempt to
236        mirror that mapped_tweet from twitterv2 datasource.
237
238        :param dict tweet: TCAT dict returned from query; expected to be from bin tweets table
239        :return dict:
240        """
241        mapped_tweet = {'id': tweet.get('id', ''),
242                        # For thread_id, we use in_reply_to_status_id if tweet is reply, retweet_id if tweet is
243                        # retweet, or its own ID
244                        # Note: tweets can have BOTH in_reply_to_status_id and retweet_id as you can retweet a reply
245                        # or reply to retweet.
246                        # THIS IS DIFFERENT from Twitter APIv2 as there does not appear to be a quote ID (for retweets
247                        # with added text)
248                        'thread_id': tweet.get("in_reply_to_status_id") if tweet.get(
249                                               "in_reply_to_status_id") else tweet.get("retweet_id") if tweet.get(
250                                               "retweet_id") else tweet.get("id"),
251                        'body': tweet.get('text', ''),
252                        # 'created_at': tweet.get('created_at'),
253                        'timestamp': int(datetime.datetime.timestamp(tweet.get('created_at'))) if type(
254                                                tweet.get('created_at')) == datetime.datetime else None,
255                        'subject': '',
256                        'author': tweet.get('from_user_name', ''),
257                        "author_fullname": tweet["from_user_realname"],
258                        "author_id": tweet["from_user_id"],
259                        "source": tweet["source"],
260                        "language_guess": tweet.get("lang"),
261
262                        "retweet_count": tweet["retweet_count"],
263                        "like_count": tweet["favorite_count"],
264                        "is_retweet": "yes" if tweet.get('retweet_id', False) else "no",
265                        "is_reply": "yes" if tweet["in_reply_to_status_id"] else "no",
266                        "in_reply_to_status_id": tweet["in_reply_to_status_id"] if tweet["in_reply_to_status_id"] else None,
267                        "reply_to": tweet["to_user_name"],
268                        "reply_to_id": tweet.get('to_user_id') if tweet.get('to_user_id') else None,
269
270                        # 4CAT specifics
271                        "hashtags": ",".join(re.findall(r"#([^\s!@#$%^&*()_+{}:\"|<>?\[\];'\,./`~]+)", tweet["text"])),
272                        "urls": ",".join(ural.urls_from_text(tweet["text"])),
273                        "images": ",".join(re.findall(r"https://t\.co/[a-zA-Z0-9]+$", tweet["text"])),
274                        "mentions": ",".join(re.findall(r"@([^\s!@#$%^&*()+{}:\"|<>?\[\];'\,./`~]+)", tweet["text"])),
275
276                        # Additional TCAT data (compared to twitterv2 map_item function)
277                        "filter_level": tweet['filter_level'],
278                        'location': tweet['location'],
279                        'latitude': tweet['geo_lat'] if tweet['geo_lat'] else None,
280                        'longitude': tweet['geo_lng'] if tweet['geo_lng'] else None,
281                        'author_verified': tweet['from_user_verified'] if tweet['from_user_verified'] else None,
282                        'author_description': tweet['from_user_description'],
283                        'author_url': tweet['from_user_url'],
284                        'author_profile_image': tweet['from_user_profile_image_url'],
285                        'author_timezone_UTC_offset': int((int(tweet['from_user_utcoffset']) if
286                                                           tweet['from_user_utcoffset'] else 0)/60/60),
287                        'author_timezone_name': tweet['from_user_timezone'],
288                        'author_language': tweet['from_user_lang'],
289                        'author_tweet_count': tweet['from_user_tweetcount'],
290                        'author_follower_count': tweet['from_user_followercount'],
291                        'author_friend_following_count': tweet['from_user_friendcount'],
292                        'author_favorite_count': tweet.get('from_user_favourites_count'), # NOT in tweets table?
293                        'author_listed_count': tweet['from_user_listed'] if tweet['from_user_listed'] else None,
294                        'author_withheld_scope': tweet.get('from_user_withheld_scope'), # NOT in tweets table?
295                        'author_created_at': tweet.get('from_user_created_at'), # NOT in tweets table?
296
297                        # TODO find in other TCAT tables or does not exist
298                        # "possibly_sensitive": "yes" if tweet.get("possibly_sensitive") not in ("", "0") else "no",
299                        # "is_quote_tweet": "yes" if tweet["quoted_status_id"] else "no",
300                        # 'withheld_copyright': tweet['withheld_copyright'],  # TCAT may no collect this anymore
301                        # 'withheld_scope': tweet['withheld_scope'], # TCAT may no collect this anymore
302                        # 'truncated': tweet['truncated'], # Older tweets could be truncated meaning their text was cut off due to Twitter/TCAT db character limits
303
304                        }
305
306        # Ensure that any keys not specifically mapped to another field are added to the new mapped_tweet
307        mapped_keys = ['id', 'text', 'created_at', 'from_user_name', 'from_user_realname', 'from_user_id',
308                       'from_user_lang', 'from_user_tweetcount', 'from_user_followercount', 'from_user_friendcount',
309                       'from_user_listed', 'from_user_utcoffset', 'from_user_timezone', 'from_user_description',
310                       'from_user_url', 'from_user_verified', 'from_user_profile_image_url', 'source', 'lang',
311                       'filter_level', 'location', 'to_user_name', 'to_user_id', 'geo_lat', 'geo_lng', 'retweet_count',
312                       'in_reply_to_status_id']
313        for key in tweet.keys():
314            if key not in mapped_keys:
315                index = ''
316                while key + index in mapped_tweet.keys():
317                    index += '_1'
318                mapped_tweet[key + index] = tweet.get(key)
319
320        return mapped_tweet

Takes TCAT output from specific tables and maps them to 4CAT expected fields. The expected fields attempt to mirror that mapped_tweet from twitterv2 datasource.

Parameters
  • dict tweet: TCAT dict returned from query; expected to be from bin tweets table
Returns
@classmethod
def collect_tcat_metadata(cls):
323    @classmethod
324    def collect_tcat_metadata(cls):
325        """
326        Collect specific metadata from TCAT instances listed in the configuration and return a dictionary containing
327        this data. To be used to infor the user of available TCAT bins and create the options from which a user will
328        select.
329
330        :return dict: All of the available bins from accessible TCAT instances
331        """
332
333        # todo: cache this somehow! and check for the cache
334        instances = config.get("dmi-tcatv2.instances", [])
335
336        all_bins = {}
337        for instance in instances:
338            # Query each instance for bins
339            db = MySQLDatabase(logger=None,
340                               dbname=instance.get('db_name'),
341                               user=instance.get('db_user'),
342                               password=instance.get('db_password'),
343                               host=instance.get('db_host'),
344                               port=instance.get('db_port'))
345            # try:
346            instance_bins = db.fetchall('SELECT id, querybin, type from tcat_query_bins')
347            # except:
348
349            instance_bins_metadata = {}
350            for instance_bin in instance_bins:
351                bin_data = {
352                    'instance': instance,
353                    'querybin': instance_bin['querybin'],
354                }
355
356                # Query for number of tweets
357                tweet_count = db.fetchone('SELECT COUNT(id) from ' + instance_bin['querybin'] + '_tweets')[
358                    'COUNT(id)']
359                bin_data['tweet_count'] = tweet_count
360
361                # Collect first and last tweet datetimes
362                first_tweet_datetime = db.fetchone('SELECT created_at from ' + instance_bin['querybin'] + '_tweets ORDER BY created_at ASC LIMIT 1')['created_at']
363                last_tweet_datetime = db.fetchone('SELECT created_at from ' + instance_bin['querybin'] + '_tweets ORDER BY created_at DESC LIMIT 1')['created_at']
364                bin_data['first_tweet_datetime'] = first_tweet_datetime
365                bin_data['last_tweet_datetime'] = last_tweet_datetime
366
367                # Could check if bin currently should be collecting
368                # db.fetchall('SELECT EXISTS ( SELECT endtime from tcat_query_bins_periods WHERE querybin_id = ' + str(instance_bin['id']) + ' and endtime = "0000-00-00 00:00:00" ) as active')
369
370                # Could collect phrases or usernames...
371                # if instance_bin['type'] in []:
372                # elif instance_bin['type'] in []:
373
374                # Could collect all periods for nuanced metadata...
375                #periods = db.fetchall('SELECT starttime, endtime from tcat_query_bins_periods WHERE query_bin_id = ' + instance_bin['id'])
376
377                # Add bin_data to instance collection
378                instance_bins_metadata[instance_bin['querybin']] = bin_data
379            # Add bins to instance
380            all_bins[instance['tcat_name']] = instance_bins_metadata
381
382        return all_bins

Collect specific metadata from TCAT instances listed in the configuration and return a dictionary containing this data. To be used to inform the user of available TCAT bins and create the options from which a user will select.

Returns

All of the available bins from accessible TCAT instances

@staticmethod
def validate_query(query, request, user):
384    @staticmethod
385    def validate_query(query, request, user):
386        """
387        Validate DMI-TCAT query input
388
389        :param dict query:  Query parameters, from client-side.
390        :param request:  Flask request
391        :param User user:  User object of user who has submitted the query
392        :return dict:  Safe query parameters
393        """
394        # no query 4 u
395        if not query.get("bin", "").strip():
396            raise QueryParametersException("You must choose a query bin to get tweets from.")
397
398        # the dates need to make sense as a range to search within
399        # and a date range is needed, to not make it too easy to just get all tweets
400        after, before = query.get("daterange")
401        if (after and before) and not before <= after:
402            raise QueryParametersException("A date range must start before it ends")
403
404        query["min_date"], query["max_date"] = query.get("daterange")
405        del query["daterange"]
406
407        # simple!
408        return query

Validate DMI-TCAT query input

Parameters
  • dict query: Query parameters, from client-side.
  • request: Flask request
  • User user: User object of user who has submitted the query
Returns

Safe query parameters