Edit on GitHub

datasources.dmi-tcatv2.search_tcat_v2

Twitter search within a DMI-TCAT bin (v2): direct database connection

  1"""
  2Twitter search within a DMI-TCAT bin v2
  3Direct database connection
  4"""
  5import datetime
  6import re
  7import ural
  8import pymysql
  9
 10
 11from backend.lib.search import Search
 12from common.lib.exceptions import QueryParametersException
 13from common.lib.user_input import UserInput
 14from backend.lib.database_mysql import MySQLDatabase
 15
 16
class SearchWithinTCATBinsV2(Search):
    """
    Get Tweets via DMI-TCAT

    This allows subsetting an existing query bin, similar to the 'Data
    Selection' panel in the DMI-TCAT analysis interface
    """
    # Job/worker identifier for this datasource
    type = "dmi-tcatv2-search"  # job ID
    # Result datasets are written as CSV
    extension = "csv"
    # Human-readable datasource title shown in the interface
    title = "TCAT Search (SQL)"

    # 4CAT configuration options for this datasource. Database credentials
    # for each TCAT instance must be provided as a JSON list of objects.
    config = {
        "dmi-tcatv2-search.database_instances": {
            "type": UserInput.OPTION_TEXT_JSON,
            "help": "DMI-TCAT instances",
            "tooltip": "List of DMI-TCAT instance metadata, e.g. [{'tcat_name': 'tcat2','db_name': 'twittercapture',"
                    "'db_user': 'username','db_password': 'password','db_host': '127.0.0.1','db_port': 3306}] All of "
                    "these values need to be provided for each instance. This needs to be formatted as a JSON list of "
                    "objects.",
            "default": {}
        }
    }
 39
 40    @classmethod
 41    def get_options(cls, parent_dataset=None, config=None):
 42        """
 43        Get data source options
 44
 45        This method takes the pre-defined options, but fills the 'bins' options
 46        with bins currently available from the configured TCAT instances.
 47
 48        :param config:
 49        :param DataSet parent_dataset:  An object representing the dataset that
 50        the processor would be run on can
 51        be used to show some options only to privileges users.
 52        """
 53        options = {
 54            "intro-1": {
 55                "type": UserInput.OPTION_INFO,
 56                "help": "This data source interfaces with a DMI-TCAT instance to allow subsetting of tweets from a tweet "
 57                        "bin in that instance."
 58            },
 59            "divider-1": {
 60                "type": UserInput.OPTION_DIVIDER
 61            },
 62            "bin": {
 63                "type": UserInput.OPTION_INFO,
 64                "help": "Query bin"
 65            },
 66            "query_type": {
 67                "type": UserInput.OPTION_CHOICE,
 68                "help": "Basic or Advanced Query",
 69                "options": {
 70                    "basic": "Basic query all bin tweets for specific text (Query) and date (Date range)",
 71                    "advanced": "Select queries on any TCAT twitter tables"
 72                },
 73                "default": "basic",
 74                "tooltip": "Advanced queries do not provide scaffolding, so understanding TCAT database structure is necessary"
 75            },
 76            "query": {
 77                "type": UserInput.OPTION_TEXT,
 78                "help": "Query text",
 79                "tooltip": "Match all tweets containing this text."
 80            },
 81            "daterange": {
 82                "type": UserInput.OPTION_DATERANGE,
 83                "help": "Date range"
 84            }
 85        }
 86
 87        # Collect Metadata from TCAT instances
 88        all_bins = cls.collect_tcat_metadata(config)
 89
 90        options["bin"] = {
 91            "type": UserInput.OPTION_CHOICE,
 92            "options": {},
 93            "help": "Query bin"
 94        }
 95
 96        for instance_name, bins in all_bins.items():
 97            for bin_name, bin in bins.items():
 98                bin_key = "%s@%s" % (bin_name, instance_name)
 99                display_text = f"{bin_name}: {bin.get('tweet_count')} tweets from {bin.get('first_tweet_datetime').strftime('%Y-%m-%d %H:%M:%S')} to {bin.get('last_tweet_datetime').strftime('%Y-%m-%d %H:%M:%S')}"
100                options["bin"]["options"][bin_key] = display_text
101
102        return options
103
    def get_items(self, query):
        """
        Use the DMI-TCAT tweet export to retrieve tweets

        :param query:
        :yield dict: mapped_tweet for any "basic" query else for "advanced" queries a dictionary with mysql result
        """
        # "bin" is formatted as "<bin name>@<instance name>" (see get_options)
        bin = self.parameters.get("bin")
        bin_name = bin.split("@")[0]
        # NOTE(review): .pop() takes the LAST "@"-separated segment; assumes
        # bin names themselves contain no "@" — confirm against TCAT naming
        tcat_name = bin.split("@").pop()

        # Look up the configured database credentials for this bin's instance
        # NOTE(review): raises IndexError if the instance has been removed
        # from the configuration since the query was created
        available_instances = self.config.get("dmi-tcatv2-search.database_instances", [])
        instance = [instance for instance in available_instances if instance.get('tcat_name') == tcat_name][0]

        db = MySQLDatabase(logger=self.log,
                           dbname=instance.get('db_name'),
                           user=instance.get('db_user'),
                           password=instance.get('db_password'),
                           host=instance.get('db_host'),
                           port=instance.get('db_port'))

        self.dataset.update_status("Searching for tweets on %s" % bin_name)
        if self.parameters.get("query_type") == 'advanced':
            # Advanced query should be simple from our perspective...
            # The user-supplied SQL is executed verbatim. A server-side
            # (unbuffered) cursor streams rows instead of loading the whole
            # result set into memory.
            self.dataset.log('Query: %s' % self.parameters.get("query"))
            unbuffered_cursor = db.connection.cursor(pymysql.cursors.SSCursor)
            try:
                unbuffered_cursor.execute(self.parameters.get("query"))
            except pymysql.err.ProgrammingError as e:
                # Invalid SQL from the user: surface the error and stop
                self.dataset.update_status("SQL query error: %s" % str(e), is_final=True)
                return
            # self.dataset.update_status("Retrieving %i results" % int(num_results)) # num_results is CLEARLY not what I thought
            # Get column names from cursor
            column_names = [description[0] for description in unbuffered_cursor.description]
            for result in unbuffered_cursor.fetchall_unbuffered():
                # Reformat result (which is a tuple with each column in the row) to dict
                new_result = {k: v for k, v in zip(column_names, result)}
                # 4CAT necessary fieldnames
                new_result['id'] = new_result.get('id', '')
                # thread_id: reply target if reply, else quoted tweet, else own id
                new_result['thread_id'] = new_result.get("in_reply_to_status_id") if new_result.get(
                    "in_reply_to_status_id") else new_result.get("quoted_status_id") if new_result.get(
                    "quoted_status_id") else new_result.get("id")
                new_result['body'] = new_result.get('text', '')
                new_result['timestamp'] = new_result.get('created_at', None)
                new_result['subject'] = ''
                new_result['author'] = new_result.get('from_user_name', '')
                yield new_result

        else:
            # "Basic" query
            text_query = self.parameters.get("query")

            # Build a WHERE clause by splitting the query text on literal
            # " AND " / " OR " tokens; each segment becomes a
            # `lower(text) LIKE %s` condition joined by the matched operator
            where = []
            replacements = []
            # Find AND and OR
            placeholder = 0      # index into text_query where the current segment starts
            start_of_match = 0   # operator offset relative to placeholder; -1 ends the loop
            while start_of_match >= 0:
                match = None
                and_match = text_query[placeholder:].find(' AND ')
                or_match = text_query[placeholder:].find(' OR ')
                if and_match != -1 and or_match != -1:
                    # both found; take whichever operator occurs first
                    if and_match < or_match:
                        # and match first
                        match = 'AND '
                        start_of_match = and_match
                    else:
                        # or match first
                        match ='OR '
                        start_of_match = or_match
                elif and_match != -1:
                    # and match only
                    match ='AND '
                    start_of_match = and_match
                elif or_match != -1:
                    # or match only
                    match = 'OR '
                    start_of_match = or_match
                else:
                    # neither
                    match = None
                    start_of_match = -1
                # Add partial query to where and replacements
                if match:
                    where.append('lower(text) LIKE %s ' + match)
                    replacements.append('%'+text_query[placeholder:placeholder+start_of_match].lower().strip()+'%')
                    # new start
                    # len(match) is one char short of the " AND "/" OR " token
                    # that was searched for; the leftover leading space in the
                    # next segment is removed by .strip() above
                    placeholder = placeholder + start_of_match + len(match)
                else:
                    # final (or only) segment
                    where.append('lower(text) LIKE %s')
                    replacements.append('%'+text_query[placeholder:].lower().strip()+'%')

            # Optional date bounds; min_date/max_date are unix timestamps set
            # by validate_query
            if query.get("min_date", None):
                try:
                    if int(query.get("min_date")) > 0:
                        where.append("AND created_at >= %s")
                        replacements.append(datetime.datetime.fromtimestamp(int(query.get("min_date"))))
                except ValueError:
                    pass

            if query.get("max_date", None):
                try:
                    if int(query.get("max_date")) > 0:
                        where.append("AND created_at < %s")
                        replacements.append(datetime.datetime.fromtimestamp(int(query.get("max_date"))))
                except ValueError:
                    pass

            where = " ".join(where)
            # NOTE(review): bin_name is interpolated directly into the table
            # name (placeholders cannot parameterize SQL identifiers); it
            # derives from the validated "bin" choice rather than free text
            query = 'SELECT * FROM ' + bin_name + '_tweets WHERE ' + where
            self.dataset.log('Query: %s' % query)
            self.dataset.log('Replacements: %s' % ', '.join([str(i) for i in replacements]))
            unbuffered_cursor = db.connection.cursor(pymysql.cursors.SSCursor)
            unbuffered_cursor.execute(query, replacements)
            # self.dataset.update_status("Retrieving %i results" % int(num_results)) # num_results is CLEARLY not what I thought
            column_names = [description[0] for description in unbuffered_cursor.description]
            for result in unbuffered_cursor.fetchall_unbuffered():
                new_result = {k: v for k, v in zip(column_names, result)}
                # Map tweet to 4CAT fields
                new_result = self.tweet_mapping(new_result)
                yield new_result
226
    @staticmethod
    def tweet_mapping(tweet):
        """
        Takes TCAT output from specific tables and maps them to 4CAT expected fields. The expected fields attempt to
        mirror that mapped_tweet from twitterv2 datasource.

        :param dict tweet: TCAT dict returned from query; expected to be from bin tweets table
        :return dict:
        """
        # NOTE(review): keys read with tweet[...] (rather than tweet.get())
        # raise KeyError when absent, so this assumes a complete row from the
        # bin's _tweets table — confirm before reusing with other tables
        mapped_tweet = {'id': tweet.get('id', ''),
                        # For thread_id, we use in_reply_to_status_id if tweet is reply, retweet_id if tweet is
                        # retweet, or its own ID
                        # Note: tweets can have BOTH in_reply_to_status_id and retweet_id as you can retweet a reply
                        # or reply to retweet.
                        # THIS IS DIFFERENT from Twitter APIv2 as there does not appear to be a quote ID (for retweets
                        # with added text)
                        'thread_id': tweet.get("in_reply_to_status_id") if tweet.get(
                                               "in_reply_to_status_id") else tweet.get("retweet_id") if tweet.get(
                                               "retweet_id") else tweet.get("id"),
                        'body': tweet.get('text', ''),
                        # 'created_at': tweet.get('created_at'),
                        # Unix timestamp if TCAT returned a datetime, else None
                        'timestamp': int(datetime.datetime.timestamp(tweet.get('created_at'))) if type(
                                                tweet.get('created_at')) is datetime.datetime else None,
                        'subject': '',
                        'author': tweet.get('from_user_name', ''),
                        "author_fullname": tweet["from_user_realname"],
                        "author_id": tweet["from_user_id"],
                        "source": tweet["source"],
                        "language_guess": tweet.get("lang"),

                        "retweet_count": tweet["retweet_count"],
                        "like_count": tweet["favorite_count"],
                        "is_retweet": "yes" if tweet.get('retweet_id', False) else "no",
                        "is_reply": "yes" if tweet["in_reply_to_status_id"] else "no",
                        "in_reply_to_status_id": tweet["in_reply_to_status_id"] if tweet["in_reply_to_status_id"] else None,
                        "reply_to": tweet["to_user_name"],
                        "reply_to_id": tweet.get('to_user_id') if tweet.get('to_user_id') else None,

                        # 4CAT specifics
                        # Hashtags/mentions are re-extracted from the tweet text
                        # rather than read from TCAT's hashtag tables
                        "hashtags": ",".join(re.findall(r"#([^\s!@#$%^&*()_+{}:\"|<>?\[\];'\,./`~]+)", tweet["text"])),
                        "urls": ",".join(ural.urls_from_text(tweet["text"])),
                        # NOTE(review): only matches a trailing t.co link, which is
                        # how Twitter appends media URLs — other URLs are ignored
                        "images": ",".join(re.findall(r"https://t\.co/[a-zA-Z0-9]+$", tweet["text"])),
                        "mentions": ",".join(re.findall(r"@([^\s!@#$%^&*()+{}:\"|<>?\[\];'\,./`~]+)", tweet["text"])),

                        # Additional TCAT data (compared to twitterv2 map_item function)
                        "filter_level": tweet['filter_level'],
                        'location': tweet['location'],
                        'latitude': tweet['geo_lat'] if tweet['geo_lat'] else None,
                        'longitude': tweet['geo_lng'] if tweet['geo_lng'] else None,
                        'author_verified': tweet['from_user_verified'] if tweet['from_user_verified'] else None,
                        'author_description': tweet['from_user_description'],
                        'author_url': tweet['from_user_url'],
                        'author_profile_image': tweet['from_user_profile_image_url'],
                        # TCAT stores the offset in seconds; convert to whole hours
                        'author_timezone_UTC_offset': int((int(tweet['from_user_utcoffset']) if
                                                           tweet['from_user_utcoffset'] else 0)/60/60),
                        'author_timezone_name': tweet['from_user_timezone'],
                        'author_language': tweet['from_user_lang'],
                        'author_tweet_count': tweet['from_user_tweetcount'],
                        'author_follower_count': tweet['from_user_followercount'],
                        'author_friend_following_count': tweet['from_user_friendcount'],
                        'author_favorite_count': tweet.get('from_user_favourites_count'), # NOT in tweets table?
                        'author_listed_count': tweet['from_user_listed'] if tweet['from_user_listed'] else None,
                        'author_withheld_scope': tweet.get('from_user_withheld_scope'), # NOT in tweets table?
                        'author_created_at': tweet.get('from_user_created_at'), # NOT in tweets table?

                        # TODO find in other TCAT tables or does not exist
                        # "possibly_sensitive": "yes" if tweet.get("possibly_sensitive") not in ("", "0") else "no",
                        # "is_quote_tweet": "yes" if tweet["quoted_status_id"] else "no",
                        # 'withheld_copyright': tweet['withheld_copyright'],  # TCAT may no collect this anymore
                        # 'withheld_scope': tweet['withheld_scope'], # TCAT may no collect this anymore
                        # 'truncated': tweet['truncated'], # Older tweets could be truncated meaning their text was cut off due to Twitter/TCAT db character limits

                        }

        # Ensure that any keys not specifically mapped to another field are added to the new mapped_tweet
        mapped_keys = ['id', 'text', 'created_at', 'from_user_name', 'from_user_realname', 'from_user_id',
                       'from_user_lang', 'from_user_tweetcount', 'from_user_followercount', 'from_user_friendcount',
                       'from_user_listed', 'from_user_utcoffset', 'from_user_timezone', 'from_user_description',
                       'from_user_url', 'from_user_verified', 'from_user_profile_image_url', 'source', 'lang',
                       'filter_level', 'location', 'to_user_name', 'to_user_id', 'geo_lat', 'geo_lng', 'retweet_count',
                       'in_reply_to_status_id']
        for key in tweet.keys():
            if key not in mapped_keys:
                index = ''
                # NOTE(review): on a name collision this appends "_1"
                # repeatedly ("_1", "_1_1", ...) rather than counting upwards
                while key + index in mapped_tweet.keys():
                    index += '_1'
                mapped_tweet[key + index] = tweet.get(key)

        return mapped_tweet
316
317
318    @classmethod
319    def collect_tcat_metadata(cls, config):
320        """
321        Collect specific metadata from TCAT instances listed in the configuration and return a dictionary containing
322        this data. To be used to infor the user of available TCAT bins and create the options from which a user will
323        select.
324
325        :return dict: All of the available bins from accessible TCAT instances
326        """
327
328        # todo: cache this somehow! and check for the cache
329        instances = config.get("dmi-tcatv2.instances", [])
330
331        all_bins = {}
332        for instance in instances:
333            # Query each instance for bins
334            db = MySQLDatabase(logger=None,
335                               dbname=instance.get('db_name'),
336                               user=instance.get('db_user'),
337                               password=instance.get('db_password'),
338                               host=instance.get('db_host'),
339                               port=instance.get('db_port'))
340            # try:
341            instance_bins = db.fetchall('SELECT id, querybin, type from tcat_query_bins')
342            # except:
343
344            instance_bins_metadata = {}
345            for instance_bin in instance_bins:
346                bin_data = {
347                    'instance': instance,
348                    'querybin': instance_bin['querybin'],
349                }
350
351                # Query for number of tweets
352                tweet_count = db.fetchone('SELECT COUNT(id) from ' + instance_bin['querybin'] + '_tweets')[
353                    'COUNT(id)']
354                bin_data['tweet_count'] = tweet_count
355
356                # Collect first and last tweet datetimes
357                first_tweet_datetime = db.fetchone('SELECT created_at from ' + instance_bin['querybin'] + '_tweets ORDER BY created_at ASC LIMIT 1')['created_at']
358                last_tweet_datetime = db.fetchone('SELECT created_at from ' + instance_bin['querybin'] + '_tweets ORDER BY created_at DESC LIMIT 1')['created_at']
359                bin_data['first_tweet_datetime'] = first_tweet_datetime
360                bin_data['last_tweet_datetime'] = last_tweet_datetime
361
362                # Could check if bin currently should be collecting
363                # db.fetchall('SELECT EXISTS ( SELECT endtime from tcat_query_bins_periods WHERE querybin_id = ' + str(instance_bin['id']) + ' and endtime = "0000-00-00 00:00:00" ) as active')
364
365                # Could collect phrases or usernames...
366                # if instance_bin['type'] in []:
367                # elif instance_bin['type'] in []:
368
369                # Could collect all periods for nuanced metadata...
370                #periods = db.fetchall('SELECT starttime, endtime from tcat_query_bins_periods WHERE query_bin_id = ' + instance_bin['id'])
371
372                # Add bin_data to instance collection
373                instance_bins_metadata[instance_bin['querybin']] = bin_data
374            # Add bins to instance
375            all_bins[instance['tcat_name']] = instance_bins_metadata
376
377        return all_bins
378
379    @staticmethod
380    def validate_query(query, request, config):
381        """
382        Validate DMI-TCAT query input
383
384        :param dict query:  Query parameters, from client-side.
385        :param request:  Flask request
386        :param ConfigManager|None config:  Configuration reader (context-aware)
387        :return dict:  Safe query parameters
388        """
389        # no query 4 u
390        if not query.get("bin", ""):
391            raise QueryParametersException("You must choose a query bin to get tweets from.")
392
393        # the dates need to make sense as a range to search within
394        # and a date range is needed, to not make it too easy to just get all tweets
395        after, before = query.get("daterange")
396        if (after and before) and not before <= after:
397            raise QueryParametersException("A date range must start before it ends")
398
399        query["min_date"], query["max_date"] = query.get("daterange")
400        query["bin"] = query.get("bin", "").strip()
401        del query["daterange"]
402
403        # simple!
404        return query
class SearchWithinTCATBinsV2(backend.lib.search.Search):
 18class SearchWithinTCATBinsV2(Search):
 19    """
 20    Get Tweets via DMI-TCAT
 21
 22    This allows subsetting an existing query bin, similar to the 'Data
 23    Selection' panel in the DMI-TCAT analysis interface
 24    """
 25    type = "dmi-tcatv2-search"  # job ID
 26    extension = "csv"
 27    title = "TCAT Search (SQL)"
 28
 29    config = {
 30        "dmi-tcatv2-search.database_instances": {
 31            "type": UserInput.OPTION_TEXT_JSON,
 32            "help": "DMI-TCAT instances",
 33            "tooltip": "List of DMI-TCAT instance metadata, e.g. [{'tcat_name': 'tcat2','db_name': 'twittercapture',"
 34                    "'db_user': 'username','db_password': 'password','db_host': '127.0.0.1','db_port': 3306}] All of "
 35                    "these values need to be provided for each instance. This needs to be formatted as a JSON list of "
 36                    "objects.",
 37            "default": {}
 38        }
 39    }
 40
 41    @classmethod
 42    def get_options(cls, parent_dataset=None, config=None):
 43        """
 44        Get data source options
 45
 46        This method takes the pre-defined options, but fills the 'bins' options
 47        with bins currently available from the configured TCAT instances.
 48
 49        :param config:
 50        :param DataSet parent_dataset:  An object representing the dataset that
 51        the processor would be run on can
 52        be used to show some options only to privileges users.
 53        """
 54        options = {
 55            "intro-1": {
 56                "type": UserInput.OPTION_INFO,
 57                "help": "This data source interfaces with a DMI-TCAT instance to allow subsetting of tweets from a tweet "
 58                        "bin in that instance."
 59            },
 60            "divider-1": {
 61                "type": UserInput.OPTION_DIVIDER
 62            },
 63            "bin": {
 64                "type": UserInput.OPTION_INFO,
 65                "help": "Query bin"
 66            },
 67            "query_type": {
 68                "type": UserInput.OPTION_CHOICE,
 69                "help": "Basic or Advanced Query",
 70                "options": {
 71                    "basic": "Basic query all bin tweets for specific text (Query) and date (Date range)",
 72                    "advanced": "Select queries on any TCAT twitter tables"
 73                },
 74                "default": "basic",
 75                "tooltip": "Advanced queries do not provide scaffolding, so understanding TCAT database structure is necessary"
 76            },
 77            "query": {
 78                "type": UserInput.OPTION_TEXT,
 79                "help": "Query text",
 80                "tooltip": "Match all tweets containing this text."
 81            },
 82            "daterange": {
 83                "type": UserInput.OPTION_DATERANGE,
 84                "help": "Date range"
 85            }
 86        }
 87
 88        # Collect Metadata from TCAT instances
 89        all_bins = cls.collect_tcat_metadata(config)
 90
 91        options["bin"] = {
 92            "type": UserInput.OPTION_CHOICE,
 93            "options": {},
 94            "help": "Query bin"
 95        }
 96
 97        for instance_name, bins in all_bins.items():
 98            for bin_name, bin in bins.items():
 99                bin_key = "%s@%s" % (bin_name, instance_name)
100                display_text = f"{bin_name}: {bin.get('tweet_count')} tweets from {bin.get('first_tweet_datetime').strftime('%Y-%m-%d %H:%M:%S')} to {bin.get('last_tweet_datetime').strftime('%Y-%m-%d %H:%M:%S')}"
101                options["bin"]["options"][bin_key] = display_text
102
103        return options
104
105    def get_items(self, query):
106        """
107        Use the DMI-TCAT tweet export to retrieve tweets
108
109        :param query:
110        :yield dict: mapped_tweet for any "basic" query else for "advanced" queries a dictionary with mysql result
111        """
112        bin = self.parameters.get("bin")
113        bin_name = bin.split("@")[0]
114        tcat_name = bin.split("@").pop()
115
116        available_instances = self.config.get("dmi-tcatv2-search.database_instances", [])
117        instance = [instance for instance in available_instances if instance.get('tcat_name') == tcat_name][0]
118
119        db = MySQLDatabase(logger=self.log,
120                           dbname=instance.get('db_name'),
121                           user=instance.get('db_user'),
122                           password=instance.get('db_password'),
123                           host=instance.get('db_host'),
124                           port=instance.get('db_port'))
125
126        self.dataset.update_status("Searching for tweets on %s" % bin_name)
127        if self.parameters.get("query_type") == 'advanced':
128            # Advanced query should be simple from our perspective...
129            self.dataset.log('Query: %s' % self.parameters.get("query"))
130            unbuffered_cursor = db.connection.cursor(pymysql.cursors.SSCursor)
131            try:
132                unbuffered_cursor.execute(self.parameters.get("query"))
133            except pymysql.err.ProgrammingError as e:
134                self.dataset.update_status("SQL query error: %s" % str(e), is_final=True)
135                return
136            # self.dataset.update_status("Retrieving %i results" % int(num_results)) # num_results is CLEARLY not what I thought
137            # Get column names from cursor
138            column_names = [description[0] for description in unbuffered_cursor.description]
139            for result in unbuffered_cursor.fetchall_unbuffered():
140                # Reformat result (which is a tuple with each column in the row) to dict
141                new_result = {k: v for k, v in zip(column_names, result)}
142                # 4CAT necessary fieldnames
143                new_result['id'] = new_result.get('id', '')
144                new_result['thread_id'] = new_result.get("in_reply_to_status_id") if new_result.get(
145                    "in_reply_to_status_id") else new_result.get("quoted_status_id") if new_result.get(
146                    "quoted_status_id") else new_result.get("id")
147                new_result['body'] = new_result.get('text', '')
148                new_result['timestamp'] = new_result.get('created_at', None)
149                new_result['subject'] = ''
150                new_result['author'] = new_result.get('from_user_name', '')
151                yield new_result
152
153        else:
154            # "Basic" query
155            text_query = self.parameters.get("query")
156
157            where = []
158            replacements = []
159            # Find AND and OR
160            placeholder = 0
161            start_of_match = 0
162            while start_of_match >= 0:
163                match = None
164                and_match = text_query[placeholder:].find(' AND ')
165                or_match = text_query[placeholder:].find(' OR ')
166                if and_match != -1 and or_match != -1:
167                    # both found
168                    if and_match < or_match:
169                        # and match first
170                        match = 'AND '
171                        start_of_match = and_match
172                    else:
173                        # or match first
174                        match ='OR '
175                        start_of_match = or_match
176                elif and_match != -1:
177                    # and match only
178                    match ='AND '
179                    start_of_match = and_match
180                elif or_match != -1:
181                    # or match only
182                    match = 'OR '
183                    start_of_match = or_match
184                else:
185                    # neither
186                    match = None
187                    start_of_match = -1
188                # Add partial query to where and replacements
189                if match:
190                    where.append('lower(text) LIKE %s ' + match)
191                    replacements.append('%'+text_query[placeholder:placeholder+start_of_match].lower().strip()+'%')
192                    # new start
193                    placeholder = placeholder + start_of_match + len(match)
194                else:
195                    where.append('lower(text) LIKE %s')
196                    replacements.append('%'+text_query[placeholder:].lower().strip()+'%')
197
198            if query.get("min_date", None):
199                try:
200                    if int(query.get("min_date")) > 0:
201                        where.append("AND created_at >= %s")
202                        replacements.append(datetime.datetime.fromtimestamp(int(query.get("min_date"))))
203                except ValueError:
204                    pass
205
206            if query.get("max_date", None):
207                try:
208                    if int(query.get("max_date")) > 0:
209                        where.append("AND created_at < %s")
210                        replacements.append(datetime.datetime.fromtimestamp(int(query.get("max_date"))))
211                except ValueError:
212                    pass
213
214            where = " ".join(where)
215            query = 'SELECT * FROM ' + bin_name + '_tweets WHERE ' + where
216            self.dataset.log('Query: %s' % query)
217            self.dataset.log('Replacements: %s' % ', '.join([str(i) for i in replacements]))
218            unbuffered_cursor = db.connection.cursor(pymysql.cursors.SSCursor)
219            unbuffered_cursor.execute(query, replacements)
220            # self.dataset.update_status("Retrieving %i results" % int(num_results)) # num_results is CLEARLY not what I thought
221            column_names = [description[0] for description in unbuffered_cursor.description]
222            for result in unbuffered_cursor.fetchall_unbuffered():
223                new_result = {k: v for k, v in zip(column_names, result)}
224                # Map tweet to 4CAT fields
225                new_result = self.tweet_mapping(new_result)
226                yield new_result
227
228    @staticmethod
229    def tweet_mapping(tweet):
230        """
231        Takes TCAT output from specific tables and maps them to 4CAT expected fields. The expected fields attempt to
232        mirror that mapped_tweet from twitterv2 datasource.
233
234        :param dict tweet: TCAT dict returned from query; expected to be from bin tweets table
235        :return dict:
236        """
237        mapped_tweet = {'id': tweet.get('id', ''),
238                        # For thread_id, we use in_reply_to_status_id if tweet is reply, retweet_id if tweet is
239                        # retweet, or its own ID
240                        # Note: tweets can have BOTH in_reply_to_status_id and retweet_id as you can retweet a reply
241                        # or reply to retweet.
242                        # THIS IS DIFFERENT from Twitter APIv2 as there does not appear to be a quote ID (for retweets
243                        # with added text)
244                        'thread_id': tweet.get("in_reply_to_status_id") if tweet.get(
245                                               "in_reply_to_status_id") else tweet.get("retweet_id") if tweet.get(
246                                               "retweet_id") else tweet.get("id"),
247                        'body': tweet.get('text', ''),
248                        # 'created_at': tweet.get('created_at'),
249                        'timestamp': int(datetime.datetime.timestamp(tweet.get('created_at'))) if type(
250                                                tweet.get('created_at')) is datetime.datetime else None,
251                        'subject': '',
252                        'author': tweet.get('from_user_name', ''),
253                        "author_fullname": tweet["from_user_realname"],
254                        "author_id": tweet["from_user_id"],
255                        "source": tweet["source"],
256                        "language_guess": tweet.get("lang"),
257
258                        "retweet_count": tweet["retweet_count"],
259                        "like_count": tweet["favorite_count"],
260                        "is_retweet": "yes" if tweet.get('retweet_id', False) else "no",
261                        "is_reply": "yes" if tweet["in_reply_to_status_id"] else "no",
262                        "in_reply_to_status_id": tweet["in_reply_to_status_id"] if tweet["in_reply_to_status_id"] else None,
263                        "reply_to": tweet["to_user_name"],
264                        "reply_to_id": tweet.get('to_user_id') if tweet.get('to_user_id') else None,
265
266                        # 4CAT specifics
267                        "hashtags": ",".join(re.findall(r"#([^\s!@#$%^&*()_+{}:\"|<>?\[\];'\,./`~]+)", tweet["text"])),
268                        "urls": ",".join(ural.urls_from_text(tweet["text"])),
269                        "images": ",".join(re.findall(r"https://t\.co/[a-zA-Z0-9]+$", tweet["text"])),
270                        "mentions": ",".join(re.findall(r"@([^\s!@#$%^&*()+{}:\"|<>?\[\];'\,./`~]+)", tweet["text"])),
271
272                        # Additional TCAT data (compared to twitterv2 map_item function)
273                        "filter_level": tweet['filter_level'],
274                        'location': tweet['location'],
275                        'latitude': tweet['geo_lat'] if tweet['geo_lat'] else None,
276                        'longitude': tweet['geo_lng'] if tweet['geo_lng'] else None,
277                        'author_verified': tweet['from_user_verified'] if tweet['from_user_verified'] else None,
278                        'author_description': tweet['from_user_description'],
279                        'author_url': tweet['from_user_url'],
280                        'author_profile_image': tweet['from_user_profile_image_url'],
281                        'author_timezone_UTC_offset': int((int(tweet['from_user_utcoffset']) if
282                                                           tweet['from_user_utcoffset'] else 0)/60/60),
283                        'author_timezone_name': tweet['from_user_timezone'],
284                        'author_language': tweet['from_user_lang'],
285                        'author_tweet_count': tweet['from_user_tweetcount'],
286                        'author_follower_count': tweet['from_user_followercount'],
287                        'author_friend_following_count': tweet['from_user_friendcount'],
288                        'author_favorite_count': tweet.get('from_user_favourites_count'), # NOT in tweets table?
289                        'author_listed_count': tweet['from_user_listed'] if tweet['from_user_listed'] else None,
290                        'author_withheld_scope': tweet.get('from_user_withheld_scope'), # NOT in tweets table?
291                        'author_created_at': tweet.get('from_user_created_at'), # NOT in tweets table?
292
293                        # TODO find in other TCAT tables or does not exist
294                        # "possibly_sensitive": "yes" if tweet.get("possibly_sensitive") not in ("", "0") else "no",
295                        # "is_quote_tweet": "yes" if tweet["quoted_status_id"] else "no",
296                        # 'withheld_copyright': tweet['withheld_copyright'],  # TCAT may no collect this anymore
297                        # 'withheld_scope': tweet['withheld_scope'], # TCAT may no collect this anymore
298                        # 'truncated': tweet['truncated'], # Older tweets could be truncated meaning their text was cut off due to Twitter/TCAT db character limits
299
300                        }
301
302        # Ensure that any keys not specifically mapped to another field are added to the new mapped_tweet
303        mapped_keys = ['id', 'text', 'created_at', 'from_user_name', 'from_user_realname', 'from_user_id',
304                       'from_user_lang', 'from_user_tweetcount', 'from_user_followercount', 'from_user_friendcount',
305                       'from_user_listed', 'from_user_utcoffset', 'from_user_timezone', 'from_user_description',
306                       'from_user_url', 'from_user_verified', 'from_user_profile_image_url', 'source', 'lang',
307                       'filter_level', 'location', 'to_user_name', 'to_user_id', 'geo_lat', 'geo_lng', 'retweet_count',
308                       'in_reply_to_status_id']
309        for key in tweet.keys():
310            if key not in mapped_keys:
311                index = ''
312                while key + index in mapped_tweet.keys():
313                    index += '_1'
314                mapped_tweet[key + index] = tweet.get(key)
315
316        return mapped_tweet
317
318
319    @classmethod
320    def collect_tcat_metadata(cls, config):
321        """
322        Collect specific metadata from TCAT instances listed in the configuration and return a dictionary containing
323        this data. To be used to infor the user of available TCAT bins and create the options from which a user will
324        select.
325
326        :return dict: All of the available bins from accessible TCAT instances
327        """
328
329        # todo: cache this somehow! and check for the cache
330        instances = config.get("dmi-tcatv2.instances", [])
331
332        all_bins = {}
333        for instance in instances:
334            # Query each instance for bins
335            db = MySQLDatabase(logger=None,
336                               dbname=instance.get('db_name'),
337                               user=instance.get('db_user'),
338                               password=instance.get('db_password'),
339                               host=instance.get('db_host'),
340                               port=instance.get('db_port'))
341            # try:
342            instance_bins = db.fetchall('SELECT id, querybin, type from tcat_query_bins')
343            # except:
344
345            instance_bins_metadata = {}
346            for instance_bin in instance_bins:
347                bin_data = {
348                    'instance': instance,
349                    'querybin': instance_bin['querybin'],
350                }
351
352                # Query for number of tweets
353                tweet_count = db.fetchone('SELECT COUNT(id) from ' + instance_bin['querybin'] + '_tweets')[
354                    'COUNT(id)']
355                bin_data['tweet_count'] = tweet_count
356
357                # Collect first and last tweet datetimes
358                first_tweet_datetime = db.fetchone('SELECT created_at from ' + instance_bin['querybin'] + '_tweets ORDER BY created_at ASC LIMIT 1')['created_at']
359                last_tweet_datetime = db.fetchone('SELECT created_at from ' + instance_bin['querybin'] + '_tweets ORDER BY created_at DESC LIMIT 1')['created_at']
360                bin_data['first_tweet_datetime'] = first_tweet_datetime
361                bin_data['last_tweet_datetime'] = last_tweet_datetime
362
363                # Could check if bin currently should be collecting
364                # db.fetchall('SELECT EXISTS ( SELECT endtime from tcat_query_bins_periods WHERE querybin_id = ' + str(instance_bin['id']) + ' and endtime = "0000-00-00 00:00:00" ) as active')
365
366                # Could collect phrases or usernames...
367                # if instance_bin['type'] in []:
368                # elif instance_bin['type'] in []:
369
370                # Could collect all periods for nuanced metadata...
371                #periods = db.fetchall('SELECT starttime, endtime from tcat_query_bins_periods WHERE query_bin_id = ' + instance_bin['id'])
372
373                # Add bin_data to instance collection
374                instance_bins_metadata[instance_bin['querybin']] = bin_data
375            # Add bins to instance
376            all_bins[instance['tcat_name']] = instance_bins_metadata
377
378        return all_bins
379
380    @staticmethod
381    def validate_query(query, request, config):
382        """
383        Validate DMI-TCAT query input
384
385        :param dict query:  Query parameters, from client-side.
386        :param request:  Flask request
387        :param ConfigManager|None config:  Configuration reader (context-aware)
388        :return dict:  Safe query parameters
389        """
390        # no query 4 u
391        if not query.get("bin", ""):
392            raise QueryParametersException("You must choose a query bin to get tweets from.")
393
394        # the dates need to make sense as a range to search within
395        # and a date range is needed, to not make it too easy to just get all tweets
396        after, before = query.get("daterange")
397        if (after and before) and not before <= after:
398            raise QueryParametersException("A date range must start before it ends")
399
400        query["min_date"], query["max_date"] = query.get("daterange")
401        query["bin"] = query.get("bin", "").strip()
402        del query["daterange"]
403
404        # simple!
405        return query

Get Tweets via DMI-TCAT

This allows subsetting an existing query bin, similar to the 'Data Selection' panel in the DMI-TCAT analysis interface

type = 'dmi-tcatv2-search'
extension = 'csv'
title = 'TCAT Search (SQL)'
config = {'dmi-tcatv2-search.database_instances': {'type': 'json', 'help': 'DMI-TCAT instances', 'tooltip': "List of DMI-TCAT instance metadata, e.g. [{'tcat_name': 'tcat2','db_name': 'twittercapture','db_user': 'username','db_password': 'password','db_host': '127.0.0.1','db_port': 3306}] All of these values need to be provided for each instance. This needs to be formatted as a JSON list of objects.", 'default': {}}}
@classmethod
def get_options(cls, parent_dataset=None, config=None):
 41    @classmethod
 42    def get_options(cls, parent_dataset=None, config=None):
 43        """
 44        Get data source options
 45
 46        This method takes the pre-defined options, but fills the 'bins' options
 47        with bins currently available from the configured TCAT instances.
 48
 49        :param config:
 50        :param DataSet parent_dataset:  An object representing the dataset that
 51        the processor would be run on can
 52        be used to show some options only to privileges users.
 53        """
 54        options = {
 55            "intro-1": {
 56                "type": UserInput.OPTION_INFO,
 57                "help": "This data source interfaces with a DMI-TCAT instance to allow subsetting of tweets from a tweet "
 58                        "bin in that instance."
 59            },
 60            "divider-1": {
 61                "type": UserInput.OPTION_DIVIDER
 62            },
 63            "bin": {
 64                "type": UserInput.OPTION_INFO,
 65                "help": "Query bin"
 66            },
 67            "query_type": {
 68                "type": UserInput.OPTION_CHOICE,
 69                "help": "Basic or Advanced Query",
 70                "options": {
 71                    "basic": "Basic query all bin tweets for specific text (Query) and date (Date range)",
 72                    "advanced": "Select queries on any TCAT twitter tables"
 73                },
 74                "default": "basic",
 75                "tooltip": "Advanced queries do not provide scaffolding, so understanding TCAT database structure is necessary"
 76            },
 77            "query": {
 78                "type": UserInput.OPTION_TEXT,
 79                "help": "Query text",
 80                "tooltip": "Match all tweets containing this text."
 81            },
 82            "daterange": {
 83                "type": UserInput.OPTION_DATERANGE,
 84                "help": "Date range"
 85            }
 86        }
 87
 88        # Collect Metadata from TCAT instances
 89        all_bins = cls.collect_tcat_metadata(config)
 90
 91        options["bin"] = {
 92            "type": UserInput.OPTION_CHOICE,
 93            "options": {},
 94            "help": "Query bin"
 95        }
 96
 97        for instance_name, bins in all_bins.items():
 98            for bin_name, bin in bins.items():
 99                bin_key = "%s@%s" % (bin_name, instance_name)
100                display_text = f"{bin_name}: {bin.get('tweet_count')} tweets from {bin.get('first_tweet_datetime').strftime('%Y-%m-%d %H:%M:%S')} to {bin.get('last_tweet_datetime').strftime('%Y-%m-%d %H:%M:%S')}"
101                options["bin"]["options"][bin_key] = display_text
102
103        return options

Get data source options

This method takes the pre-defined options, but fills the 'bins' options with bins currently available from the configured TCAT instances.

Parameters
  • config:
  • DataSet parent_dataset: An object representing the dataset that the processor would be run on; can be used to show some options only to privileged users.
def get_items(self, query):
    def get_items(self, query):
        """
        Use the DMI-TCAT tweet export to retrieve tweets

        Connects directly to the MySQL database of the TCAT instance holding
        the selected bin and streams results through an unbuffered cursor.

        :param query:  Query parameters, as produced by validate_query()
        :yield dict: mapped_tweet for any "basic" query else for "advanced" queries a dictionary with mysql result
        """
        # the chosen bin is encoded as "<bin name>@<instance name>"
        bin = self.parameters.get("bin")
        bin_name = bin.split("@")[0]
        tcat_name = bin.split("@").pop()

        # look up connection details for the instance this bin lives on;
        # assumes exactly one configured instance matches tcat_name — an
        # unknown name raises IndexError here
        available_instances = self.config.get("dmi-tcatv2-search.database_instances", [])
        instance = [instance for instance in available_instances if instance.get('tcat_name') == tcat_name][0]

        db = MySQLDatabase(logger=self.log,
                           dbname=instance.get('db_name'),
                           user=instance.get('db_user'),
                           password=instance.get('db_password'),
                           host=instance.get('db_host'),
                           port=instance.get('db_port'))

        self.dataset.update_status("Searching for tweets on %s" % bin_name)
        if self.parameters.get("query_type") == 'advanced':
            # Advanced query should be simple from our perspective: run the
            # user-supplied SQL verbatim
            self.dataset.log('Query: %s' % self.parameters.get("query"))
            # SSCursor streams rows instead of buffering the whole result set
            unbuffered_cursor = db.connection.cursor(pymysql.cursors.SSCursor)
            try:
                unbuffered_cursor.execute(self.parameters.get("query"))
            except pymysql.err.ProgrammingError as e:
                # surface malformed SQL to the user and stop
                self.dataset.update_status("SQL query error: %s" % str(e), is_final=True)
                return
            # self.dataset.update_status("Retrieving %i results" % int(num_results)) # num_results is CLEARLY not what I thought
            # Get column names from cursor
            column_names = [description[0] for description in unbuffered_cursor.description]
            for result in unbuffered_cursor.fetchall_unbuffered():
                # Reformat result (which is a tuple with each column in the row) to dict
                new_result = {k: v for k, v in zip(column_names, result)}
                # 4CAT necessary fieldnames
                new_result['id'] = new_result.get('id', '')
                # NOTE(review): this branch falls back on quoted_status_id for
                # thread_id whereas tweet_mapping (basic branch) uses
                # retweet_id — confirm the difference is intended
                new_result['thread_id'] = new_result.get("in_reply_to_status_id") if new_result.get(
                    "in_reply_to_status_id") else new_result.get("quoted_status_id") if new_result.get(
                    "quoted_status_id") else new_result.get("id")
                new_result['body'] = new_result.get('text', '')
                new_result['timestamp'] = new_result.get('created_at', None)
                new_result['subject'] = ''
                new_result['author'] = new_result.get('from_user_name', '')
                yield new_result

        else:
            # "Basic" query: split the query text on ' AND ' / ' OR '
            # separators and turn each fragment into a LIKE clause
            text_query = self.parameters.get("query")

            where = []
            replacements = []
            # Find AND and OR
            placeholder = 0  # absolute offset of the not-yet-parsed remainder
            start_of_match = 0  # offset of next separator, relative to placeholder
            while start_of_match >= 0:
                match = None
                and_match = text_query[placeholder:].find(' AND ')
                or_match = text_query[placeholder:].find(' OR ')
                if and_match != -1 and or_match != -1:
                    # both found; take whichever occurs first
                    if and_match < or_match:
                        # and match first
                        match = 'AND '
                        start_of_match = and_match
                    else:
                        # or match first
                        match ='OR '
                        start_of_match = or_match
                elif and_match != -1:
                    # and match only
                    match ='AND '
                    start_of_match = and_match
                elif or_match != -1:
                    # or match only
                    match = 'OR '
                    start_of_match = or_match
                else:
                    # neither found; this is the final fragment and the loop
                    # ends after appending it below
                    match = None
                    start_of_match = -1
                # Add partial query to where and replacements
                if match:
                    where.append('lower(text) LIKE %s ' + match)
                    replacements.append('%'+text_query[placeholder:placeholder+start_of_match].lower().strip()+'%')
                    # advance past the separator; the leftover leading space is
                    # removed by .strip() on the next fragment
                    placeholder = placeholder + start_of_match + len(match)
                else:
                    where.append('lower(text) LIKE %s')
                    replacements.append('%'+text_query[placeholder:].lower().strip()+'%')

            # optional date bounds; these clauses carry their own leading
            # "AND" and rely on the LIKE clause(s) above always existing
            if query.get("min_date", None):
                try:
                    if int(query.get("min_date")) > 0:
                        where.append("AND created_at >= %s")
                        replacements.append(datetime.datetime.fromtimestamp(int(query.get("min_date"))))
                except ValueError:
                    pass

            if query.get("max_date", None):
                try:
                    if int(query.get("max_date")) > 0:
                        where.append("AND created_at < %s")
                        replacements.append(datetime.datetime.fromtimestamp(int(query.get("max_date"))))
                except ValueError:
                    pass

            where = " ".join(where)
            # NOTE(review): bin_name is interpolated into the SQL directly; it
            # comes from the user-picked "bin" parameter — confirm it can only
            # take values offered by get_options
            query = 'SELECT * FROM ' + bin_name + '_tweets WHERE ' + where
            self.dataset.log('Query: %s' % query)
            self.dataset.log('Replacements: %s' % ', '.join([str(i) for i in replacements]))
            unbuffered_cursor = db.connection.cursor(pymysql.cursors.SSCursor)
            unbuffered_cursor.execute(query, replacements)
            # self.dataset.update_status("Retrieving %i results" % int(num_results)) # num_results is CLEARLY not what I thought
            column_names = [description[0] for description in unbuffered_cursor.description]
            for result in unbuffered_cursor.fetchall_unbuffered():
                new_result = {k: v for k, v in zip(column_names, result)}
                # Map tweet to 4CAT fields
                new_result = self.tweet_mapping(new_result)
                yield new_result

Use the DMI-TCAT tweet export to retrieve tweets

Parameters
  • query: Query parameters.
Yields
  dict: mapped_tweet for any "basic" query; for "advanced" queries, a dictionary with the mysql result.
@staticmethod
def tweet_mapping(tweet):
228    @staticmethod
229    def tweet_mapping(tweet):
230        """
231        Takes TCAT output from specific tables and maps them to 4CAT expected fields. The expected fields attempt to
232        mirror that mapped_tweet from twitterv2 datasource.
233
234        :param dict tweet: TCAT dict returned from query; expected to be from bin tweets table
235        :return dict:
236        """
237        mapped_tweet = {'id': tweet.get('id', ''),
238                        # For thread_id, we use in_reply_to_status_id if tweet is reply, retweet_id if tweet is
239                        # retweet, or its own ID
240                        # Note: tweets can have BOTH in_reply_to_status_id and retweet_id as you can retweet a reply
241                        # or reply to retweet.
242                        # THIS IS DIFFERENT from Twitter APIv2 as there does not appear to be a quote ID (for retweets
243                        # with added text)
244                        'thread_id': tweet.get("in_reply_to_status_id") if tweet.get(
245                                               "in_reply_to_status_id") else tweet.get("retweet_id") if tweet.get(
246                                               "retweet_id") else tweet.get("id"),
247                        'body': tweet.get('text', ''),
248                        # 'created_at': tweet.get('created_at'),
249                        'timestamp': int(datetime.datetime.timestamp(tweet.get('created_at'))) if type(
250                                                tweet.get('created_at')) is datetime.datetime else None,
251                        'subject': '',
252                        'author': tweet.get('from_user_name', ''),
253                        "author_fullname": tweet["from_user_realname"],
254                        "author_id": tweet["from_user_id"],
255                        "source": tweet["source"],
256                        "language_guess": tweet.get("lang"),
257
258                        "retweet_count": tweet["retweet_count"],
259                        "like_count": tweet["favorite_count"],
260                        "is_retweet": "yes" if tweet.get('retweet_id', False) else "no",
261                        "is_reply": "yes" if tweet["in_reply_to_status_id"] else "no",
262                        "in_reply_to_status_id": tweet["in_reply_to_status_id"] if tweet["in_reply_to_status_id"] else None,
263                        "reply_to": tweet["to_user_name"],
264                        "reply_to_id": tweet.get('to_user_id') if tweet.get('to_user_id') else None,
265
266                        # 4CAT specifics
267                        "hashtags": ",".join(re.findall(r"#([^\s!@#$%^&*()_+{}:\"|<>?\[\];'\,./`~]+)", tweet["text"])),
268                        "urls": ",".join(ural.urls_from_text(tweet["text"])),
269                        "images": ",".join(re.findall(r"https://t\.co/[a-zA-Z0-9]+$", tweet["text"])),
270                        "mentions": ",".join(re.findall(r"@([^\s!@#$%^&*()+{}:\"|<>?\[\];'\,./`~]+)", tweet["text"])),
271
272                        # Additional TCAT data (compared to twitterv2 map_item function)
273                        "filter_level": tweet['filter_level'],
274                        'location': tweet['location'],
275                        'latitude': tweet['geo_lat'] if tweet['geo_lat'] else None,
276                        'longitude': tweet['geo_lng'] if tweet['geo_lng'] else None,
277                        'author_verified': tweet['from_user_verified'] if tweet['from_user_verified'] else None,
278                        'author_description': tweet['from_user_description'],
279                        'author_url': tweet['from_user_url'],
280                        'author_profile_image': tweet['from_user_profile_image_url'],
281                        'author_timezone_UTC_offset': int((int(tweet['from_user_utcoffset']) if
282                                                           tweet['from_user_utcoffset'] else 0)/60/60),
283                        'author_timezone_name': tweet['from_user_timezone'],
284                        'author_language': tweet['from_user_lang'],
285                        'author_tweet_count': tweet['from_user_tweetcount'],
286                        'author_follower_count': tweet['from_user_followercount'],
287                        'author_friend_following_count': tweet['from_user_friendcount'],
288                        'author_favorite_count': tweet.get('from_user_favourites_count'), # NOT in tweets table?
289                        'author_listed_count': tweet['from_user_listed'] if tweet['from_user_listed'] else None,
290                        'author_withheld_scope': tweet.get('from_user_withheld_scope'), # NOT in tweets table?
291                        'author_created_at': tweet.get('from_user_created_at'), # NOT in tweets table?
292
293                        # TODO find in other TCAT tables or does not exist
294                        # "possibly_sensitive": "yes" if tweet.get("possibly_sensitive") not in ("", "0") else "no",
295                        # "is_quote_tweet": "yes" if tweet["quoted_status_id"] else "no",
296                        # 'withheld_copyright': tweet['withheld_copyright'],  # TCAT may no collect this anymore
297                        # 'withheld_scope': tweet['withheld_scope'], # TCAT may no collect this anymore
298                        # 'truncated': tweet['truncated'], # Older tweets could be truncated meaning their text was cut off due to Twitter/TCAT db character limits
299
300                        }
301
302        # Ensure that any keys not specifically mapped to another field are added to the new mapped_tweet
303        mapped_keys = ['id', 'text', 'created_at', 'from_user_name', 'from_user_realname', 'from_user_id',
304                       'from_user_lang', 'from_user_tweetcount', 'from_user_followercount', 'from_user_friendcount',
305                       'from_user_listed', 'from_user_utcoffset', 'from_user_timezone', 'from_user_description',
306                       'from_user_url', 'from_user_verified', 'from_user_profile_image_url', 'source', 'lang',
307                       'filter_level', 'location', 'to_user_name', 'to_user_id', 'geo_lat', 'geo_lng', 'retweet_count',
308                       'in_reply_to_status_id']
309        for key in tweet.keys():
310            if key not in mapped_keys:
311                index = ''
312                while key + index in mapped_tweet.keys():
313                    index += '_1'
314                mapped_tweet[key + index] = tweet.get(key)
315
316        return mapped_tweet

Takes TCAT output from specific tables and maps them to 4CAT expected fields. The expected fields attempt to mirror that mapped_tweet from twitterv2 datasource.

Parameters
  • dict tweet: TCAT dict returned from query; expected to be from bin tweets table
Returns
@classmethod
def collect_tcat_metadata(cls, config):
319    @classmethod
320    def collect_tcat_metadata(cls, config):
321        """
322        Collect specific metadata from TCAT instances listed in the configuration and return a dictionary containing
323        this data. To be used to infor the user of available TCAT bins and create the options from which a user will
324        select.
325
326        :return dict: All of the available bins from accessible TCAT instances
327        """
328
329        # todo: cache this somehow! and check for the cache
330        instances = config.get("dmi-tcatv2.instances", [])
331
332        all_bins = {}
333        for instance in instances:
334            # Query each instance for bins
335            db = MySQLDatabase(logger=None,
336                               dbname=instance.get('db_name'),
337                               user=instance.get('db_user'),
338                               password=instance.get('db_password'),
339                               host=instance.get('db_host'),
340                               port=instance.get('db_port'))
341            # try:
342            instance_bins = db.fetchall('SELECT id, querybin, type from tcat_query_bins')
343            # except:
344
345            instance_bins_metadata = {}
346            for instance_bin in instance_bins:
347                bin_data = {
348                    'instance': instance,
349                    'querybin': instance_bin['querybin'],
350                }
351
352                # Query for number of tweets
353                tweet_count = db.fetchone('SELECT COUNT(id) from ' + instance_bin['querybin'] + '_tweets')[
354                    'COUNT(id)']
355                bin_data['tweet_count'] = tweet_count
356
357                # Collect first and last tweet datetimes
358                first_tweet_datetime = db.fetchone('SELECT created_at from ' + instance_bin['querybin'] + '_tweets ORDER BY created_at ASC LIMIT 1')['created_at']
359                last_tweet_datetime = db.fetchone('SELECT created_at from ' + instance_bin['querybin'] + '_tweets ORDER BY created_at DESC LIMIT 1')['created_at']
360                bin_data['first_tweet_datetime'] = first_tweet_datetime
361                bin_data['last_tweet_datetime'] = last_tweet_datetime
362
363                # Could check if bin currently should be collecting
364                # db.fetchall('SELECT EXISTS ( SELECT endtime from tcat_query_bins_periods WHERE querybin_id = ' + str(instance_bin['id']) + ' and endtime = "0000-00-00 00:00:00" ) as active')
365
366                # Could collect phrases or usernames...
367                # if instance_bin['type'] in []:
368                # elif instance_bin['type'] in []:
369
370                # Could collect all periods for nuanced metadata...
371                #periods = db.fetchall('SELECT starttime, endtime from tcat_query_bins_periods WHERE query_bin_id = ' + instance_bin['id'])
372
373                # Add bin_data to instance collection
374                instance_bins_metadata[instance_bin['querybin']] = bin_data
375            # Add bins to instance
376            all_bins[instance['tcat_name']] = instance_bins_metadata
377
378        return all_bins

Collect specific metadata from TCAT instances listed in the configuration and return a dictionary containing this data. To be used to inform the user of available TCAT bins and create the options from which a user will select.

Returns

All of the available bins from accessible TCAT instances

@staticmethod
def validate_query(query, request, config):
380    @staticmethod
381    def validate_query(query, request, config):
382        """
383        Validate DMI-TCAT query input
384
385        :param dict query:  Query parameters, from client-side.
386        :param request:  Flask request
387        :param ConfigManager|None config:  Configuration reader (context-aware)
388        :return dict:  Safe query parameters
389        """
390        # no query 4 u
391        if not query.get("bin", ""):
392            raise QueryParametersException("You must choose a query bin to get tweets from.")
393
394        # the dates need to make sense as a range to search within
395        # and a date range is needed, to not make it too easy to just get all tweets
396        after, before = query.get("daterange")
397        if (after and before) and not before <= after:
398            raise QueryParametersException("A date range must start before it ends")
399
400        query["min_date"], query["max_date"] = query.get("daterange")
401        query["bin"] = query.get("bin", "").strip()
402        del query["daterange"]
403
404        # simple!
405        return query

Validate DMI-TCAT query input

Parameters
  • dict query: Query parameters, from client-side.
  • request: Flask request
  • ConfigManager|None config: Configuration reader (context-aware)
Returns

Safe query parameters