
datasources.dmi-tcat.search_tcat

Twitter search within a DMI-TCAT bin; connect via TCAT frontend

  1"""
  2Twitter search within a DMI-TCAT bin; connect via TCAT frontend
  3"""
  4import requests
  5import datetime
  6import csv
  7import json
  8import re
  9import io
 10
 11from backend.lib.search import Search
 12from common.lib.exceptions import QueryParametersException
 13from common.lib.user_input import UserInput
 14from common.lib.helpers import sniff_encoding
 15from common.lib.item_mapping import MappedItem
 16from common.config_manager import config
 17
 18from datasources.twitterv2.search_twitter import SearchWithTwitterAPIv2
 19
 20
 21class SearchWithinTCATBins(Search):
 22    """
 23    Get Tweets via DMI-TCAT
 24
 25    This allows subsetting an existing query bin, similar to the 'Data
 26    Selection' panel in the DMI-TCAT analysis interface
 27    """
 28    type = "dmi-tcat-search"  # job ID
 29    extension = "ndjson"
 30    title = "TCAT Search (HTTP)"
 31
 32    # TCAT has a few fields that do not exist in APIv2
 33    additional_TCAT_fields = ["to_user_name", "filter_level", "favorite_count", "truncated", "from_user_favourites_count", "from_user_lang", "from_user_utcoffset",
 34                              "from_user_timezone"]
 35
 36    options = {
 37        "intro-1": {
 38            "type": UserInput.OPTION_INFO,
 39            "help": "This data source interfaces with a DMI-TCAT instance to allow subsetting of tweets from a tweet "
 40                    "bin in that instance."
 41        },
 42        "divider-1": {
 43            "type": UserInput.OPTION_DIVIDER
 44        },
 45        "bin": {
 46            "type": UserInput.OPTION_INFO,
 47            "help": "Query bin"
 48        },
 49        "query": {
 50            "type": UserInput.OPTION_TEXT,
 51            "help": "Query text",
 52            "tooltip": "Match all tweets containing this text."
 53        },
 54        "query-exclude": {
 55            "type": UserInput.OPTION_TEXT,
 56            "help": "Exclude text",
 57            "tooltip": "Match all tweets that do NOT contain this text."
 58        },
 59        "user-name": {
 60            "type": UserInput.OPTION_TEXT,
 61            "help": "From user",
 62            "tooltip": "Match all tweets from this username."
 63        },
 64        "user-exclude": {
 65            "type": UserInput.OPTION_TEXT,
 66            "help": "Exclude user",
 67            "tooltip": "Match all tweets NOT from this username."
 68        },
 69        "exclude-replies": {
 70            "type": UserInput.OPTION_CHOICE,
 71            "options": {
 72                "exclude": "Exclude replies",
 73                "include": "Include replies"
 74            },
 75            "help": "Reply tweets",
 76            "default": "include",
 77            "tooltip": "Choose to exclude or include tweets that are replies from the data"
 78        },
 79        "daterange": {
 80            "type": UserInput.OPTION_DATERANGE,
 81            "help": "Date range"
 82        },
 83        # Advanced Options Section
 84        "divider-2": {
 85            "type": UserInput.OPTION_DIVIDER
 86        },
 87        "advanced_options_info": {
 88            "type": UserInput.OPTION_INFO,
 89            "help": "Advanced Query Options can further refine your query"
 90        },
 91        "user-bio": {
 92            "type": UserInput.OPTION_TEXT,
 93            "help": "User bio text",
 94            "tooltip": "Match all tweets from users with biographies containing this text."
 95        },
 96        "user-language": {
 97            "type": UserInput.OPTION_TEXT,
 98            "help": "User language",
 99            "tooltip": "Match all tweets from users using this language (as detected by Twitter)."
100        },
101        "tweet-language": {
102            "type": UserInput.OPTION_TEXT,
103            "help": "Tweet language",
104            "tooltip": "Match all tweets from users with this language (as detected by Twitter)."
105        },
106        "tweet-client": {
107            "type": UserInput.OPTION_TEXT,
108            "help": "Twitter client URL/descr",
109            "tooltip": "Match all tweets from clients that match this text."
110        },
111        "url": {
112            "type": UserInput.OPTION_TEXT,
113            "help": "(Part of) URL",
114            "tooltip": "Match all tweets containing this (partial) URL."
115        },
116        "url-media": {
117            "type": UserInput.OPTION_TEXT,
118            "help": "(Part of) media URL",
119            "tooltip": "Match all tweets containing this (partial) media URL."
120        },
121    }
122
123    config = {
124        "dmi-tcat-search.instances": {
125            "type": UserInput.OPTION_TEXT_JSON,
126            "help": "DMI-TCAT instances",
127            "tooltip": 'List of DMI-TCAT instance URLs, e.g. ["http://username:password@tcat.instance.webpage.net"]. '
128                       'This needs to be formatted as a JSON list of strings.',
129            "default": {}
130        }
131    }
132
133    bin_data = {
134        "all_bins": {},
135        "last_collected": {},
136    }
137
138    @classmethod
139    def collect_all_bins(cls, force_update=False):
140        """
141        Requests bin information from TCAT instances
142        """
143        instances = config.get("dmi-tcat-search.instances", [])
144        for instance in instances:
145            # query each configured TCAT instance for a list of bins that can
146            # be subsetted
147            instance = instance.rstrip("/")
148            api_url = instance + "/api/bin-stats.php"
149
150            if force_update or instance not in cls.bin_data["last_collected"] or datetime.datetime.now()-datetime.timedelta(days=1) >= cls.bin_data["last_collected"][instance]:
151                # Collect Instance data
152                try:
153                    api_request = requests.get(api_url, timeout=5)
154                    instance_bins = json.loads(api_request.content)
155                    cls.bin_data["all_bins"][instance] = {k: instance_bins[k] for k in sorted(instance_bins)}
156                    cls.bin_data["last_collected"][instance] = datetime.datetime.now()
157                except (requests.RequestException, json.JSONDecodeError):
158                    cls.bin_data["all_bins"][instance] = {"failed": True}
159                    # TODO: No logger here as nothing has been initialized
160                    # print(f"WARNING, unable to collect TCAT bins from instance {instance}")
161                    pass
162
163    @classmethod
164    def get_options(cls, parent_dataset=None, user=None):
165        """
166        Get data source options
167
168        This method takes the pre-defined options, but fills the 'bin' option
169        with bins currently available from the configured TCAT instances.
170
171        :param DataSet parent_dataset:  An object representing the dataset that
172        the processor would be run on
173        :param User user:  Flask user the options will be displayed for, in
174        case they are requested for display in the 4CAT web interface. This can
175        be used to show some options only to privileged users.
176        """
177        options = cls.options
178
179        cls.collect_all_bins()
180        if all([data.get("failed", False) for instance, data in cls.bin_data["all_bins"].items()]):
181            options["bin"] = {
182                "type": UserInput.OPTION_INFO,
183                "help": "Could not connect to DMI-TCAT instance(s)."
184            }
185            return options
186
187        options["bin"] = {
188            "type": UserInput.OPTION_CHOICE,
189            "options": {},
190            "help": "Query bin"
191        }
192
193        for instance, bins in cls.bin_data["all_bins"].items():
194            # make the host somewhat human-readable
195            # also strip out embedded HTTP auths
196            host = re.sub(r"^https?://", "", instance).split("@").pop()
197            for bin_name, bin in bins.items():
198                bin_key = "%s@%s" % (bin_name, host)
199                display_text = f"{bin_name}: {bin.get('tweets_approximate')} tweets from {bin.get('range').get('first_tweet')} to {bin.get('range').get('last_tweet')}"
200                options["bin"]["options"][bin_key] = display_text
201
202        return options
203
204    def get_items(self, query):
205        """
206        Use the DMI-TCAT tweet export to retrieve tweets
207
208        :param query:
209        :return:
210        """
211        bin = self.parameters.get("bin")
212        bin_name = bin.split("@")[0]
213        bin_host = bin.split("@").pop()
214
215        # we cannot store the full instance URL as a parameter, because it may
216        # contain sensitive information (e.g. HTTP auth) - so we find the full
217        # instance URL again here
218        # while the parameter could be marked 'sensitive', the values would
219        # still show up in e.g. the HTML of the 'create dataset' form
220        available_instances = config.get("dmi-tcat-search.instances", [])
221        instance_url = ""
222        instance = None
223        for available_instance in available_instances:
224            hostname = re.sub(r"https?://", "", available_instance).split("@").pop().rstrip("/")
225            if hostname == bin_host:
226                instance_url = available_instance
227                instance = available_instance.rstrip("/")
228                break
229
230        if not instance_url:
231            return self.dataset.finish_with_error("Invalid DMI-TCAT instance name '%s'" % bin_host)
232
233        # Collect the bins again (ensure we have updated info in case bin is still active)
234        self.collect_all_bins(force_update=True)
235        # Add metadata to parameters
236        try:
237            current_bin = self.bin_data["all_bins"][instance][bin_name]
238        except KeyError:
239            return self.dataset.finish_with_error(f"Lost connection to TCAT instance {bin_host}")
240        # Add TCAT metadata to dataset
241        self.dataset.tcat_bin_data = current_bin
242        if current_bin.get("type") in ["follow", "track", "timeline", "geotrack"] and ("phrase_times" not in current_bin or "user_times" not in current_bin):
243            self.dataset.update_status("Warning: TCAT not updated to send phrase and user time ranges; consider updating if you would like to retain this bin metadata.")
244
245        # now get the parameters...
246        request_url = instance_url.rstrip("/") + "/analysis/mod.export_tweets.php"
247
248        # Allow for blank dates
249        if self.parameters.get("min_date"):
250            start_date = datetime.datetime.fromtimestamp(self.parameters.get("min_date")).strftime("%Y-%m-%d")
251        else:
252            first_tweet_timestamp = current_bin.get('range').get('first_tweet')
253            start_date = datetime.datetime.strptime(first_tweet_timestamp, "%Y-%m-%d %H:%M:%S").strftime("%Y-%m-%d")
254
255        end_date = datetime.datetime.fromtimestamp(self.parameters.get("max_date")).strftime("%Y-%m-%d") if self.parameters.get("max_date") else (datetime.datetime.now() + datetime.timedelta(days=1)).strftime("%Y-%m-%d")
256        parameters = {
257            "dataset": bin_name,
258            "query": self.parameters.get("query"),
259            "url_query": self.parameters.get("url"),
260            "media_url_query": self.parameters.get("url-media"),
261            "exclude": self.parameters.get("query-exclude"),
262            "from_user_name": self.parameters.get("user-name"),
263            "from_user_lang": self.parameters.get("user-language"),
264            "lang": self.parameters.get("tweet-language"),
265            "exclude_from_user_name": self.parameters.get("user-exclude"),
266            "from_source": re.sub(r"<[^>]+>", "", self.parameters.get("tweet-client")),
267            "startdate": start_date,
268            "enddate": end_date,
269            "replyto": "yes" if self.parameters.get("exclude-replies") == "exclude" else "no",
270            "whattodo": "",
271            "exportSettings": "urls,mentions,hashtags,media,",
272            "graph_resolution": "day",
273            "outputformat": "csv"
274        }
275
276        # for now we simply request the full CSV export of the bin with the
277        # given parameters, letting TCAT handle the full text search and so
278        # on
279        self.dataset.update_status("Searching for tweets on %s" % bin_host)
280        response = requests.get(request_url, params=parameters, stream=True)
281        if response.status_code != 200:
282            return self.dataset.finish_with_error("Query bin not available: received HTTP Error %i" % response.status_code)
283
284        # process the file in 1kB chunks, buffer as we go
285        # If a newline is encountered, the buffer is processed as a row of csv
286        # data. This works as long as there are no newlines in the csv itself,
287        # which is the case for TCAT exports. Processing as a stream is needed
288        # to avoid having to load the full file in memory
289        buffer = bytearray()
290        fieldnames = None
291        items = 0
292        encoding = None
293        for chunk in response.iter_content(chunk_size=1024):
294            # see if this chunk contains a newline, in which case we have a
295            # full line to process (e.g. as a tweet)
296            lines = []
297            buffer += bytearray(chunk)
298
299            if not encoding and len(buffer) > 3:
300                # response.encoding is not correct sometimes, since it does not
301                # indicate that the file uses a BOM, so sniff it instead once
302                # we have some bytes
303                encoding = sniff_encoding(buffer)
304
305            # split buffer by newlines and process each full line
306            # the last line is always carried over, since it may be incomplete
307            if b"\n" in buffer:
308                buffered_lines = buffer.split(b"\n")
309                lines = buffered_lines[:-1]
310                buffer = buffered_lines.pop()
311            elif not chunk:
312                # eof, process left-over data
313                lines = buffer.split(b"\n")
314
315            # and finally we can process the data
316            for line in lines:
317                # use a dummy csv reader to abstract away the annoying csv parsing
318                # this is quite a bit of overhead, but beats implementing csv parsing
319                # manually, and it's still reasonably fast (about 10k/second)
320                dummy_file = io.TextIOWrapper(io.BytesIO(line.replace(b"\0", b"")), encoding=encoding)
321                reader = csv.reader(dummy_file,
322                                    delimiter=",",
323                                    quotechar='"',
324                                    doublequote=True,
325                                    quoting=csv.QUOTE_MINIMAL)
326                row_data = next(reader)
327
328                if row_data and not fieldnames:
329                    # first line in file
330                    fieldnames = row_data.copy()
331
332                elif row_data:
333                    tweet = dict(zip(fieldnames, row_data))
334                    items += 1
335
336                    if items % 250 == 0:
337                        self.dataset.update_status("Loaded %i tweets from bin %s@%s" % (items, bin_name, bin_host))
338
339                    yield self.tcat_to_APIv2(tweet)
340
341            if not chunk:
342                # end of file
343                break
344
345    @staticmethod
346    def tcat_to_4cat_time(tcat_time):
347        """
348        Twitter APIv2 time is in the format "%Y-%m-%dT%H:%M:%S.000Z", while TCAT uses "%Y-%m-%d %H:%M:%S" or a Unix timestamp.
349
350        :return str: time string in the Twitter APIv2 format
351        """
352        try:
353            tcat_time = int(tcat_time)
354            return datetime.datetime.fromtimestamp(tcat_time).strftime("%Y-%m-%dT%H:%M:%S.000Z")
355        except ValueError:
356            return datetime.datetime.strptime(tcat_time, "%Y-%m-%d %H:%M:%S").strftime("%Y-%m-%dT%H:%M:%S.000Z")
357
358    @staticmethod
359    def tcat_to_APIv2(tcat_tweet):
360        """
361        Attempt to construct a 4CAT tweet gathered from APIv2 to allow for use of Twitter specific processors!
362
363        A great deal of information is missing, which may result in some issues. Notes are kept for the expected
364        type and, if the data is missing in TCAT, None is used. Therefore it should be possible to refactor processors
365        to handle None if necessary.
366        """
367        # We're missing lots of data here...
368
369        urls = [url.strip() for url in (tcat_tweet["urls_expanded"].split(";") if tcat_tweet["urls_expanded"] else tcat_tweet["urls_followed"].split(";") if tcat_tweet["urls_followed"] else tcat_tweet["urls_followed"].split(";")) if url]
370        # TCAT media_id: 7 = video, 3 = photo, 16 = animated_gif
371        media_type = "video" if tcat_tweet["media_id"] == "7" else "photo" if tcat_tweet["media_id"] == "3" else "animated_gif" if tcat_tweet["media_id"] == "16" else tcat_tweet["media_id"]
372
373        # 4CAT Twitter APIv2 result data structure
374        APIv2_tweet = {
375            "lang": tcat_tweet["lang"],  # str
376            "source": tcat_tweet["source"],  # REMOVED FROM TWITTER API v2
377            "possibly_sensitive": True if tcat_tweet["possibly_sensitive"] == 1 else False if tcat_tweet["possibly_sensitive"] == 0 else None,  # bool
378            "text": tcat_tweet["text"],  # str
379            "edit_history_tweet_ids": None,  # list; Missing in TCAT data
380            "public_metrics": {
381                "retweet_count": tcat_tweet["retweet_count"],  # int
382                "reply_count": None,  # int; Missing in TCAT data
383                "like_count": tcat_tweet["favorite_count"],  # int
384                "quote_count": None,  # int; Missing in TCAT data
385                "impression_count": None,  # int; Missing in TCAT data
386                # TCAT has also favorite_count
387            },
388            "entities": {
389                "mentions": [{
390                    "id": None,  # str; Missing in TCAT data
391                    "username": mention.strip(),  # str
392                    # Twitter v2 API has additional user fields
393                } for mention in tcat_tweet["mentions"].split(";") if mention],
394                "annotations": None,  # list; Missing in TCAT data
395                "urls": [{
396                    "url": url,  # str
397                    "expanded_url": url,  # str
398                    # Twitter v2 API has additional URL fields
399                } for url in urls],
400                "hashtags": [{
401                    "tag": hashtag.strip(),  # str
402                    "start": None,  # int; Missing in TCAT data
403                    "end": None,  # int; Missing in TCAT data
404                } for hashtag in tcat_tweet["hashtags"].split(";") if hashtag],
405                "cashtags": None,  # list; Missing in TCAT data
406            },
407            "created_at": SearchWithinTCATBins.tcat_to_4cat_time(tcat_tweet["time"]),  # str
408            "id": tcat_tweet["id"],  # str
409            "author_id": tcat_tweet["from_user_id"],  # str
410            "context_annotations": None,  # list; Missing in TCAT data
411            "reply_settings": None,  # str; Missing in TCAT data
412            "conversation_id": None,  # str; TCAT has a in_reply_to_status_id but this is not necessarily the original Tweet that started the conversation
413            "author_user": {
414                "protected": None,  # bool; Missing in TCAT data
415                "verified": True if tcat_tweet["from_user_verified"] == 1 else False if tcat_tweet["from_user_verified"] == 0 else None,  # bool
416                "created_at": SearchWithinTCATBins.tcat_to_4cat_time(tcat_tweet["from_user_created_at"]),  # str
417                "name": tcat_tweet["from_user_realname"],  # str
418                "entities": {
419                    "description": None,  # dict; contains entities from author description such as mentions, URLs, etc.; Missing in TCAT data
420                    "url": None,  # dict; contains entities from author url e.g. URL data; Missing in TCAT data
421                },
422                "description": tcat_tweet["from_user_description"],  # str
423                "pinned_tweet_id": None,  # str; Missing in TCAT data
424                "profile_image_url": tcat_tweet["from_user_profile_image_url"],  # str
425                "url": tcat_tweet["from_user_url"],  # str
426                "username": tcat_tweet["from_user_name"],  # str
427                "id": tcat_tweet["from_user_id"],  # str
428                "location": None,  # str; Missing in TCAT data
429                "public_metrics": {
430                    "followers_count": tcat_tweet["from_user_followercount"],  # int
431                    "following_count": tcat_tweet["from_user_friendcount"],  # int
432                    "tweet_count": tcat_tweet["from_user_tweetcount"],  # int
433                    "listed_count": tcat_tweet["from_user_listed"],  # int
434                    # TCAT has also from_user_favourites_count
435                },
436                "withheld": {
437                    "country_codes": tcat_tweet["from_user_withheld_scope"].split(";"),  # list; TODO TCAT has column, but have not seen it populated in testing... This is guess
438                },
439                # TCAT has also from_user_lang, from_user_utcoffset, from_user_timezone
440            },
441            "attachments": {
442                # TCAT has some media data, but not the URLs listed
443                "media_keys": [{
444                    "type": media_type,
445                    "url": ",".join([url for url in urls if (url.split("/")[-2] if len(url.split("/")) > 1 else "") in ["photo"]]),  # str; TCAT does not have the URL though it may be in the list of URLs
446                    "variants": [{"url": ",".join([url for url in urls if (url.split("/")[-2] if len(url.split("/")) > 1 else "") in ["video"]]), "bit_rate":0}]  # list; This is not the expected direct link to video, but it is a URL to the video
447                    # Twitter API v2 has additional data
448                }],  # list; TCAT seems to only have one type of media per tweet
449                "poll_ids": None,  # list; Missing from TCAT data
450            },
451            "geo": {
452                "place_id": None,  # str; Missing from TCAT data
453                "place": {
454                    "country": None,  # str; Missing from TCAT data
455                    "id": None,  # str; Missing from TCAT data
456                    "geo": {
457
458                    },
459                    "country_code": None,  # str; Missing from TCAT data
460                    "name": tcat_tweet["location"],  # str
461                    "place_type": None,  # str; Missing from TCAT data
462                    "full_name": tcat_tweet["location"],  # str
463                },
464                "coordindates": {
465                    "type": None,  # str; Missing from TCAT data
466                    "coordinates": [tcat_tweet["lng"], tcat_tweet["lat"]],  # list i.e. [longitude, latitude]
467                },
468            },
469            "withheld": {
470                "copyright": True if tcat_tweet["withheld_copyright"] == 1 else False if tcat_tweet["withheld_copyright"] == 0 else None,  # bool; TODO TCAT has column, but have not seen it populated in testing... This is guess
471                "country_codes": tcat_tweet["withheld_scope"].split(";"),  # list; TODO TCAT has column, but have not seen it populated in testing... This is guess
472            },
473        }
474
475        # Referenced Tweets; Twitter API v2 has entire tweet data here which we will be missing
476        referenced_tweets = []
477        if tcat_tweet["text"][:4] == "RT @":
478            # Retweet
479            referenced_tweets.append({
480                "type": "retweeted",
481                "id": None,  # str; Missing in TCAT data
482            })
483        if tcat_tweet["quoted_status_id"]:
484            # Quote
485            referenced_tweets.append({
486                "type": "quoted",
487                "id": tcat_tweet["quoted_status_id"],  # str; only the ID is available in TCAT data
488            })
489        if tcat_tweet["in_reply_to_status_id"]:
490            # Reply
491            referenced_tweets.append({
492                "type": "replied_to",
493                "id": tcat_tweet["in_reply_to_status_id"],  # str; only the ID is available in TCAT data
494            })
495            # These should NOT be None in case a processor/user attempts to identify a reply using these
496            APIv2_tweet["in_reply_to_user_id"] = "UNKNOWN"  # str; Missing from TCAT data
497            APIv2_tweet["in_reply_to_user"] = {"username": "UNKNOWN"}  # dict; Missing from TCAT data
498
499        APIv2_tweet["referenced_tweets"] = referenced_tweets  # list
500
501        # Append any extra TCAT data
502        additional_TCAT_data = {}
503        for field in SearchWithinTCATBins.additional_TCAT_fields:
504            additional_TCAT_data["TCAT_"+field] = tcat_tweet[field]
505        APIv2_tweet.update(additional_TCAT_data)
506
507        return APIv2_tweet
508
509    @staticmethod
510    def validate_query(query, request, user):
511        """
512        Validate DMI-TCAT query input
513
514        :param dict query:  Query parameters, from client-side.
515        :param request:  Flask request
516        :param User user:  User object of user who has submitted the query
517        :return dict:  Safe query parameters
518        """
519        # no query 4 u
520        if not query.get("bin", "").strip():
521            raise QueryParametersException("You must choose a query bin to get tweets from.")
522
523        # Dates need to make sense as a range to search within
524        after, before = query.get("daterange")
525        if (after and before) and before <= after:
526            raise QueryParametersException("A date range must start before it ends")
527
528        query["min_date"], query["max_date"] = query.get("daterange")
529        del query["daterange"]
530
531        # simple!
532        return query
533
534    @staticmethod
535    def map_item(item):
536        """
537        Use Twitter APIv2 map_item
538        """
539        mapped_tweet = SearchWithTwitterAPIv2.map_item(item)
540
541        # Add TCAT extra data
542        data = mapped_tweet.get_item_data()
543        message = mapped_tweet.get_message()
544        for field in SearchWithinTCATBins.additional_TCAT_fields:
545            data["TCAT_" + field] = item.get("TCAT_" + field)
546
547        return MappedItem(data, message)
class SearchWithinTCATBins(backend.lib.search.Search):

Get Tweets via DMI-TCAT

This allows subsetting an existing query bin, similar to the 'Data Selection' panel in the DMI-TCAT analysis interface

type = 'dmi-tcat-search'
extension = 'ndjson'
title = 'TCAT Search (HTTP)'
additional_TCAT_fields = ['to_user_name', 'filter_level', 'favorite_count', 'truncated', 'from_user_favourites_count', 'from_user_lang', 'from_user_utcoffset', 'from_user_timezone']
options = {'intro-1': {'type': 'info', 'help': 'This data source interfaces with a DMI-TCAT instance to allow subsetting of tweets from a tweet bin in that instance.'}, 'divider-1': {'type': 'divider'}, 'bin': {'type': 'info', 'help': 'Query bin'}, 'query': {'type': 'string', 'help': 'Query text', 'tooltip': 'Match all tweets containing this text.'}, 'query-exclude': {'type': 'string', 'help': 'Exclude text', 'tooltip': 'Match all tweets that do NOT contain this text.'}, 'user-name': {'type': 'string', 'help': 'From user', 'tooltip': 'Match all tweets from this username.'}, 'user-exclude': {'type': 'string', 'help': 'Exclude user', 'tooltip': 'Match all tweets NOT from this username.'}, 'exclude-replies': {'type': 'choice', 'options': {'exclude': 'Exclude replies', 'include': 'Include replies'}, 'help': 'Reply tweets', 'default': 'include', 'tooltip': 'Choose to exclude or include tweets that are replies from the data'}, 'daterange': {'type': 'daterange', 'help': 'Date range'}, 'divider-2': {'type': 'divider'}, 'advanced_options_info': {'type': 'info', 'help': 'Advanced Query Options can further refine your query'}, 'user-bio': {'type': 'string', 'help': 'User bio text', 'tooltip': 'Match all tweets from users with biographies containing this text.'}, 'user-language': {'type': 'string', 'help': 'User language', 'tooltip': 'Match all tweets from users using this language (as detected by Twitter).'}, 'tweet-language': {'type': 'string', 'help': 'Tweet language', 'tooltip': 'Match all tweets from users with this language (as detected by Twitter).'}, 'tweet-client': {'type': 'string', 'help': 'Twitter client URL/descr', 'tooltip': 'Match all tweets from clients that match this text.'}, 'url': {'type': 'string', 'help': '(Part of) URL', 'tooltip': 'Match all tweets containing this (partial) URL.'}, 'url-media': {'type': 'string', 'help': '(Part of) media URL', 'tooltip': 'Match all tweets containing this (partial) media URL.'}}
config = {'dmi-tcat-search.instances': {'type': 'json', 'help': 'DMI-TCAT instances', 'tooltip': 'List of DMI-TCAT instance URLs, e.g. ["http://username:password@tcat.instance.webpage.net"]. This needs to be formatted as a JSON list of strings.', 'default': {}}}
bin_data = {'all_bins': {}, 'last_collected': {}}
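
The "dmi-tcat-search.instances" setting above is the only 4CAT configuration this data source needs. A minimal sketch of a valid value, with placeholder hostnames and credentials (the URLs are illustrative; embedded HTTP basic auth is optional and is stripped from the hostnames shown to users):

# Illustrative value for the "dmi-tcat-search.instances" 4CAT setting.
# Hostnames and credentials are placeholders, not real instances.
EXAMPLE_INSTANCES = [
    "http://username:password@tcat.example.net",  # HTTP basic auth embedded in the URL
    "https://tcat.other-example.org/",            # trailing slashes are stripped by the code
]

The value must be valid JSON when entered in the 4CAT configuration, i.e. a JSON list of strings.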
@classmethod
def collect_all_bins(cls, force_update=False):

Requests bin information from TCAT instances
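
For orientation, the structure that collect_all_bins() builds up looks roughly like the sketch below. The field names are inferred from how the rest of this class reads them; the instance URL and bin contents are placeholders, not an authoritative bin-stats.php response:

    import datetime

    # Sketch of cls.bin_data after a successful collection; all values are placeholders.
    bin_data = {
        "all_bins": {
            "http://tcat.example.net": {        # one entry per configured instance
                "mybin": {                      # one entry per query bin
                    "type": "track",
                    "tweets_approximate": 12345,
                    "range": {
                        "first_tweet": "2023-01-01 00:00:00",
                        "last_tweet": "2023-06-30 23:59:59"
                    }
                }
            }
        },
        "last_collected": {
            "http://tcat.example.net": datetime.datetime(2023, 7, 1, 12, 0)
        }
    }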

@classmethod
def get_options(cls, parent_dataset=None, user=None):
164    @classmethod
165    def get_options(cls, parent_dataset=None, user=None):
166        """
167        Get data source options
168
169        This method takes the pre-defined options, but fills the 'bins' options
170        with bins currently available from the configured TCAT instances.
171
172        :param DataSet parent_dataset:  An object representing the dataset that
173        the processor would be run on
174        :param User user:  Flask user the options will be displayed for, in
175        case they are requested for display in the 4CAT web interface. This can
176        be used to show some options only to privileged users.
177        """
178        options = cls.options
179
180        cls.collect_all_bins()
181        if all([data.get("failed", False) for instance, data in cls.bin_data["all_bins"].items()]):
182            options["bin"] = {
183                "type": UserInput.OPTION_INFO,
184                "help": "Could not connect to DMI-TCAT instance(s)."
185            }
186            return options
187
188        options["bin"] = {
189            "type": UserInput.OPTION_CHOICE,
190            "options": {},
191            "help": "Query bin"
192        }
193
194        for instance, bins in cls.bin_data["all_bins"].items():
195            # make the host somewhat human-readable
196            # also strip out embedded HTTP auths
197            host = re.sub(r"^https?://", "", instance).split("@").pop()
198            for bin_name, bin in bins.items():
199                bin_key = "%s@%s" % (bin_name, host)
200                display_text = f"{bin_name}: {bin.get('tweets_approximate')} tweets from {bin.get('range').get('first_tweet')} to {bin.get('range').get('last_tweet')}"
201                options["bin"]["options"][bin_key] = display_text
202
203        return options

Get data source options

This method takes the pre-defined options, but fills the 'bins' options with bins currently available from the configured TCAT instances.

Parameters
  • DataSet parent_dataset: An object representing the dataset that the processor would be run on
  • User user: Flask user the options will be displayed for, in case they are requested for display in the 4CAT web interface. This can be used to show some options only to privileged users.
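
Each available bin ends up keyed as "binname@host" in the generated choice options, with a human-readable label built from the bin statistics. An illustrative sketch (the bin name, host and figures are placeholders):

    # Illustrative result of get_options(); no real instance is queried here.
    options = SearchWithinTCATBins.get_options()
    # options["bin"]["options"] might then look like:
    # {"mybin@tcat.example.net": "mybin: 12345 tweets from 2023-01-01 00:00:00 to 2023-06-30 23:59:59"}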
def get_items(self, query):
205    def get_items(self, query):
206        """
207        Use the DMI-TCAT tweet export to retrieve tweets
208
209        :param query:  Query parameters, as returned by validate_query
210        :return:  Yields tweets, converted to the Twitter APIv2 format
211        """
212        bin = self.parameters.get("bin")
213        bin_name = bin.split("@")[0]
214        bin_host = bin.split("@").pop()
215
216        # we cannot store the full instance URL as a parameter, because it may
217        # contain sensitive information (e.g. HTTP auth) - so we find the full
218        # instance URL again here
219        # while the parameter could be marked 'sensitive', the values would
220        # still show up in e.g. the HTML of the 'create dataset' form
221        available_instances = config.get("dmi-tcat-search.instances", [])
222        instance_url = ""
223        instance = None
224        for available_instance in available_instances:
225            hostname = re.sub(r"https?://", "", available_instance).split("@").pop().rstrip("/")
226            if hostname == bin_host:
227                instance_url = available_instance
228                instance = available_instance.rstrip("/")
229                break
230
231        if not instance_url:
232            return self.dataset.finish_with_error("Invalid DMI-TCAT instance name '%s'" % bin_host)
233
234        # Collect the bins again (ensure we have updated info in case bin is still active)
235        self.collect_all_bins(force_update=True)
236        # Add metadata to parameters
237        try:
238            current_bin = self.bin_data["all_bins"][instance][bin_name]
239        except KeyError:
240            return self.dataset.finish_with_error(f"Lost connection to TCAT instance {bin_host}")
241        # Add TCAT metadata to dataset
242        self.dataset.tcat_bin_data = current_bin
243        if current_bin.get("type") in ["follow", "track", "timeline", "geotrack"] and ("phrase_times" not in current_bin or "user_times" not in current_bin):
244            self.dataset.update_status("Warning: TCAT not updated to send phrase and user time ranges; consider updating TCAT if you would like to retain this bin metadata.")
245
246        # now get the parameters...
247        request_url = instance_url.rstrip("/") + "/analysis/mod.export_tweets.php"
248
249        # Allow for blank dates
250        if self.parameters.get("min_date"):
251            start_date = datetime.datetime.fromtimestamp(self.parameters.get("min_date")).strftime("%Y-%m-%d")
252        else:
253            first_tweet_timestamp = current_bin.get('range').get('first_tweet')
254            start_date = datetime.datetime.strptime(first_tweet_timestamp, "%Y-%m-%d %H:%M:%S").strftime("%Y-%m-%d")
255
256        end_date = datetime.datetime.fromtimestamp(self.parameters.get("max_date")).strftime("%Y-%m-%d") if self.parameters.get("max_date") else (datetime.datetime.now() + datetime.timedelta(days=1)).strftime("%Y-%m-%d")
257        parameters = {
258            "dataset": bin_name,
259            "query": self.parameters.get("query"),
260            "url_query": self.parameters.get("url"),
261            "media_url_query": self.parameters.get("url-media"),
262            "exclude": self.parameters.get("query-exclude"),
263            "from_user_name": self.parameters.get("user-name"),
264            "from_user_lang": self.parameters.get("user-language"),
265            "lang": self.parameters.get("tweet-language"),
266            "exclude_from_user_name": self.parameters.get("user-exclude"),
267            "from_source": re.sub(r"<[^>]+>", "", self.parameters.get("tweet-client", "")),
268            "startdate": start_date,
269            "enddate": end_date,
270            "replyto": "yes" if self.parameters.get("exclude-replies") == "exclude" else "no",
271            "whattodo": "",
272            "exportSettings": "urls,mentions,hashtags,media,",
273            "graph_resolution": "day",
274            "outputformat": "csv"
275        }
276
277        # for now we simply request the full CSV export of the bin with the
278        # given parameters, letting TCAT handle the full text search and so
279        # on
280        self.dataset.update_status("Searching for tweets on %s" % bin_host)
281        response = requests.get(request_url, params=parameters, stream=True)
282        if response.status_code != 200:
283            return self.dataset.finish_with_error("Query bin not available: received HTTP Error %i" % response.status_code)
284
285        # process the file in 1kB chunks, buffer as we go
286        # If a newline is encountered, the buffer is processed as a row of csv
287        # data. This works as long as there are no newlines in the csv itself,
288        # which is the case for TCAT exports. Processing as a stream is needed
289        # to avoid having to load the full file in memory
290        buffer = bytearray()
291        fieldnames = None
292        items = 0
293        encoding = None
294        for chunk in response.iter_content(chunk_size=1024):
295            # see if this chunk contains a newline, in which case we have a
296            # full line to process (e.g. as a tweet)
297            lines = []
298            buffer += bytearray(chunk)
299
300            if not encoding and len(buffer) > 3:
301                # response.encoding is not correct sometimes, since it does not
302                # indicate that the file uses a BOM, so sniff it instead once
303                # we have some bytes
304                encoding = sniff_encoding(buffer)
305
306            # split buffer by newlines and process each full line
307            # the last line is always carried over, since it may be incomplete
308            if b"\n" in buffer:
309                buffered_lines = buffer.split(b"\n")
310                lines = buffered_lines[:-1]
311                buffer = buffered_lines.pop()
312            elif not chunk:
313                # eof, process left-over data
314                lines = buffer.split(b"\n")
315
316            # and finally we can process the data
317            for line in lines:
318                # use a dummy csv reader to abstract away the annoying csv parsing
319                # this is quite a bit of overhead, but beats implementing csv parsing
320                # manually, and it's still reasonably fast (about 10k/second)
321                dummy_file = io.TextIOWrapper(io.BytesIO(line.replace(b"\0", b"")), encoding=encoding)
322                reader = csv.reader(dummy_file,
323                                    delimiter=",",
324                                    quotechar='"',
325                                    doublequote=True,
326                                    quoting=csv.QUOTE_MINIMAL)
327                row_data = next(reader)
328
329                if row_data and not fieldnames:
330                    # first line in file
331                    fieldnames = row_data.copy()
332
333                elif row_data:
334                    tweet = dict(zip(fieldnames, row_data))
335                    items += 1
336
337                    if items % 250 == 0:
338                        self.dataset.update_status("Loaded %i tweets from bin %s@%s" % (items, bin_name, bin_host))
339
340                    yield self.tcat_to_APIv2(tweet)
341
342            if not chunk:
343                # end of file
344                break

Use the DMI-TCAT tweet export to retrieve tweets

Parameters
  • query: Query parameters, as returned by validate_query
Returns

Tweets converted to the Twitter APIv2 format, yielded one at a time
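
The streaming pattern used here can be shown in isolation: request TCAT's CSV export with stream=True, buffer bytes until a full line is available, and parse each line with the csv module so the whole export never has to fit in memory. The sketch below is a simplified standalone version; the instance URL, bin name and dates are placeholders, and the encoding sniffing done by the real method is left out:

    import csv
    import io
    import requests

    # Placeholder instance and bin; mod.export_tweets.php streams one CSV row per tweet.
    url = "http://tcat.example.net/analysis/mod.export_tweets.php"
    params = {"dataset": "mybin", "startdate": "2023-01-01", "enddate": "2023-07-01",
              "whattodo": "", "graph_resolution": "day", "outputformat": "csv",
              "exportSettings": "urls,mentions,hashtags,media,"}

    response = requests.get(url, params=params, stream=True)
    response.raise_for_status()

    fieldnames = None
    buffer = bytearray()
    for chunk in response.iter_content(chunk_size=1024):
        buffer += chunk
        # process every complete line in the buffer; the trailing partial line is kept
        # (this relies on the export containing no newlines inside field values)
        while b"\n" in buffer:
            line, _, buffer = buffer.partition(b"\n")
            if not line.strip():
                continue
            row = next(csv.reader(io.StringIO(line.decode("utf-8", errors="replace"))))
            if fieldnames is None:
                fieldnames = row                    # header row
            else:
                tweet = dict(zip(fieldnames, row))  # one TCAT tweet as a dict
                print(tweet.get("id"), tweet.get("text", "")[:60])
    # a trailing line without a final newline would still need to be flushed here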
@staticmethod
def tcat_to_4cat_time(tcat_time):
346    @staticmethod
347    def tcat_to_4cat_time(tcat_time):
348        """
349        Twitter APIv2 time is in the format "%Y-%m-%dT%H:%M:%S.000Z", while TCAT uses either "%Y-%m-%d %H:%M:%S" or a Unix timestamp.
350
351        :return str:  Time formatted as a Twitter APIv2 timestamp
352        """
353        try:
354            tcat_time = int(tcat_time)
355            return datetime.datetime.fromtimestamp(tcat_time).strftime("%Y-%m-%dT%H:%M:%S.000Z")
356        except ValueError:
357            return datetime.datetime.strptime(tcat_time, "%Y-%m-%d %H:%M:%S").strftime("%Y-%m-%dT%H:%M:%S.000Z")

Twitter APIv2 time is in the format "%Y-%m-%dT%H:%M:%S.000Z", while TCAT uses either "%Y-%m-%d %H:%M:%S" or a Unix timestamp.

Returns

Time formatted as a Twitter APIv2 timestamp string
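
Both input forms normalise to the same APIv2-style string. Note that the Unix-timestamp branch goes through datetime.datetime.fromtimestamp(), so its exact output depends on the local timezone of the machine running the conversion:

    SearchWithinTCATBins.tcat_to_4cat_time("2021-03-01 12:30:00")
    # -> "2021-03-01T12:30:00.000Z"
    SearchWithinTCATBins.tcat_to_4cat_time("1614601800")
    # -> the same instant rendered in the local timezone, e.g. "2021-03-01T12:30:00.000Z" on a UTC machine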
@staticmethod
def tcat_to_APIv2(tcat_tweet):
359    @staticmethod
360    def tcat_to_APIv2(tcat_tweet):
361        """
362        Attempt to construct a tweet in the format 4CAT gathers from the Twitter APIv2, so that Twitter-specific processors can be used!
363
364        A great deal of information is missing, which may cause some issues. Notes are kept for the expected
365        type and, if the data is missing in TCAT, None is used. Therefore it should be possible to refactor processors
366        to handle None if necessary.
367        """
368        # We're missing lots of data here...
369
370        urls = [url.strip() for url in (tcat_tweet["urls_expanded"].split(";") if tcat_tweet["urls_expanded"] else tcat_tweet["urls_followed"].split(";")) if url]
371        # TCAT media_id: 7 = video, 3 = photo, 16 = animated_gif
372        media_type = "video" if tcat_tweet["media_id"] == "7" else "photo" if tcat_tweet["media_id"] == "3" else "animated_gif" if tcat_tweet["media_id"] == "16" else tcat_tweet["media_id"]
373
374        # 4CAT Twitter APIv2 result data structure
375        APIv2_tweet = {
376            "lang": tcat_tweet["lang"],  # str
377            "source": tcat_tweet["source"],  # REMOVED FROM TWITTER API v2
378            "possibly_sensitive": True if tcat_tweet["possibly_sensitive"] == "1" else False if tcat_tweet["possibly_sensitive"] == "0" else None,  # bool; the CSV export yields strings
379            "text": tcat_tweet["text"],  # str
380            "edit_history_tweet_ids": None,  # list; Missing in TCAT data
381            "public_metrics": {
382                "retweet_count": tcat_tweet["retweet_count"],  # int
383                "reply_count": None,  # int; Missing in TCAT data
384                "like_count": tcat_tweet["favorite_count"],  # int
385                "quote_count": None,  # int; Missing in TCAT data
386                "impression_count": None,  # int; Missing in TCAT data
387                # TCAT has also favorite_count
388            },
389            "entities": {
390                "mentions": [{
391                    "id": None,  # str; Missing in TCAT data
392                    "username": mention.strip(),  # str
393                    # Twitter v2 API has additional user fields
394                } for mention in tcat_tweet["mentions"].split(";") if mention],
395                "annotations": None,  # list; Missing in TCAT data
396                "urls": [{
397                    "url": url,  # str
398                    "expanded_url": url,  # str
399                    # Twitter v2 API has additional URL fields
400                } for url in urls],
401                "hashtags": [{
402                    "tag": hashtag.strip(),  # str
403                    "start": None,  # int; Missing in TCAT data
404                    "end": None,  # int; Missing in TCAT data
405                } for hashtag in tcat_tweet["hashtags"].split(";") if hashtag],
406                "cashtags": None,  # list; Missing in TCAT data
407            },
408            "created_at": SearchWithinTCATBins.tcat_to_4cat_time(tcat_tweet["time"]),  # str
409            "id": tcat_tweet["id"],  # str
410            "author_id": tcat_tweet["from_user_id"],  # str
411            "context_annotations": None,  # list; Missing in TCAT data
412            "reply_settings": None,  # str; Missing in TCAT data
413            "conversation_id": None,  # str; TCAT has a in_reply_to_status_id but this is not necessarily the original Tweet that started the conversation
414            "author_user": {
415                "protected": None,  # bool; Missing in TCAT data
416                "verified": True if tcat_tweet["from_user_verified"] == "1" else False if tcat_tweet["from_user_verified"] == "0" else None,  # bool; the CSV export yields strings
417                "created_at": SearchWithinTCATBins.tcat_to_4cat_time(tcat_tweet["from_user_created_at"]),  # str
418                "name": tcat_tweet["from_user_realname"],  # str
419                "entities": {
420                    "description": None,  # dict; contains entities from author description such as mentions, URLs, etc.; Missing in TCAT data
421                    "url": None,  # dict; contains entities from author url e.g. URL data; Missing in TCAT data
422                },
423                "description": tcat_tweet["from_user_description"],  # str
424                "pinned_tweet_id": None,  # str; Missing in TCAT data
425                "profile_image_url": tcat_tweet["from_user_profile_image_url"],  # str
426                "url": tcat_tweet["from_user_url"],  # str
427                "username": tcat_tweet["from_user_name"],  # str
428                "id": tcat_tweet["from_user_id"],  # str
429                "location": None,  # str; Missing in TCAT data
430                "public_metrics": {
431                    "followers_count": tcat_tweet["from_user_followercount"],  # int
432                    "following_count": tcat_tweet["from_user_friendcount"],  # int
433                    "tweet_count": tcat_tweet["from_user_tweetcount"],  # int
434                    "listed_count": tcat_tweet["from_user_listed"],  # int
435                    # TCAT has also from_user_favourites_count
436                },
437                "withheld": {
438                    "country_codes": tcat_tweet["from_user_withheld_scope"].split(";"),  # list; TODO TCAT has this column, but it has not been seen populated in testing... this is a guess
439                },
440                # TCAT has also from_user_lang, from_user_utcoffset, from_user_timezone
441            },
442            "attachments": {
443                # TCAT has some media data, but not the URLs listed
444                "media_keys": [{
445                    "type": media_type,
446                    "url": ",".join([url for url in urls if (url.split("/")[-2] if len(url.split("/")) > 1 else "") in ["photo"]]),  # str; TCAT does not have the URL though it may be in the list of URLs
447                    "variants": [{"url": ",".join([url for url in urls if (url.split("/")[-2] if len(url.split("/")) > 1 else "") in ["video"]]), "bit_rate":0}]  # list; This is not the expected direct link to video, but it is a URL to the video
448                    # Twitter API v2 has additional data
449                }],  # list; TCAT seems to only have one type of media per tweet
450                "poll_ids": None,  # list; Missing from TCAT data
451            },
452            "geo": {
453                "place_id": None,  # str; Missing from TCAT data
454                "place": {
455                    "country": None,  # str; Missing from TCAT data
456                    "id": None,  # str; Missing from TCAT data
457                    "geo": {
458
459                    },
460                    "country_code": None,  # str; Missing from TCAT data
461                    "name": tcat_tweet["location"],  # str
462                    "place_type": None,  # str; Missing from TCAT data
463                    "full_name": tcat_tweet["location"],  # str
464                },
465                "coordinates": {
466                    "type": None,  # str; Missing from TCAT data
467                    "coordinates": [tcat_tweet["lng"], tcat_tweet["lat"]],  # list i.e. [longitude, latitude]
468                },
469            },
470            "withheld": {
471                "copyright": True if tcat_tweet["withheld_copyright"] == "1" else False if tcat_tweet["withheld_copyright"] == "0" else None,  # bool; TODO TCAT has this column, but it has not been seen populated in testing... this is a guess
472                "country_codes": tcat_tweet["withheld_scope"].split(";"),  # list; TODO TCAT has this column, but it has not been seen populated in testing... this is a guess
473            },
474        }
475
476        # Referenced Tweets; Twitter API v2 has entire tweet data here which we will be missing
477        referenced_tweets = []
478        if tcat_tweet["text"][:4] == "RT @":
479            # Retweet
480            referenced_tweets.append({
481                "type": "retweeted",
482                "id": None,  # str; Missing in TCAT data
483            })
484        if tcat_tweet["quoted_status_id"]:
485            # Quote
486            referenced_tweets.append({
487                "type": "quoted",
488                "id": tcat_tweet["quoted_status_id"],  # str; from TCAT's quoted_status_id column
489            })
490        if tcat_tweet["in_reply_to_status_id"]:
491            # Reply
492            referenced_tweets.append({
493                "type": "replied_to",
494                "id": tcat_tweet["in_reply_to_status_id"],  # str; from TCAT's in_reply_to_status_id column
495            })
496            # These should NOT be None in case a processor/user attempts to identify a reply using these
497            APIv2_tweet["in_reply_to_user_id"] = "UNKNOWN"  # str; Missing from TCAT data
498            APIv2_tweet["in_reply_to_user"] = {"username": "UNKNOWN"}  # dict; Missing from TCAT data
499
500        APIv2_tweet["referenced_tweets"] = referenced_tweets  # list
501
502        # Append any extra TCAT data
503        additional_TCAT_data = {}
504        for field in SearchWithinTCATBins.additional_TCAT_fields:
505            additional_TCAT_data["TCAT_"+field] = tcat_tweet[field]
506        APIv2_tweet.update(additional_TCAT_data)
507
508        return APIv2_tweet

Attempt to construct a tweet in the format 4CAT gathers from the Twitter APIv2, so that Twitter-specific processors can be used!

A great deal of information is missing, which may cause some issues. Notes are kept for the expected type and, if the data is missing in TCAT, None is used. Therefore it should be possible to refactor processors to handle None if necessary.
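
Because many of the APIv2 fields above are padded with None, code consuming these items should read them defensively. A minimal sketch; the variable apiv2_tweet is assumed to be the output of tcat_to_APIv2() for one CSV row:

    # Defensive reads of fields that tcat_to_APIv2() cannot fill from TCAT data.
    reply_count = apiv2_tweet["public_metrics"]["reply_count"] or 0
    place_name = (apiv2_tweet.get("geo") or {}).get("place", {}).get("name", "")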

@staticmethod
def validate_query(query, request, user):
510    @staticmethod
511    def validate_query(query, request, user):
512        """
513        Validate DMI-TCAT query input
514
515        :param dict query:  Query parameters, from client-side.
516        :param request:  Flask request
517        :param User user:  User object of user who has submitted the query
518        :return dict:  Safe query parameters
519        """
520        # no query 4 u
521        if not query.get("bin", "").strip():
522            raise QueryParametersException("You must choose a query bin to get tweets from.")
523
524        # Dates need to make sense as a range to search within
525        after, before = query.get("daterange")
526        if (after and before) and before <= after:
527            raise QueryParametersException("A date range must start before it ends")
528
529        query["min_date"], query["max_date"] = query.get("daterange")
530        del query["daterange"]
531
532        # simple!
533        return query

Validate DMI-TCAT query input

Parameters
  • dict query: Query parameters, from client-side.
  • request: Flask request
  • User user: User object of user who has submitted the query
Returns

Safe query parameters
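
An illustrative call; the bin key and timestamps are placeholders. The daterange tuple is split into min_date/max_date and removed from the returned parameters:

    query = {
        "bin": "mybin@tcat.example.net",
        "query": "climate",
        "daterange": (1672531200, 1688169599),
    }
    safe = SearchWithinTCATBins.validate_query(query, request=None, user=None)
    # safe == {"bin": "mybin@tcat.example.net", "query": "climate",
    #          "min_date": 1672531200, "max_date": 1688169599}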

@staticmethod
def map_item(item):
535    @staticmethod
536    def map_item(item):
537        """
538        Use Twitter APIv2 map_item
539        """
540        mapped_tweet = SearchWithTwitterAPIv2.map_item(item)
541
542        # Add TCAT extra data
543        data = mapped_tweet.get_item_data()
544        message = mapped_tweet.get_message()
545        for field in SearchWithinTCATBins.additional_TCAT_fields:
546            data["TCAT_" + field] = item.get("TCAT_" + field)
547
548        return MappedItem(data, message)

Use Twitter APIv2 map_item
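
A sketch of mapping one stored NDJSON item back to a flat row; item is assumed to be one deserialised line of a dataset written by this data source:

    mapped = SearchWithinTCATBins.map_item(item)
    row = mapped.get_item_data()
    # the standard APIv2 columns are present, plus the extra TCAT columns, e.g.:
    row["TCAT_favorite_count"]
    row["TCAT_from_user_lang"]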