Edit on GitHub

datasources.vk.search_vk

VK keyword search

  1"""
  2VK keyword search
  3"""
  4import datetime
  5from pathlib import Path
  6
  7import vk_api
  8
  9from backend.lib.search import Search
 10from common.lib.exceptions import QueryParametersException, ProcessorInterruptedException, ProcessorException
 11from common.lib.helpers import UserInput
 12from common.lib.item_mapping import MappedItem
 13from common.config_manager import config
 14
 15
class SearchVK(Search):
    """
    Get posts via the VK API

    Logs in to VK with credentials supplied by the user, runs a News Feed
    keyword search and, if requested, also collects the comments on every
    post that was found.
    """
    type = "vk-search"  # job ID
    title = "VK"
    extension = "ndjson"
    is_local = False    # Whether this datasource is locally scraped
    is_static = False   # Whether this datasource is still updated

    # NOTE(review): these two attributes are not referenced by any method of
    # this class; presumably they are read by the Search base class or the
    # frontend - confirm
    previous_request = 0
    import_issues = True

    references = [
        "[VK API documentation](https://vk.com/dev/first_guide)",
        "[Python API wrapper](https://github.com/python273/vk_api)"
    ]

    # Comma-separated list sent as the "fields" parameter of the
    # newsfeed.search and wall.getComments requests made by this class
    expanded_profile_fields = "id,screen_name,first_name,last_name,name,deactivated,is_closed,is_admin,sex,city,country,photo_200,photo_100,photo_50,followers_count,members_count"  # https://vk.com/dev/objects/user & https://vk.com/dev/objects/group
 35
 36    @classmethod
 37    def get_options(cls, parent_dataset=None, user=None):
 38        """
 39        Get VK data source options
 40
 41        :param parent_dataset:  Should always be None
 42        :param user:  User to provide options for
 43        :return dict:  Data source options
 44        """
 45
 46        intro_text = ("This data source uses VK's [API](https://vk.com/dev/first_guide) and a python "
 47                      "[wrapper](https://github.com/python273/vk_api) to request information from VK using your "
 48                      "username and password.")
 49
 50        options = {
 51            "intro-1": {
 52                "type": UserInput.OPTION_INFO,
 53                "help": intro_text
 54            },
 55            "query_type": {
 56                "type": UserInput.OPTION_CHOICE,
 57                "help": "Query Type",
 58                "options": {
 59                    "newsfeed": "News Feed search",
 60                },
 61                "default": "newsfeed"
 62            },
 63            "intro-2": {
 64                "type": UserInput.OPTION_INFO,
 65                "help": "Your username and password will be deleted after your query is complete."
 66            },
 67            "username": {
 68                "type": UserInput.OPTION_TEXT,
 69                "sensitive": True,
 70                "cache": True,
 71                "help": "VK Username"
 72            },
 73            "password": {
 74                "type": UserInput.OPTION_TEXT,
 75                "sensitive": True,
 76                "cache": True,
 77                "help": "VK Password"
 78            },
 79            "intro-3": {
 80                "type": UserInput.OPTION_INFO,
 81                "help": "Enter the text to search for below."
 82            },
 83            "query": {
 84                "type": UserInput.OPTION_TEXT_LARGE,
 85                "help": "Query"
 86            },
 87            "amount": {
 88                "type": UserInput.OPTION_TEXT,
 89                "help": "Max items to retrieve",
 90                "min": 0,
 91                "max": 1000,
 92                "default": 100
 93            },
 94            "include_comments": {
 95                "type": UserInput.OPTION_TOGGLE,
 96                "help": "Include post comments",
 97                "default": False,
 98                "tooltip": ""
 99            },
100            "divider-2": {
101                "type": UserInput.OPTION_DIVIDER
102            },
103            "daterange-info": {
104                "type": UserInput.OPTION_INFO,
105                "help": "VK daterange defaults vary by type of query. For the News Feed, posts are returned starting "
106                        "with the most recent and working backwards."
107            },
108            "daterange": {
109                "type": UserInput.OPTION_DATERANGE,
110                "help": "Date range"
111            },
112        }
113
114        return options
115
116    def get_items(self, query):
117        """
118        Use the VK API
119
120        :param query:
121        :return:
122        """
123        if self.parameters.get("username") is None or self.parameters.get("password") is None:
124            self.dataset.update_status(
125                "VK query failed or was interrupted; please create new query in order to provide username and password again.",
126                is_final=True)
127            return []
128
129        self.dataset.update_status(f"Logging in to VK")
130        try:
131            vk_session = self.login(self.parameters.get("username"), self.parameters.get("password"))
132        except vk_api.exceptions.AuthError as e:
133            self.log.warning(f"VK Auth Issues: {e}")
134            self.dataset.update_status(f"VK unable to authorize user: {e}", is_final=True)
135            return []
136
137        query_type = self.parameters.get("query_type")
138        query = self.parameters.get("query")
139        include_comments = self.parameters.get("include_comments", False)
140
141        if query_type == "newsfeed":
142            query_parameters = {"query": query,
143                                "max_amount": self.parameters.get("amount")}
144
145            # Add start and end dates if provided
146            if self.parameters.get("min_date"):
147                query_parameters['start_time'] = self.parameters.get("min_date")
148            if self.parameters.get("max_date"):
149                query_parameters['end_time'] = self.parameters.get("max_date")
150
151            vk_helper = vk_session.get_api()
152
153            # Collect Newsfeed results
154            num_results = 0
155            self.dataset.update_status(f"Submitting query...")
156            for i, result_batch in enumerate(self.search_newsfeed(vk_helper, **query_parameters)):
157                if self.interrupted:
158                    raise ProcessorInterruptedException("Interrupted while fetching newsfeed data from the VK API")
159
160                self.dataset.update_status(f"Processing results batch {i+1}")
161                for result in result_batch:
162                    result.update({'4cat_item_type': 'post'})
163                    yield result
164                    num_results += 1
165
166                    if include_comments:
167                        for comment in self.collect_all_comments(vk_helper, owner_id=result.get("owner_id"), post_id=result.get("id")):
168                            comment.update({'4cat_item_type': 'comment'})
169                            yield comment
170
171                    self.dataset.update_status(f"Received {num_results} results of max {self.parameters.get('amount')} from the VK API")
172                    self.dataset.update_progress(num_results / self.parameters.get('amount'))
173
174    def login(self, username, password):
175        """
176        Login and authenticate user
177        """
178        vk_session = vk_api.VkApi(username,
179                                  password,
180                                  config_filename=Path(config.get("PATH_ROOT")).joinpath(config.get("PATH_SESSIONS"), username+"-vk_config.json"))
181        vk_session.auth()
182
183        return vk_session
184
185    def search_newsfeed(self, vk_helper, query, max_amount, num_collected=0, start_time=None, end_time=None, **kwargs):
186        """
187        Collects all newsfeed posts
188
189        :param Object vk_helper:    Authorized vk_api.VkApi
190        :param str query:           String representing the search query
191        :param int max_amount:      Max number of posts to collect
192        :param int num_collected:   Number of previously collected results
193        :param int start_time:      Timestamp for earliest post
194        :param int end_time:        Timestamp for latest post
195        :return generator:          Yields groups of posts
196        """
197        remaining = max_amount - num_collected
198        parameters = {
199            "q": query,
200            "extended": 1,
201            "count": remaining if remaining < 200 else 200,
202            "fields": self.expanded_profile_fields,
203        }
204        if start_time:
205            parameters["start_time"] = start_time
206        if end_time:
207            parameters["end_time"] = end_time
208
209        response = vk_helper.newsfeed.search(**parameters)
210        news_feed_results = response.get("items", [])
211        num_collected = num_collected + len(news_feed_results)
212
213        # Flesh out profiles and groups
214        author_profiles = self.expand_profile_fields({"profiles": response.get("profiles", []), "groups": response.get("groups", [])})
215        [result.update({"author_profile": author_profiles.get(result.get("from_id"), {})}) for result in news_feed_results]
216
217        yield news_feed_results
218
219        # Collect additional results
220        if response.get("next_from") and num_collected < max_amount:
221            parameters.update({"start_from": response.get("next_from")})
222            for additional_results in self.search_newsfeed(vk_helper, query, max_amount, num_collected=num_collected, **parameters):
223                yield additional_results
224
225    def collect_all_comments(self, vk_helper, owner_id, post_id):
226        """
227        Collects all comments and replies to a VK post
228
229        :param Object vk_helper:           Authorized vk_api.VkApi
230        :param int owner_id:            Owner ID provided by post/comment/etc
231        :param int post_id:             ID of post from which to collect comments
232        :return generator:              Yields comments and replies
233        """
234        # Collect top level comments from post
235        comments = self.get_comments(vk_helper, owner_id, post_id=post_id)
236
237        # Extract replies and collect more if needed
238        for comment in comments:
239            yield comment
240
241            reply_count = comment.get("thread", {}).get("count", 0)
242            replies = comment.get("thread", {}).get("items", [])
243            if reply_count > 10 and len(replies) == 10:
244                # Collect additional replies
245                replies += self.get_comments(vk_helper, owner_id, comment_id=comment.get("id"), last_collected_id=replies[-1].get("id"))[1:]
246
247            for reply in replies:
248                yield reply
249                if reply.get("thread"):
250                    self.log.warning("VK Datasource issue with replies: additional depth needs to be handled; contact 4CAT devs")
251                    # TODO: this will need modification if reply threads gain depth
252
253    def get_comments(self, vk_helper, owner_id, post_id=None, comment_id=None, last_collected_id=None, **kwargs):
254        """
255        Collect comments from either a post or another comment (i.e., replies to another comment). Must provide either
256        post_id or comment_id, but not both.
257
258        More information can be found here:
259        https://vk.com/dev/wall.getComments
260
261        :param Object vk_helper:       Authorized vk_api.VkApi
262        :param int owner_id:            Owner ID provided by post/comment/etc
263        :param int post_id:             ID of post from which to collect comments
264        :param int comment_id:          ID of comment from which to collect comments
265        :param int last_collected_id:   ID of the last comment to collected; used as start to continue collecting comments
266        :return list:                   List of comments
267        """
268        if self.interrupted:
269            raise ProcessorInterruptedException("Interrupted while fetching comments from the VK API")
270
271        if post_id is None and comment_id is None:
272            raise ProcessorException("Must provide either post_id or comment_id to collect comments from VK")
273
274        parameters = {
275            "owner_id": owner_id,
276            "need_likes": 1,
277            "preview_length": 0,
278            "extended": 1,
279            "count": 100,
280            "thread_items_count": 10,
281            "fields": self.expanded_profile_fields,
282        }
283        if post_id:
284            parameters.update({"post_id": post_id})
285        if comment_id:
286            parameters.update({"comment_id": comment_id})
287        if last_collected_id:
288            parameters.update({"start_comment_id": last_collected_id})
289
290        # Collect comments from VK
291        try:
292            response = vk_helper.wall.getComments(**parameters)
293        except vk_api.exceptions.ApiError as e:
294            self.dataset.log(f"Unable to collect comments for owner_id {owner_id} and {'post_id' if post_id is not None else 'comment_id'} {post_id if post_id is not None else comment_id}: {e}")
295            return []
296        comments = response.get("items", [])
297
298        # Flesh out profiles and groups
299        author_profiles = self.expand_profile_fields({"profiles": response.get("profiles", []), "groups": response.get("groups", [])})
300        [comment.update({"author_profile": author_profiles.get(comment.get("from_id"), {})}) for comment in comments]
301        # Also expand replies
302        [reply.update({"author_profile": author_profiles.get(reply.get("from_id"), {})}) for replies in [comment.get("thread", {}).get("items", []) for comment in comments if comment.get("thread")] for reply in replies]
303
304        # Check if there are potentially additional comments
305        if response.get("count") > 100 and len(comments) == 100:
306            # Update params with last collected comment
307            parameters.update({"start_comment_id": comments[-1].get("id")})
308            # Collect additional comments from VK and remove first comment (which is duplicate)
309            comments += self.get_comments(vk_helper=vk_helper, **parameters)[1:]
310
311        return comments
312
313    @ staticmethod
314    def expand_profile_fields(dict_of_profile_types):
315        """
316        Combine various VK profile and group author information for easy lookup. Add 4CAT_author_profile_type field to
317        differentiate source of data later.
318        """
319        author_types = {}
320        for profile_type, profiles in dict_of_profile_types.items():
321            for profile in profiles:
322                if "id" not in profile:
323                    raise ProcessorException("Profile missing id field; VK data format incorrect/changed")
324                elif profile.get("id") in author_types:
325                    raise ProcessorException("Profile id duplicated across profile types; unable to combine profiles")
326                profile.update({"4CAT_author_profile_type": profile_type})
327                author_types[profile.get("id")] = profile
328        return author_types
329
330    @staticmethod
331    def validate_query(query, request, user):
332        """
333        Validate input for a dataset query on the VK data source.
334
335        Will raise a QueryParametersException if invalid parameters are
336        encountered. Parameters are additionally sanitised.
337
338        :param dict query:  Query parameters, from client-side.
339        :param request:  Flask request
340        :param User user:  User object of user who has submitted the query
341        :return dict:  Safe query parameters
342        """
343        # Please provide something...
344        if not query.get("query", None):
345            raise QueryParametersException("Please provide a query.")
346
347        # the dates need to make sense as a range to search within
348        # but, on VK, you can also specify before *or* after only
349        after, before = query.get("daterange")
350        if before and after and before < after:
351            raise QueryParametersException("Date range must start before it ends")
352
353        # TODO: test username and password?
354
355        # if we made it this far, the query can be executed
356        params = {
357            "query":  query.get("query"),
358            "query_type": query.get("query_type"),
359            "amount": query.get("amount"),
360            "include_comments": query.get("include_comments"),
361            "min_date": after,
362            "max_date": before,
363            "username": query.get("username"),
364            "password": query.get("password"),
365        }
366
367        return params
368
369    @staticmethod
370    def map_item(item):
371        """
372        Map a nested VK object to a flat dictionary
373
374        :param item:  VK object as originally returned by the VK API
375        :return dict:  Dictionary in the format expected by 4CAT
376        """
377        vk_item_time = datetime.datetime.fromtimestamp(item.get('date'))
378
379        # Process attachments
380        photos = []
381        videos = []
382        audio = []
383        links = []
384        docs = []
385        for attachment in item.get("attachments", []):
386            attachment_type = attachment.get("type")
387            attachment = attachment.get(attachment_type)
388            if attachment_type == "photo":
389                if attachment.get("sizes"):
390                    photos.append(sorted(attachment.get("sizes"), key=lambda d: d['width'], reverse=True)[0].get('url'))
391                else:
392                    photos.append(str(attachment))
393            elif attachment_type == "video":
394                # TODO: can I get the actual URL? Does not seem like it...
395                videos.append(f"https://vk.com/video{attachment.get('owner_id')}_{attachment.get('id')}")
396            elif attachment_type == "audio":
397                # TODO: Seem unable to create the URL with provided information...
398                audio.append(f"{attachment.get('artist')} - {attachment.get('title')}")
399            elif attachment_type == "link":
400                links.append(attachment.get('url', str(attachment)))
401            elif attachment_type == "doc":
402                docs.append(attachment.get('url', str(attachment)))
403
404        # Use 4cat_item_type to populate different fields
405        tread_id = ""
406        in_reply_to_user = ""
407        in_reply_to_comment_id = ""
408        if item.get("4cat_item_type") == "post":
409            tread_id = item.get("id")
410        elif item.get("4cat_item_type") == "comment":
411            tread_id = item.get("post_id")
412            in_reply_to_user = item.get("reply_to_user")
413            in_reply_to_comment_id = item.get("reply_to_comment")
414
415        author_profile = item.get("author_profile", {})
416        profile_source = "user" if author_profile.get("4CAT_author_profile_type") == "profile" else "community" if author_profile.get("4CAT_author_profile_type") == "group" else "N/A"
417        # Use source of author profile if "type" not present (e.g., in users profiles do not seem to have type)
418        author_type = author_profile.get("type", profile_source)
419
420        return MappedItem({
421            "id": item.get("id"),
422            "thread_id": tread_id,
423            "timestamp": vk_item_time.strftime("%Y-%m-%d %H:%M:%S"),
424            "unix_timestamp": int(vk_item_time.timestamp()),
425            "link": f"https://vk.com/wall{item.get('owner_id')}_{item.get('id')}",
426            "item_type": item.get("4cat_item_type"),
427            "body": item.get("text"),
428            "author_id": item.get("from_id"),
429            "author_type": author_type,
430            "author_screen_name": author_profile.get("screen_name"),
431            "author_name": author_profile.get("name", " ".join([author_profile.get("first_name", ""), author_profile.get("last_name", "")])),
432            "author_sex": "F" if author_profile.get("sex") == 1 else "M" if author_profile.get("sex") == 2 else "Not Specified" if author_profile.get("sex") == 0 else author_profile.get("sex", "N/A"),
433            "author_city": author_profile.get("city", {}).get("title", ""),
434            "author_country": author_profile.get("country", {}).get("title", ""),
435            "author_photo": author_profile.get("photo_200",
436                                               author_profile.get("photo_100", author_profile.get("photo_50", ""))),
437            "author_is_admin": True if author_profile.get("is_admin") == 1 else False if author_profile.get("is_admin") == 0 else author_profile.get("is_admin", "N/A"),
438            "author_is_advertiser": True if author_profile.get("is_advertiser") == 1 else False if author_profile.get(
439                "is_advertiser") == 0 else author_profile.get("is_advertiser", "N/A"),
440            "author_deactivated": author_profile.get("is_deactivated", False),
441            "author_privacy_is_closed": 'closed' if author_profile.get("is_closed") == 1 else 'open' if author_profile.get("is_closed") == 0 else 'private' if author_profile.get("is_closed") == 2 else author_profile.get("is_closed", "N/A"),
442            "author_followers": author_profile.get("followers_count", author_profile.get("members_count", "N/A")),
443            "in_reply_to_user": in_reply_to_user,
444            "in_reply_to_comment_id": in_reply_to_comment_id,
445            "source": item.get("post_source", {}).get("type"),
446            "views": item.get("views", {}).get("count"),
447            "likes": item.get("likes", {}).get("count"),
448            "post_comments": item.get("comments", {}).get("count"),
449            "edited": datetime.datetime.fromtimestamp(item.get("edited")).strftime("%Y-%m-%d %H:%M:%S") if item.get("edited", False) else False,
450            "photos": ", ".join(photos),
451            "videos": ", ".join(videos),
452            "audio": ", ".join(audio),
453            "links": ", ".join(links),
454            "docs": ", ".join(docs),
455            "subject": "",
456        })
class SearchVK(backend.lib.search.Search):
class SearchVK(Search):
    """
    Get posts via the VK API

    Logs in to VK with credentials supplied by the user, runs a News Feed
    keyword search and, if requested, also collects the comments on every
    post that was found.
    """
    type = "vk-search"  # job ID
    title = "VK"
    extension = "ndjson"
    is_local = False    # Whether this datasource is locally scraped
    is_static = False   # Whether this datasource is still updated

    # NOTE(review): these two attributes are not referenced by any method of
    # this class; presumably they are read by the Search base class or the
    # frontend - confirm
    previous_request = 0
    import_issues = True

    references = [
        "[VK API documentation](https://vk.com/dev/first_guide)",
        "[Python API wrapper](https://github.com/python273/vk_api)"
    ]

    # Comma-separated list sent as the "fields" parameter of the
    # newsfeed.search and wall.getComments requests made by this class
    expanded_profile_fields = "id,screen_name,first_name,last_name,name,deactivated,is_closed,is_admin,sex,city,country,photo_200,photo_100,photo_50,followers_count,members_count"  # https://vk.com/dev/objects/user & https://vk.com/dev/objects/group
 36
 37    @classmethod
 38    def get_options(cls, parent_dataset=None, user=None):
 39        """
 40        Get VK data source options
 41
 42        :param parent_dataset:  Should always be None
 43        :param user:  User to provide options for
 44        :return dict:  Data source options
 45        """
 46
 47        intro_text = ("This data source uses VK's [API](https://vk.com/dev/first_guide) and a python "
 48                      "[wrapper](https://github.com/python273/vk_api) to request information from VK using your "
 49                      "username and password.")
 50
 51        options = {
 52            "intro-1": {
 53                "type": UserInput.OPTION_INFO,
 54                "help": intro_text
 55            },
 56            "query_type": {
 57                "type": UserInput.OPTION_CHOICE,
 58                "help": "Query Type",
 59                "options": {
 60                    "newsfeed": "News Feed search",
 61                },
 62                "default": "newsfeed"
 63            },
 64            "intro-2": {
 65                "type": UserInput.OPTION_INFO,
 66                "help": "Your username and password will be deleted after your query is complete."
 67            },
 68            "username": {
 69                "type": UserInput.OPTION_TEXT,
 70                "sensitive": True,
 71                "cache": True,
 72                "help": "VK Username"
 73            },
 74            "password": {
 75                "type": UserInput.OPTION_TEXT,
 76                "sensitive": True,
 77                "cache": True,
 78                "help": "VK Password"
 79            },
 80            "intro-3": {
 81                "type": UserInput.OPTION_INFO,
 82                "help": "Enter the text to search for below."
 83            },
 84            "query": {
 85                "type": UserInput.OPTION_TEXT_LARGE,
 86                "help": "Query"
 87            },
 88            "amount": {
 89                "type": UserInput.OPTION_TEXT,
 90                "help": "Max items to retrieve",
 91                "min": 0,
 92                "max": 1000,
 93                "default": 100
 94            },
 95            "include_comments": {
 96                "type": UserInput.OPTION_TOGGLE,
 97                "help": "Include post comments",
 98                "default": False,
 99                "tooltip": ""
100            },
101            "divider-2": {
102                "type": UserInput.OPTION_DIVIDER
103            },
104            "daterange-info": {
105                "type": UserInput.OPTION_INFO,
106                "help": "VK daterange defaults vary by type of query. For the News Feed, posts are returned starting "
107                        "with the most recent and working backwards."
108            },
109            "daterange": {
110                "type": UserInput.OPTION_DATERANGE,
111                "help": "Date range"
112            },
113        }
114
115        return options
116
117    def get_items(self, query):
118        """
119        Use the VK API
120
121        :param query:
122        :return:
123        """
124        if self.parameters.get("username") is None or self.parameters.get("password") is None:
125            self.dataset.update_status(
126                "VK query failed or was interrupted; please create new query in order to provide username and password again.",
127                is_final=True)
128            return []
129
130        self.dataset.update_status(f"Logging in to VK")
131        try:
132            vk_session = self.login(self.parameters.get("username"), self.parameters.get("password"))
133        except vk_api.exceptions.AuthError as e:
134            self.log.warning(f"VK Auth Issues: {e}")
135            self.dataset.update_status(f"VK unable to authorize user: {e}", is_final=True)
136            return []
137
138        query_type = self.parameters.get("query_type")
139        query = self.parameters.get("query")
140        include_comments = self.parameters.get("include_comments", False)
141
142        if query_type == "newsfeed":
143            query_parameters = {"query": query,
144                                "max_amount": self.parameters.get("amount")}
145
146            # Add start and end dates if provided
147            if self.parameters.get("min_date"):
148                query_parameters['start_time'] = self.parameters.get("min_date")
149            if self.parameters.get("max_date"):
150                query_parameters['end_time'] = self.parameters.get("max_date")
151
152            vk_helper = vk_session.get_api()
153
154            # Collect Newsfeed results
155            num_results = 0
156            self.dataset.update_status(f"Submitting query...")
157            for i, result_batch in enumerate(self.search_newsfeed(vk_helper, **query_parameters)):
158                if self.interrupted:
159                    raise ProcessorInterruptedException("Interrupted while fetching newsfeed data from the VK API")
160
161                self.dataset.update_status(f"Processing results batch {i+1}")
162                for result in result_batch:
163                    result.update({'4cat_item_type': 'post'})
164                    yield result
165                    num_results += 1
166
167                    if include_comments:
168                        for comment in self.collect_all_comments(vk_helper, owner_id=result.get("owner_id"), post_id=result.get("id")):
169                            comment.update({'4cat_item_type': 'comment'})
170                            yield comment
171
172                    self.dataset.update_status(f"Received {num_results} results of max {self.parameters.get('amount')} from the VK API")
173                    self.dataset.update_progress(num_results / self.parameters.get('amount'))
174
175    def login(self, username, password):
176        """
177        Login and authenticate user
178        """
179        vk_session = vk_api.VkApi(username,
180                                  password,
181                                  config_filename=Path(config.get("PATH_ROOT")).joinpath(config.get("PATH_SESSIONS"), username+"-vk_config.json"))
182        vk_session.auth()
183
184        return vk_session
185
186    def search_newsfeed(self, vk_helper, query, max_amount, num_collected=0, start_time=None, end_time=None, **kwargs):
187        """
188        Collects all newsfeed posts
189
190        :param Object vk_helper:    Authorized vk_api.VkApi
191        :param str query:           String representing the search query
192        :param int max_amount:      Max number of posts to collect
193        :param int num_collected:   Number of previously collected results
194        :param int start_time:      Timestamp for earliest post
195        :param int end_time:        Timestamp for latest post
196        :return generator:          Yields groups of posts
197        """
198        remaining = max_amount - num_collected
199        parameters = {
200            "q": query,
201            "extended": 1,
202            "count": remaining if remaining < 200 else 200,
203            "fields": self.expanded_profile_fields,
204        }
205        if start_time:
206            parameters["start_time"] = start_time
207        if end_time:
208            parameters["end_time"] = end_time
209
210        response = vk_helper.newsfeed.search(**parameters)
211        news_feed_results = response.get("items", [])
212        num_collected = num_collected + len(news_feed_results)
213
214        # Flesh out profiles and groups
215        author_profiles = self.expand_profile_fields({"profiles": response.get("profiles", []), "groups": response.get("groups", [])})
216        [result.update({"author_profile": author_profiles.get(result.get("from_id"), {})}) for result in news_feed_results]
217
218        yield news_feed_results
219
220        # Collect additional results
221        if response.get("next_from") and num_collected < max_amount:
222            parameters.update({"start_from": response.get("next_from")})
223            for additional_results in self.search_newsfeed(vk_helper, query, max_amount, num_collected=num_collected, **parameters):
224                yield additional_results
225
226    def collect_all_comments(self, vk_helper, owner_id, post_id):
227        """
228        Collects all comments and replies to a VK post
229
230        :param Object vk_helper:           Authorized vk_api.VkApi
231        :param int owner_id:            Owner ID provided by post/comment/etc
232        :param int post_id:             ID of post from which to collect comments
233        :return generator:              Yields comments and replies
234        """
235        # Collect top level comments from post
236        comments = self.get_comments(vk_helper, owner_id, post_id=post_id)
237
238        # Extract replies and collect more if needed
239        for comment in comments:
240            yield comment
241
242            reply_count = comment.get("thread", {}).get("count", 0)
243            replies = comment.get("thread", {}).get("items", [])
244            if reply_count > 10 and len(replies) == 10:
245                # Collect additional replies
246                replies += self.get_comments(vk_helper, owner_id, comment_id=comment.get("id"), last_collected_id=replies[-1].get("id"))[1:]
247
248            for reply in replies:
249                yield reply
250                if reply.get("thread"):
251                    self.log.warning("VK Datasource issue with replies: additional depth needs to be handled; contact 4CAT devs")
252                    # TODO: this will need modification if reply threads gain depth
253
254    def get_comments(self, vk_helper, owner_id, post_id=None, comment_id=None, last_collected_id=None, **kwargs):
255        """
256        Collect comments from either a post or another comment (i.e., replies to another comment). Must provide either
257        post_id or comment_id, but not both.
258
259        More information can be found here:
260        https://vk.com/dev/wall.getComments
261
262        :param Object vk_helper:       Authorized vk_api.VkApi
263        :param int owner_id:            Owner ID provided by post/comment/etc
264        :param int post_id:             ID of post from which to collect comments
265        :param int comment_id:          ID of comment from which to collect comments
266        :param int last_collected_id:   ID of the last comment to collected; used as start to continue collecting comments
267        :return list:                   List of comments
268        """
269        if self.interrupted:
270            raise ProcessorInterruptedException("Interrupted while fetching comments from the VK API")
271
272        if post_id is None and comment_id is None:
273            raise ProcessorException("Must provide either post_id or comment_id to collect comments from VK")
274
275        parameters = {
276            "owner_id": owner_id,
277            "need_likes": 1,
278            "preview_length": 0,
279            "extended": 1,
280            "count": 100,
281            "thread_items_count": 10,
282            "fields": self.expanded_profile_fields,
283        }
284        if post_id:
285            parameters.update({"post_id": post_id})
286        if comment_id:
287            parameters.update({"comment_id": comment_id})
288        if last_collected_id:
289            parameters.update({"start_comment_id": last_collected_id})
290
291        # Collect comments from VK
292        try:
293            response = vk_helper.wall.getComments(**parameters)
294        except vk_api.exceptions.ApiError as e:
295            self.dataset.log(f"Unable to collect comments for owner_id {owner_id} and {'post_id' if post_id is not None else 'comment_id'} {post_id if post_id is not None else comment_id}: {e}")
296            return []
297        comments = response.get("items", [])
298
299        # Flesh out profiles and groups
300        author_profiles = self.expand_profile_fields({"profiles": response.get("profiles", []), "groups": response.get("groups", [])})
301        [comment.update({"author_profile": author_profiles.get(comment.get("from_id"), {})}) for comment in comments]
302        # Also expand replies
303        [reply.update({"author_profile": author_profiles.get(reply.get("from_id"), {})}) for replies in [comment.get("thread", {}).get("items", []) for comment in comments if comment.get("thread")] for reply in replies]
304
305        # Check if there are potentially additional comments
306        if response.get("count") > 100 and len(comments) == 100:
307            # Update params with last collected comment
308            parameters.update({"start_comment_id": comments[-1].get("id")})
309            # Collect additional comments from VK and remove first comment (which is duplicate)
310            comments += self.get_comments(vk_helper=vk_helper, **parameters)[1:]
311
312        return comments
313
314    @ staticmethod
315    def expand_profile_fields(dict_of_profile_types):
316        """
317        Combine various VK profile and group author information for easy lookup. Add 4CAT_author_profile_type field to
318        differentiate source of data later.
319        """
320        author_types = {}
321        for profile_type, profiles in dict_of_profile_types.items():
322            for profile in profiles:
323                if "id" not in profile:
324                    raise ProcessorException("Profile missing id field; VK data format incorrect/changed")
325                elif profile.get("id") in author_types:
326                    raise ProcessorException("Profile id duplicated across profile types; unable to combine profiles")
327                profile.update({"4CAT_author_profile_type": profile_type})
328                author_types[profile.get("id")] = profile
329        return author_types
330
331    @staticmethod
332    def validate_query(query, request, user):
333        """
334        Validate input for a dataset query on the VK data source.
335
336        Will raise a QueryParametersException if invalid parameters are
337        encountered. Parameters are additionally sanitised.
338
339        :param dict query:  Query parameters, from client-side.
340        :param request:  Flask request
341        :param User user:  User object of user who has submitted the query
342        :return dict:  Safe query parameters
343        """
344        # Please provide something...
345        if not query.get("query", None):
346            raise QueryParametersException("Please provide a query.")
347
348        # the dates need to make sense as a range to search within
349        # but, on VK, you can also specify before *or* after only
350        after, before = query.get("daterange")
351        if before and after and before < after:
352            raise QueryParametersException("Date range must start before it ends")
353
354        # TODO: test username and password?
355
356        # if we made it this far, the query can be executed
357        params = {
358            "query":  query.get("query"),
359            "query_type": query.get("query_type"),
360            "amount": query.get("amount"),
361            "include_comments": query.get("include_comments"),
362            "min_date": after,
363            "max_date": before,
364            "username": query.get("username"),
365            "password": query.get("password"),
366        }
367
368        return params
369
370    @staticmethod
371    def map_item(item):
372        """
373        Map a nested VK object to a flat dictionary
374
375        :param item:  VK object as originally returned by the VK API
376        :return dict:  Dictionary in the format expected by 4CAT
377        """
378        vk_item_time = datetime.datetime.fromtimestamp(item.get('date'))
379
380        # Process attachments
381        photos = []
382        videos = []
383        audio = []
384        links = []
385        docs = []
386        for attachment in item.get("attachments", []):
387            attachment_type = attachment.get("type")
388            attachment = attachment.get(attachment_type)
389            if attachment_type == "photo":
390                if attachment.get("sizes"):
391                    photos.append(sorted(attachment.get("sizes"), key=lambda d: d['width'], reverse=True)[0].get('url'))
392                else:
393                    photos.append(str(attachment))
394            elif attachment_type == "video":
395                # TODO: can I get the actual URL? Does not seem like it...
396                videos.append(f"https://vk.com/video{attachment.get('owner_id')}_{attachment.get('id')}")
397            elif attachment_type == "audio":
398                # TODO: Seem unable to create the URL with provided information...
399                audio.append(f"{attachment.get('artist')} - {attachment.get('title')}")
400            elif attachment_type == "link":
401                links.append(attachment.get('url', str(attachment)))
402            elif attachment_type == "doc":
403                docs.append(attachment.get('url', str(attachment)))
404
405        # Use 4cat_item_type to populate different fields
406        tread_id = ""
407        in_reply_to_user = ""
408        in_reply_to_comment_id = ""
409        if item.get("4cat_item_type") == "post":
410            tread_id = item.get("id")
411        elif item.get("4cat_item_type") == "comment":
412            tread_id = item.get("post_id")
413            in_reply_to_user = item.get("reply_to_user")
414            in_reply_to_comment_id = item.get("reply_to_comment")
415
416        author_profile = item.get("author_profile", {})
417        profile_source = "user" if author_profile.get("4CAT_author_profile_type") == "profile" else "community" if author_profile.get("4CAT_author_profile_type") == "group" else "N/A"
418        # Use source of author profile if "type" not present (e.g., in users profiles do not seem to have type)
419        author_type = author_profile.get("type", profile_source)
420
421        return MappedItem({
422            "id": item.get("id"),
423            "thread_id": tread_id,
424            "timestamp": vk_item_time.strftime("%Y-%m-%d %H:%M:%S"),
425            "unix_timestamp": int(vk_item_time.timestamp()),
426            "link": f"https://vk.com/wall{item.get('owner_id')}_{item.get('id')}",
427            "item_type": item.get("4cat_item_type"),
428            "body": item.get("text"),
429            "author_id": item.get("from_id"),
430            "author_type": author_type,
431            "author_screen_name": author_profile.get("screen_name"),
432            "author_name": author_profile.get("name", " ".join([author_profile.get("first_name", ""), author_profile.get("last_name", "")])),
433            "author_sex": "F" if author_profile.get("sex") == 1 else "M" if author_profile.get("sex") == 2 else "Not Specified" if author_profile.get("sex") == 0 else author_profile.get("sex", "N/A"),
434            "author_city": author_profile.get("city", {}).get("title", ""),
435            "author_country": author_profile.get("country", {}).get("title", ""),
436            "author_photo": author_profile.get("photo_200",
437                                               author_profile.get("photo_100", author_profile.get("photo_50", ""))),
438            "author_is_admin": True if author_profile.get("is_admin") == 1 else False if author_profile.get("is_admin") == 0 else author_profile.get("is_admin", "N/A"),
439            "author_is_advertiser": True if author_profile.get("is_advertiser") == 1 else False if author_profile.get(
440                "is_advertiser") == 0 else author_profile.get("is_advertiser", "N/A"),
441            "author_deactivated": author_profile.get("is_deactivated", False),
442            "author_privacy_is_closed": 'closed' if author_profile.get("is_closed") == 1 else 'open' if author_profile.get("is_closed") == 0 else 'private' if author_profile.get("is_closed") == 2 else author_profile.get("is_closed", "N/A"),
443            "author_followers": author_profile.get("followers_count", author_profile.get("members_count", "N/A")),
444            "in_reply_to_user": in_reply_to_user,
445            "in_reply_to_comment_id": in_reply_to_comment_id,
446            "source": item.get("post_source", {}).get("type"),
447            "views": item.get("views", {}).get("count"),
448            "likes": item.get("likes", {}).get("count"),
449            "post_comments": item.get("comments", {}).get("count"),
450            "edited": datetime.datetime.fromtimestamp(item.get("edited")).strftime("%Y-%m-%d %H:%M:%S") if item.get("edited", False) else False,
451            "photos": ", ".join(photos),
452            "videos": ", ".join(videos),
453            "audio": ", ".join(audio),
454            "links": ", ".join(links),
455            "docs": ", ".join(docs),
456            "subject": "",
457        })

Get posts via the VK API

type = 'vk-search'
title = 'VK'
extension = 'ndjson'
is_local = False
is_static = False
previous_request = 0
import_issues = True
references = ['[VK API documentation](https://vk.com/dev/first_guide)', '[Python API wrapper](https://github.com/python273/vk_api)']
expanded_profile_fields = 'id,screen_name,first_name,last_name,name,deactivated,is_closed,is_admin,sex,city,country,photo_200,photo_100,photo_50,followers_count,members_count'
@classmethod
def get_options(cls, parent_dataset=None, user=None):
 37    @classmethod
 38    def get_options(cls, parent_dataset=None, user=None):
 39        """
 40        Get VK data source options
 41
 42        :param parent_dataset:  Should always be None
 43        :param user:  User to provide options for
 44        :return dict:  Data source options
 45        """
 46
 47        intro_text = ("This data source uses VK's [API](https://vk.com/dev/first_guide) and a python "
 48                      "[wrapper](https://github.com/python273/vk_api) to request information from VK using your "
 49                      "username and password.")
 50
 51        options = {
 52            "intro-1": {
 53                "type": UserInput.OPTION_INFO,
 54                "help": intro_text
 55            },
 56            "query_type": {
 57                "type": UserInput.OPTION_CHOICE,
 58                "help": "Query Type",
 59                "options": {
 60                    "newsfeed": "News Feed search",
 61                },
 62                "default": "newsfeed"
 63            },
 64            "intro-2": {
 65                "type": UserInput.OPTION_INFO,
 66                "help": "Your username and password will be deleted after your query is complete."
 67            },
 68            "username": {
 69                "type": UserInput.OPTION_TEXT,
 70                "sensitive": True,
 71                "cache": True,
 72                "help": "VK Username"
 73            },
 74            "password": {
 75                "type": UserInput.OPTION_TEXT,
 76                "sensitive": True,
 77                "cache": True,
 78                "help": "VK Password"
 79            },
 80            "intro-3": {
 81                "type": UserInput.OPTION_INFO,
 82                "help": "Enter the text to search for below."
 83            },
 84            "query": {
 85                "type": UserInput.OPTION_TEXT_LARGE,
 86                "help": "Query"
 87            },
 88            "amount": {
 89                "type": UserInput.OPTION_TEXT,
 90                "help": "Max items to retrieve",
 91                "min": 0,
 92                "max": 1000,
 93                "default": 100
 94            },
 95            "include_comments": {
 96                "type": UserInput.OPTION_TOGGLE,
 97                "help": "Include post comments",
 98                "default": False,
 99                "tooltip": ""
100            },
101            "divider-2": {
102                "type": UserInput.OPTION_DIVIDER
103            },
104            "daterange-info": {
105                "type": UserInput.OPTION_INFO,
106                "help": "VK daterange defaults vary by type of query. For the News Feed, posts are returned starting "
107                        "with the most recent and working backwards."
108            },
109            "daterange": {
110                "type": UserInput.OPTION_DATERANGE,
111                "help": "Date range"
112            },
113        }
114
115        return options

Get VK data source options

Parameters
  • parent_dataset: Should always be None
  • user: User to provide options for
Returns

Data source options

def get_items(self, query):
117    def get_items(self, query):
118        """
119        Use the VK API
120
121        :param query:
122        :return:
123        """
124        if self.parameters.get("username") is None or self.parameters.get("password") is None:
125            self.dataset.update_status(
126                "VK query failed or was interrupted; please create new query in order to provide username and password again.",
127                is_final=True)
128            return []
129
130        self.dataset.update_status(f"Logging in to VK")
131        try:
132            vk_session = self.login(self.parameters.get("username"), self.parameters.get("password"))
133        except vk_api.exceptions.AuthError as e:
134            self.log.warning(f"VK Auth Issues: {e}")
135            self.dataset.update_status(f"VK unable to authorize user: {e}", is_final=True)
136            return []
137
138        query_type = self.parameters.get("query_type")
139        query = self.parameters.get("query")
140        include_comments = self.parameters.get("include_comments", False)
141
142        if query_type == "newsfeed":
143            query_parameters = {"query": query,
144                                "max_amount": self.parameters.get("amount")}
145
146            # Add start and end dates if provided
147            if self.parameters.get("min_date"):
148                query_parameters['start_time'] = self.parameters.get("min_date")
149            if self.parameters.get("max_date"):
150                query_parameters['end_time'] = self.parameters.get("max_date")
151
152            vk_helper = vk_session.get_api()
153
154            # Collect Newsfeed results
155            num_results = 0
156            self.dataset.update_status(f"Submitting query...")
157            for i, result_batch in enumerate(self.search_newsfeed(vk_helper, **query_parameters)):
158                if self.interrupted:
159                    raise ProcessorInterruptedException("Interrupted while fetching newsfeed data from the VK API")
160
161                self.dataset.update_status(f"Processing results batch {i+1}")
162                for result in result_batch:
163                    result.update({'4cat_item_type': 'post'})
164                    yield result
165                    num_results += 1
166
167                    if include_comments:
168                        for comment in self.collect_all_comments(vk_helper, owner_id=result.get("owner_id"), post_id=result.get("id")):
169                            comment.update({'4cat_item_type': 'comment'})
170                            yield comment
171
172                    self.dataset.update_status(f"Received {num_results} results of max {self.parameters.get('amount')} from the VK API")
173                    self.dataset.update_progress(num_results / self.parameters.get('amount'))

Use the VK API

Parameters
  • query:
Returns
def login(self, username, password):
175    def login(self, username, password):
176        """
177        Login and authenticate user
178        """
179        vk_session = vk_api.VkApi(username,
180                                  password,
181                                  config_filename=Path(config.get("PATH_ROOT")).joinpath(config.get("PATH_SESSIONS"), username+"-vk_config.json"))
182        vk_session.auth()
183
184        return vk_session

Login and authenticate user

def search_newsfeed( self, vk_helper, query, max_amount, num_collected=0, start_time=None, end_time=None, **kwargs):
186    def search_newsfeed(self, vk_helper, query, max_amount, num_collected=0, start_time=None, end_time=None, **kwargs):
187        """
188        Collects all newsfeed posts
189
190        :param Object vk_helper:    Authorized vk_api.VkApi
191        :param str query:           String representing the search query
192        :param int max_amount:      Max number of posts to collect
193        :param int num_collected:   Number of previously collected results
194        :param int start_time:      Timestamp for earliest post
195        :param int end_time:        Timestamp for latest post
196        :return generator:          Yields groups of posts
197        """
198        remaining = max_amount - num_collected
199        parameters = {
200            "q": query,
201            "extended": 1,
202            "count": remaining if remaining < 200 else 200,
203            "fields": self.expanded_profile_fields,
204        }
205        if start_time:
206            parameters["start_time"] = start_time
207        if end_time:
208            parameters["end_time"] = end_time
209
210        response = vk_helper.newsfeed.search(**parameters)
211        news_feed_results = response.get("items", [])
212        num_collected = num_collected + len(news_feed_results)
213
214        # Flesh out profiles and groups
215        author_profiles = self.expand_profile_fields({"profiles": response.get("profiles", []), "groups": response.get("groups", [])})
216        [result.update({"author_profile": author_profiles.get(result.get("from_id"), {})}) for result in news_feed_results]
217
218        yield news_feed_results
219
220        # Collect additional results
221        if response.get("next_from") and num_collected < max_amount:
222            parameters.update({"start_from": response.get("next_from")})
223            for additional_results in self.search_newsfeed(vk_helper, query, max_amount, num_collected=num_collected, **parameters):
224                yield additional_results

Collects all newsfeed posts

Parameters
  • Object vk_helper: Authorized vk_api.VkApi
  • str query: String representing the search query
  • int max_amount: Max number of posts to collect
  • int num_collected: Number of previously collected results
  • int start_time: Timestamp for earliest post
  • int end_time: Timestamp for latest post
Returns
      Yields groups of posts
def collect_all_comments(self, vk_helper, owner_id, post_id):
226    def collect_all_comments(self, vk_helper, owner_id, post_id):
227        """
228        Collects all comments and replies to a VK post
229
230        :param Object vk_helper:           Authorized vk_api.VkApi
231        :param int owner_id:            Owner ID provided by post/comment/etc
232        :param int post_id:             ID of post from which to collect comments
233        :return generator:              Yields comments and replies
234        """
235        # Collect top level comments from post
236        comments = self.get_comments(vk_helper, owner_id, post_id=post_id)
237
238        # Extract replies and collect more if needed
239        for comment in comments:
240            yield comment
241
242            reply_count = comment.get("thread", {}).get("count", 0)
243            replies = comment.get("thread", {}).get("items", [])
244            if reply_count > 10 and len(replies) == 10:
245                # Collect additional replies
246                replies += self.get_comments(vk_helper, owner_id, comment_id=comment.get("id"), last_collected_id=replies[-1].get("id"))[1:]
247
248            for reply in replies:
249                yield reply
250                if reply.get("thread"):
251                    self.log.warning("VK Datasource issue with replies: additional depth needs to be handled; contact 4CAT devs")
252                    # TODO: this will need modification if reply threads gain depth

Collects all comments and replies to a VK post

Parameters
  • Object vk_helper: Authorized vk_api.VkApi
  • int owner_id: Owner ID provided by post/comment/etc
  • int post_id: ID of post from which to collect comments
Returns
          Yields comments and replies
def get_comments( self, vk_helper, owner_id, post_id=None, comment_id=None, last_collected_id=None, **kwargs):
254    def get_comments(self, vk_helper, owner_id, post_id=None, comment_id=None, last_collected_id=None, **kwargs):
255        """
256        Collect comments from either a post or another comment (i.e., replies to another comment). Must provide either
257        post_id or comment_id, but not both.
258
259        More information can be found here:
260        https://vk.com/dev/wall.getComments
261
262        :param Object vk_helper:       Authorized vk_api.VkApi
263        :param int owner_id:            Owner ID provided by post/comment/etc
264        :param int post_id:             ID of post from which to collect comments
265        :param int comment_id:          ID of comment from which to collect comments
266        :param int last_collected_id:   ID of the last comment to collected; used as start to continue collecting comments
267        :return list:                   List of comments
268        """
269        if self.interrupted:
270            raise ProcessorInterruptedException("Interrupted while fetching comments from the VK API")
271
272        if post_id is None and comment_id is None:
273            raise ProcessorException("Must provide either post_id or comment_id to collect comments from VK")
274
275        parameters = {
276            "owner_id": owner_id,
277            "need_likes": 1,
278            "preview_length": 0,
279            "extended": 1,
280            "count": 100,
281            "thread_items_count": 10,
282            "fields": self.expanded_profile_fields,
283        }
284        if post_id:
285            parameters.update({"post_id": post_id})
286        if comment_id:
287            parameters.update({"comment_id": comment_id})
288        if last_collected_id:
289            parameters.update({"start_comment_id": last_collected_id})
290
291        # Collect comments from VK
292        try:
293            response = vk_helper.wall.getComments(**parameters)
294        except vk_api.exceptions.ApiError as e:
295            self.dataset.log(f"Unable to collect comments for owner_id {owner_id} and {'post_id' if post_id is not None else 'comment_id'} {post_id if post_id is not None else comment_id}: {e}")
296            return []
297        comments = response.get("items", [])
298
299        # Flesh out profiles and groups
300        author_profiles = self.expand_profile_fields({"profiles": response.get("profiles", []), "groups": response.get("groups", [])})
301        [comment.update({"author_profile": author_profiles.get(comment.get("from_id"), {})}) for comment in comments]
302        # Also expand replies
303        [reply.update({"author_profile": author_profiles.get(reply.get("from_id"), {})}) for replies in [comment.get("thread", {}).get("items", []) for comment in comments if comment.get("thread")] for reply in replies]
304
305        # Check if there are potentially additional comments
306        if response.get("count") > 100 and len(comments) == 100:
307            # Update params with last collected comment
308            parameters.update({"start_comment_id": comments[-1].get("id")})
309            # Collect additional comments from VK and remove first comment (which is duplicate)
310            comments += self.get_comments(vk_helper=vk_helper, **parameters)[1:]
311
312        return comments

Collect comments from either a post or another comment (i.e., replies to another comment). Must provide either post_id or comment_id, but not both.

More information can be found here: https://vk.com/dev/wall.getComments

Parameters
  • Object vk_helper: Authorized vk_api.VkApi
  • int owner_id: Owner ID provided by post/comment/etc
  • int post_id: ID of post from which to collect comments
  • int comment_id: ID of comment from which to collect comments
  • int last_collected_id: ID of the last comment collected; used as the starting point to continue collecting comments
Returns
               List of comments
@staticmethod
def expand_profile_fields(dict_of_profile_types):
314    @ staticmethod
315    def expand_profile_fields(dict_of_profile_types):
316        """
317        Combine various VK profile and group author information for easy lookup. Add 4CAT_author_profile_type field to
318        differentiate source of data later.
319        """
320        author_types = {}
321        for profile_type, profiles in dict_of_profile_types.items():
322            for profile in profiles:
323                if "id" not in profile:
324                    raise ProcessorException("Profile missing id field; VK data format incorrect/changed")
325                elif profile.get("id") in author_types:
326                    raise ProcessorException("Profile id duplicated across profile types; unable to combine profiles")
327                profile.update({"4CAT_author_profile_type": profile_type})
328                author_types[profile.get("id")] = profile
329        return author_types

Combine various VK profile and group author information for easy lookup. Add 4CAT_author_profile_type field to differentiate source of data later.

@staticmethod
def validate_query(query, request, user):
331    @staticmethod
332    def validate_query(query, request, user):
333        """
334        Validate input for a dataset query on the VK data source.
335
336        Will raise a QueryParametersException if invalid parameters are
337        encountered. Parameters are additionally sanitised.
338
339        :param dict query:  Query parameters, from client-side.
340        :param request:  Flask request
341        :param User user:  User object of user who has submitted the query
342        :return dict:  Safe query parameters
343        """
344        # Please provide something...
345        if not query.get("query", None):
346            raise QueryParametersException("Please provide a query.")
347
348        # the dates need to make sense as a range to search within
349        # but, on VK, you can also specify before *or* after only
350        after, before = query.get("daterange")
351        if before and after and before < after:
352            raise QueryParametersException("Date range must start before it ends")
353
354        # TODO: test username and password?
355
356        # if we made it this far, the query can be executed
357        params = {
358            "query":  query.get("query"),
359            "query_type": query.get("query_type"),
360            "amount": query.get("amount"),
361            "include_comments": query.get("include_comments"),
362            "min_date": after,
363            "max_date": before,
364            "username": query.get("username"),
365            "password": query.get("password"),
366        }
367
368        return params

Validate input for a dataset query on the VK data source.

Will raise a QueryParametersException if invalid parameters are encountered. Parameters are additionally sanitised.

Parameters
  • dict query: Query parameters, from client-side.
  • request: Flask request
  • User user: User object of user who has submitted the query
Returns

Safe query parameters

@staticmethod
def map_item(item):
370    @staticmethod
371    def map_item(item):
372        """
373        Map a nested VK object to a flat dictionary
374
375        :param item:  VK object as originally returned by the VK API
376        :return dict:  Dictionary in the format expected by 4CAT
377        """
378        vk_item_time = datetime.datetime.fromtimestamp(item.get('date'))
379
380        # Process attachments
381        photos = []
382        videos = []
383        audio = []
384        links = []
385        docs = []
386        for attachment in item.get("attachments", []):
387            attachment_type = attachment.get("type")
388            attachment = attachment.get(attachment_type)
389            if attachment_type == "photo":
390                if attachment.get("sizes"):
391                    photos.append(sorted(attachment.get("sizes"), key=lambda d: d['width'], reverse=True)[0].get('url'))
392                else:
393                    photos.append(str(attachment))
394            elif attachment_type == "video":
395                # TODO: can I get the actual URL? Does not seem like it...
396                videos.append(f"https://vk.com/video{attachment.get('owner_id')}_{attachment.get('id')}")
397            elif attachment_type == "audio":
398                # TODO: Seem unable to create the URL with provided information...
399                audio.append(f"{attachment.get('artist')} - {attachment.get('title')}")
400            elif attachment_type == "link":
401                links.append(attachment.get('url', str(attachment)))
402            elif attachment_type == "doc":
403                docs.append(attachment.get('url', str(attachment)))
404
405        # Use 4cat_item_type to populate different fields
406        tread_id = ""
407        in_reply_to_user = ""
408        in_reply_to_comment_id = ""
409        if item.get("4cat_item_type") == "post":
410            tread_id = item.get("id")
411        elif item.get("4cat_item_type") == "comment":
412            tread_id = item.get("post_id")
413            in_reply_to_user = item.get("reply_to_user")
414            in_reply_to_comment_id = item.get("reply_to_comment")
415
416        author_profile = item.get("author_profile", {})
417        profile_source = "user" if author_profile.get("4CAT_author_profile_type") == "profile" else "community" if author_profile.get("4CAT_author_profile_type") == "group" else "N/A"
418        # Use source of author profile if "type" not present (e.g., in users profiles do not seem to have type)
419        author_type = author_profile.get("type", profile_source)
420
421        return MappedItem({
422            "id": item.get("id"),
423            "thread_id": tread_id,
424            "timestamp": vk_item_time.strftime("%Y-%m-%d %H:%M:%S"),
425            "unix_timestamp": int(vk_item_time.timestamp()),
426            "link": f"https://vk.com/wall{item.get('owner_id')}_{item.get('id')}",
427            "item_type": item.get("4cat_item_type"),
428            "body": item.get("text"),
429            "author_id": item.get("from_id"),
430            "author_type": author_type,
431            "author_screen_name": author_profile.get("screen_name"),
432            "author_name": author_profile.get("name", " ".join([author_profile.get("first_name", ""), author_profile.get("last_name", "")])),
433            "author_sex": "F" if author_profile.get("sex") == 1 else "M" if author_profile.get("sex") == 2 else "Not Specified" if author_profile.get("sex") == 0 else author_profile.get("sex", "N/A"),
434            "author_city": author_profile.get("city", {}).get("title", ""),
435            "author_country": author_profile.get("country", {}).get("title", ""),
436            "author_photo": author_profile.get("photo_200",
437                                               author_profile.get("photo_100", author_profile.get("photo_50", ""))),
438            "author_is_admin": True if author_profile.get("is_admin") == 1 else False if author_profile.get("is_admin") == 0 else author_profile.get("is_admin", "N/A"),
439            "author_is_advertiser": True if author_profile.get("is_advertiser") == 1 else False if author_profile.get(
440                "is_advertiser") == 0 else author_profile.get("is_advertiser", "N/A"),
441            "author_deactivated": author_profile.get("is_deactivated", False),
442            "author_privacy_is_closed": 'closed' if author_profile.get("is_closed") == 1 else 'open' if author_profile.get("is_closed") == 0 else 'private' if author_profile.get("is_closed") == 2 else author_profile.get("is_closed", "N/A"),
443            "author_followers": author_profile.get("followers_count", author_profile.get("members_count", "N/A")),
444            "in_reply_to_user": in_reply_to_user,
445            "in_reply_to_comment_id": in_reply_to_comment_id,
446            "source": item.get("post_source", {}).get("type"),
447            "views": item.get("views", {}).get("count"),
448            "likes": item.get("likes", {}).get("count"),
449            "post_comments": item.get("comments", {}).get("count"),
450            "edited": datetime.datetime.fromtimestamp(item.get("edited")).strftime("%Y-%m-%d %H:%M:%S") if item.get("edited", False) else False,
451            "photos": ", ".join(photos),
452            "videos": ", ".join(videos),
453            "audio": ", ".join(audio),
454            "links": ", ".join(links),
455            "docs": ", ".join(docs),
456            "subject": "",
457        })

Map a nested VK object to a flat dictionary

Parameters
  • item: VK object as originally returned by the VK API
Returns

Dictionary in the format expected by 4CAT