Edit on GitHub

datasources.vk.search_vk

VK keyword search

  1"""
  2VK keyword search
  3"""
  4import datetime
  5
  6import vk_api
  7
  8from backend.lib.search import Search
  9from common.lib.exceptions import QueryParametersException, ProcessorInterruptedException, ProcessorException
 10from common.lib.helpers import UserInput
 11from common.lib.item_mapping import MappedItem
 12
 13
class SearchVK(Search):
    """
    Get posts via the VK API

    Logs in to VK with the username and password supplied in the query form,
    runs a News Feed keyword search and, when requested, also collects the
    comments and replies on every returned post.
    """
    type = "vk-search"  # job ID
    title = "VK"
    extension = "ndjson"
    is_local = False    # Whether this datasource is locally scraped
    is_static = False   # Whether this datasource is still updated

    # NOTE(review): the two attributes below are not referenced anywhere in
    # this class; presumably they are read by the Search base class or the
    # 4CAT front-end - TODO confirm
    previous_request = 0
    import_issues = True

    # Markdown links to background documentation for this data source
    references = [
        "[VK API documentation](https://vk.com/dev/first_guide)",
        "[Python API wrapper](https://github.com/python273/vk_api)"
    ]
    
    # Comma-separated profile fields, sent as the `fields` parameter of the
    # newsfeed search and comment requests made by this class
    expanded_profile_fields = "id,screen_name,first_name,last_name,name,deactivated,is_closed,is_admin,sex,city,country,photo_200,photo_100,photo_50,followers_count,members_count"  # https://vk.com/dev/objects/user & https://vk.com/dev/objects/group
 33
 34    @classmethod
 35    def get_options(cls, parent_dataset=None, config=None):
 36        """
 37        Get VK data source options
 38
 39        :param config:
 40        :param parent_dataset:  Should always be None
 41        :return dict:  Data source options
 42        """
 43
 44        intro_text = ("This data source uses VK's [API](https://vk.com/dev/first_guide) and a python "
 45                      "[wrapper](https://github.com/python273/vk_api) to request information from VK using your "
 46                      "username and password.")
 47
 48        options = {
 49            "intro-1": {
 50                "type": UserInput.OPTION_INFO,
 51                "help": intro_text
 52            },
 53            "query_type": {
 54                "type": UserInput.OPTION_CHOICE,
 55                "help": "Query Type",
 56                "options": {
 57                    "newsfeed": "News Feed search",
 58                },
 59                "default": "newsfeed"
 60            },
 61            "intro-2": {
 62                "type": UserInput.OPTION_INFO,
 63                "help": "Your username and password will be deleted after your query is complete."
 64            },
 65            "username": {
 66                "type": UserInput.OPTION_TEXT,
 67                "sensitive": True,
 68                "cache": True,
 69                "help": "VK Username"
 70            },
 71            "password": {
 72                "type": UserInput.OPTION_TEXT,
 73                "sensitive": True,
 74                "cache": True,
 75                "help": "VK Password"
 76            },
 77            "intro-3": {
 78                "type": UserInput.OPTION_INFO,
 79                "help": "Enter the text to search for below."
 80            },
 81            "query": {
 82                "type": UserInput.OPTION_TEXT_LARGE,
 83                "help": "Query"
 84            },
 85            "amount": {
 86                "type": UserInput.OPTION_TEXT,
 87                "help": "Max items to retrieve",
 88                "min": 0,
 89                "max": 1000,
 90                "default": 100
 91            },
 92            "include_comments": {
 93                "type": UserInput.OPTION_TOGGLE,
 94                "help": "Include post comments",
 95                "default": False,
 96                "tooltip": ""
 97            },
 98            "divider-2": {
 99                "type": UserInput.OPTION_DIVIDER
100            },
101            "daterange-info": {
102                "type": UserInput.OPTION_INFO,
103                "help": "VK daterange defaults vary by type of query. For the News Feed, posts are returned starting "
104                        "with the most recent and working backwards."
105            },
106            "daterange": {
107                "type": UserInput.OPTION_DATERANGE,
108                "help": "Date range"
109            },
110        }
111
112        return options
113
114    def get_items(self, query):
115        """
116        Use the VK API
117
118        :param query:
119        :return:
120        """
121        if self.parameters.get("username") is None or self.parameters.get("password") is None:
122            self.dataset.update_status(
123                "VK query failed or was interrupted; please create new query in order to provide username and password again.",
124                is_final=True)
125            return []
126
127        self.dataset.update_status("Logging in to VK")
128        try:
129            vk_session = self.login(self.parameters.get("username"), self.parameters.get("password"))
130        except vk_api.exceptions.AuthError as e:
131            self.log.warning(f"VK Auth Issues: {e}")
132            self.dataset.update_status(f"VK unable to authorize user: {e}", is_final=True)
133            return []
134
135        query_type = self.parameters.get("query_type")
136        query = self.parameters.get("query")
137        include_comments = self.parameters.get("include_comments", False)
138
139        if query_type == "newsfeed":
140            query_parameters = {"query": query,
141                                "max_amount": self.parameters.get("amount")}
142
143            # Add start and end dates if provided
144            if self.parameters.get("min_date"):
145                query_parameters['start_time'] = self.parameters.get("min_date")
146            if self.parameters.get("max_date"):
147                query_parameters['end_time'] = self.parameters.get("max_date")
148
149            vk_helper = vk_session.get_api()
150
151            # Collect Newsfeed results
152            num_results = 0
153            self.dataset.update_status("Submitting query...")
154            for i, result_batch in enumerate(self.search_newsfeed(vk_helper, **query_parameters)):
155                if self.interrupted:
156                    raise ProcessorInterruptedException("Interrupted while fetching newsfeed data from the VK API")
157
158                self.dataset.update_status(f"Processing results batch {i+1}")
159                for result in result_batch:
160                    result.update({'4cat_item_type': 'post'})
161                    yield result
162                    num_results += 1
163
164                    if include_comments:
165                        for comment in self.collect_all_comments(vk_helper, owner_id=result.get("owner_id"), post_id=result.get("id")):
166                            comment.update({'4cat_item_type': 'comment'})
167                            yield comment
168
169                    self.dataset.update_status(f"Received {num_results} results of max {self.parameters.get('amount')} from the VK API")
170                    self.dataset.update_progress(num_results / self.parameters.get('amount'))
171
172    def login(self, username, password):
173        """
174        Login and authenticate user
175        """
176        vk_session = vk_api.VkApi(username,
177                                  password,
178                                  config_filename=self.config.get("PATH_SESSIONS").joinpath(username+"-vk_config.json"))
179        vk_session.auth()
180
181        return vk_session
182
183    def search_newsfeed(self, vk_helper, query, max_amount, num_collected=0, start_time=None, end_time=None, **kwargs):
184        """
185        Collects all newsfeed posts
186
187        :param Object vk_helper:    Authorized vk_api.VkApi
188        :param str query:           String representing the search query
189        :param int max_amount:      Max number of posts to collect
190        :param int num_collected:   Number of previously collected results
191        :param int start_time:      Timestamp for earliest post
192        :param int end_time:        Timestamp for latest post
193        :return generator:          Yields groups of posts
194        """
195        remaining = max_amount - num_collected
196        parameters = {
197            "q": query,
198            "extended": 1,
199            "count": remaining if remaining < 200 else 200,
200            "fields": self.expanded_profile_fields,
201        }
202        if start_time:
203            parameters["start_time"] = start_time
204        if end_time:
205            parameters["end_time"] = end_time
206
207        response = vk_helper.newsfeed.search(**parameters)
208        news_feed_results = response.get("items", [])
209        num_collected = num_collected + len(news_feed_results)
210
211        # Flesh out profiles and groups
212        author_profiles = self.expand_profile_fields({"profiles": response.get("profiles", []), "groups": response.get("groups", [])})
213        [result.update({"author_profile": author_profiles.get(result.get("from_id"), {})}) for result in news_feed_results]
214
215        yield news_feed_results
216
217        # Collect additional results
218        if response.get("next_from") and num_collected < max_amount:
219            parameters.update({"start_from": response.get("next_from")})
220            for additional_results in self.search_newsfeed(vk_helper, query, max_amount, num_collected=num_collected, **parameters):
221                yield additional_results
222
223    def collect_all_comments(self, vk_helper, owner_id, post_id):
224        """
225        Collects all comments and replies to a VK post
226
227        :param Object vk_helper:           Authorized vk_api.VkApi
228        :param int owner_id:            Owner ID provided by post/comment/etc
229        :param int post_id:             ID of post from which to collect comments
230        :return generator:              Yields comments and replies
231        """
232        # Collect top level comments from post
233        comments = self.get_comments(vk_helper, owner_id, post_id=post_id)
234
235        # Extract replies and collect more if needed
236        for comment in comments:
237            yield comment
238
239            reply_count = comment.get("thread", {}).get("count", 0)
240            replies = comment.get("thread", {}).get("items", [])
241            if reply_count > 10 and len(replies) == 10:
242                # Collect additional replies
243                replies += self.get_comments(vk_helper, owner_id, comment_id=comment.get("id"), last_collected_id=replies[-1].get("id"))[1:]
244
245            for reply in replies:
246                yield reply
247                if reply.get("thread"):
248                    self.log.warning("VK Datasource issue with replies: additional depth needs to be handled; contact 4CAT devs")
249                    # TODO: this will need modification if reply threads gain depth
250
251    def get_comments(self, vk_helper, owner_id, post_id=None, comment_id=None, last_collected_id=None, **kwargs):
252        """
253        Collect comments from either a post or another comment (i.e., replies to another comment). Must provide either
254        post_id or comment_id, but not both.
255
256        More information can be found here:
257        https://vk.com/dev/wall.getComments
258
259        :param Object vk_helper:       Authorized vk_api.VkApi
260        :param int owner_id:            Owner ID provided by post/comment/etc
261        :param int post_id:             ID of post from which to collect comments
262        :param int comment_id:          ID of comment from which to collect comments
263        :param int last_collected_id:   ID of the last comment to collected; used as start to continue collecting comments
264        :return list:                   List of comments
265        """
266        if self.interrupted:
267            raise ProcessorInterruptedException("Interrupted while fetching comments from the VK API")
268
269        if post_id is None and comment_id is None:
270            raise ProcessorException("Must provide either post_id or comment_id to collect comments from VK")
271
272        parameters = {
273            "owner_id": owner_id,
274            "need_likes": 1,
275            "preview_length": 0,
276            "extended": 1,
277            "count": 100,
278            "thread_items_count": 10,
279            "fields": self.expanded_profile_fields,
280        }
281        if post_id:
282            parameters.update({"post_id": post_id})
283        if comment_id:
284            parameters.update({"comment_id": comment_id})
285        if last_collected_id:
286            parameters.update({"start_comment_id": last_collected_id})
287
288        # Collect comments from VK
289        try:
290            response = vk_helper.wall.getComments(**parameters)
291        except vk_api.exceptions.ApiError as e:
292            self.dataset.log(f"Unable to collect comments for owner_id {owner_id} and {'post_id' if post_id is not None else 'comment_id'} {post_id if post_id is not None else comment_id}: {e}")
293            return []
294        comments = response.get("items", [])
295
296        # Flesh out profiles and groups
297        author_profiles = self.expand_profile_fields({"profiles": response.get("profiles", []), "groups": response.get("groups", [])})
298        [comment.update({"author_profile": author_profiles.get(comment.get("from_id"), {})}) for comment in comments]
299        # Also expand replies
300        [reply.update({"author_profile": author_profiles.get(reply.get("from_id"), {})}) for replies in [comment.get("thread", {}).get("items", []) for comment in comments if comment.get("thread")] for reply in replies]
301
302        # Check if there are potentially additional comments
303        if response.get("count") > 100 and len(comments) == 100:
304            # Update params with last collected comment
305            parameters.update({"start_comment_id": comments[-1].get("id")})
306            # Collect additional comments from VK and remove first comment (which is duplicate)
307            comments += self.get_comments(vk_helper=vk_helper, **parameters)[1:]
308
309        return comments
310
311    @ staticmethod
312    def expand_profile_fields(dict_of_profile_types):
313        """
314        Combine various VK profile and group author information for easy lookup. Add 4CAT_author_profile_type field to
315        differentiate source of data later.
316        """
317        author_types = {}
318        for profile_type, profiles in dict_of_profile_types.items():
319            for profile in profiles:
320                if "id" not in profile:
321                    raise ProcessorException("Profile missing id field; VK data format incorrect/changed")
322                elif profile.get("id") in author_types:
323                    raise ProcessorException("Profile id duplicated across profile types; unable to combine profiles")
324                profile.update({"4CAT_author_profile_type": profile_type})
325                author_types[profile.get("id")] = profile
326        return author_types
327
328    @staticmethod
329    def validate_query(query, request, config):
330        """
331        Validate input for a dataset query on the VK data source.
332
333        Will raise a QueryParametersException if invalid parameters are
334        encountered. Parameters are additionally sanitised.
335
336        :param dict query:  Query parameters, from client-side.
337        :param request:  Flask request
338        :param ConfigManager|None config:  Configuration reader (context-aware)
339        :return dict:  Safe query parameters
340        """
341        # Please provide something...
342        if not query.get("query", None):
343            raise QueryParametersException("Please provide a query.")
344
345        # the dates need to make sense as a range to search within
346        # but, on VK, you can also specify before *or* after only
347        after, before = query.get("daterange")
348        if before and after and before < after:
349            raise QueryParametersException("Date range must start before it ends")
350
351        # TODO: test username and password?
352
353        # if we made it this far, the query can be executed
354        params = {
355            "query":  query.get("query"),
356            "query_type": query.get("query_type"),
357            "amount": query.get("amount"),
358            "include_comments": query.get("include_comments"),
359            "min_date": after,
360            "max_date": before,
361            "username": query.get("username"),
362            "password": query.get("password"),
363        }
364
365        return params
366
367    @staticmethod
368    def map_item(item):
369        """
370        Map a nested VK object to a flat dictionary
371
372        :param item:  VK object as originally returned by the VK API
373        :return dict:  Dictionary in the format expected by 4CAT
374        """
375        vk_item_time = datetime.datetime.fromtimestamp(item.get('date'))
376
377        # Process attachments
378        photos = []
379        videos = []
380        audio = []
381        links = []
382        docs = []
383        for attachment in item.get("attachments", []):
384            attachment_type = attachment.get("type")
385            attachment = attachment.get(attachment_type)
386            if attachment_type == "photo":
387                if attachment.get("sizes"):
388                    photos.append(sorted(attachment.get("sizes"), key=lambda d: d['width'], reverse=True)[0].get('url'))
389                else:
390                    photos.append(str(attachment))
391            elif attachment_type == "video":
392                # TODO: can I get the actual URL? Does not seem like it...
393                videos.append(f"https://vk.com/video{attachment.get('owner_id')}_{attachment.get('id')}")
394            elif attachment_type == "audio":
395                # TODO: Seem unable to create the URL with provided information...
396                audio.append(f"{attachment.get('artist')} - {attachment.get('title')}")
397            elif attachment_type == "link":
398                links.append(attachment.get('url', str(attachment)))
399            elif attachment_type == "doc":
400                docs.append(attachment.get('url', str(attachment)))
401
402        # Use 4cat_item_type to populate different fields
403        tread_id = ""
404        in_reply_to_user = ""
405        in_reply_to_comment_id = ""
406        if item.get("4cat_item_type") == "post":
407            tread_id = item.get("id")
408        elif item.get("4cat_item_type") == "comment":
409            tread_id = item.get("post_id")
410            in_reply_to_user = item.get("reply_to_user")
411            in_reply_to_comment_id = item.get("reply_to_comment")
412
413        author_profile = item.get("author_profile", {})
414        profile_source = "user" if author_profile.get("4CAT_author_profile_type") == "profile" else "community" if author_profile.get("4CAT_author_profile_type") == "group" else "N/A"
415        # Use source of author profile if "type" not present (e.g., in users profiles do not seem to have type)
416        author_type = author_profile.get("type", profile_source)
417
418        return MappedItem({
419            "id": item.get("id"),
420            "thread_id": tread_id,
421            "timestamp": vk_item_time.strftime("%Y-%m-%d %H:%M:%S"),
422            "unix_timestamp": int(vk_item_time.timestamp()),
423            "link": f"https://vk.com/wall{item.get('owner_id')}_{item.get('id')}",
424            "item_type": item.get("4cat_item_type"),
425            "body": item.get("text"),
426            "author_id": item.get("from_id"),
427            "author_type": author_type,
428            "author_screen_name": author_profile.get("screen_name"),
429            "author_name": author_profile.get("name", " ".join([author_profile.get("first_name", ""), author_profile.get("last_name", "")])),
430            "author_sex": "F" if author_profile.get("sex") == 1 else "M" if author_profile.get("sex") == 2 else "Not Specified" if author_profile.get("sex") == 0 else author_profile.get("sex", "N/A"),
431            "author_city": author_profile.get("city", {}).get("title", ""),
432            "author_country": author_profile.get("country", {}).get("title", ""),
433            "author_photo": author_profile.get("photo_200",
434                                               author_profile.get("photo_100", author_profile.get("photo_50", ""))),
435            "author_is_admin": True if author_profile.get("is_admin") == 1 else False if author_profile.get("is_admin") == 0 else author_profile.get("is_admin", "N/A"),
436            "author_is_advertiser": True if author_profile.get("is_advertiser") == 1 else False if author_profile.get(
437                "is_advertiser") == 0 else author_profile.get("is_advertiser", "N/A"),
438            "author_deactivated": author_profile.get("is_deactivated", False),
439            "author_privacy_is_closed": 'closed' if author_profile.get("is_closed") == 1 else 'open' if author_profile.get("is_closed") == 0 else 'private' if author_profile.get("is_closed") == 2 else author_profile.get("is_closed", "N/A"),
440            "author_followers": author_profile.get("followers_count", author_profile.get("members_count", "N/A")),
441            "in_reply_to_user": in_reply_to_user,
442            "in_reply_to_comment_id": in_reply_to_comment_id,
443            "source": item.get("post_source", {}).get("type"),
444            "views": item.get("views", {}).get("count"),
445            "likes": item.get("likes", {}).get("count"),
446            "post_comments": item.get("comments", {}).get("count"),
447            "edited": datetime.datetime.fromtimestamp(item.get("edited")).strftime("%Y-%m-%d %H:%M:%S") if item.get("edited", False) else False,
448            "photos": ", ".join(photos),
449            "videos": ", ".join(videos),
450            "audio": ", ".join(audio),
451            "links": ", ".join(links),
452            "docs": ", ".join(docs),
453            "subject": "",
454        })
class SearchVK(backend.lib.search.Search):
class SearchVK(Search):
    """
    Get posts via the VK API

    Logs in to VK with the username and password supplied in the query form,
    runs a News Feed keyword search and, when requested, also collects the
    comments and replies on every returned post.
    """
    type = "vk-search"  # job ID
    title = "VK"
    extension = "ndjson"
    is_local = False    # Whether this datasource is locally scraped
    is_static = False   # Whether this datasource is still updated

    # NOTE(review): the two attributes below are not referenced anywhere in
    # this class; presumably they are read by the Search base class or the
    # 4CAT front-end - TODO confirm
    previous_request = 0
    import_issues = True

    # Markdown links to background documentation for this data source
    references = [
        "[VK API documentation](https://vk.com/dev/first_guide)",
        "[Python API wrapper](https://github.com/python273/vk_api)"
    ]
    
    # Comma-separated profile fields, sent as the `fields` parameter of the
    # newsfeed search and comment requests made by this class
    expanded_profile_fields = "id,screen_name,first_name,last_name,name,deactivated,is_closed,is_admin,sex,city,country,photo_200,photo_100,photo_50,followers_count,members_count"  # https://vk.com/dev/objects/user & https://vk.com/dev/objects/group
 34
 35    @classmethod
 36    def get_options(cls, parent_dataset=None, config=None):
 37        """
 38        Get VK data source options
 39
 40        :param config:
 41        :param parent_dataset:  Should always be None
 42        :return dict:  Data source options
 43        """
 44
 45        intro_text = ("This data source uses VK's [API](https://vk.com/dev/first_guide) and a python "
 46                      "[wrapper](https://github.com/python273/vk_api) to request information from VK using your "
 47                      "username and password.")
 48
 49        options = {
 50            "intro-1": {
 51                "type": UserInput.OPTION_INFO,
 52                "help": intro_text
 53            },
 54            "query_type": {
 55                "type": UserInput.OPTION_CHOICE,
 56                "help": "Query Type",
 57                "options": {
 58                    "newsfeed": "News Feed search",
 59                },
 60                "default": "newsfeed"
 61            },
 62            "intro-2": {
 63                "type": UserInput.OPTION_INFO,
 64                "help": "Your username and password will be deleted after your query is complete."
 65            },
 66            "username": {
 67                "type": UserInput.OPTION_TEXT,
 68                "sensitive": True,
 69                "cache": True,
 70                "help": "VK Username"
 71            },
 72            "password": {
 73                "type": UserInput.OPTION_TEXT,
 74                "sensitive": True,
 75                "cache": True,
 76                "help": "VK Password"
 77            },
 78            "intro-3": {
 79                "type": UserInput.OPTION_INFO,
 80                "help": "Enter the text to search for below."
 81            },
 82            "query": {
 83                "type": UserInput.OPTION_TEXT_LARGE,
 84                "help": "Query"
 85            },
 86            "amount": {
 87                "type": UserInput.OPTION_TEXT,
 88                "help": "Max items to retrieve",
 89                "min": 0,
 90                "max": 1000,
 91                "default": 100
 92            },
 93            "include_comments": {
 94                "type": UserInput.OPTION_TOGGLE,
 95                "help": "Include post comments",
 96                "default": False,
 97                "tooltip": ""
 98            },
 99            "divider-2": {
100                "type": UserInput.OPTION_DIVIDER
101            },
102            "daterange-info": {
103                "type": UserInput.OPTION_INFO,
104                "help": "VK daterange defaults vary by type of query. For the News Feed, posts are returned starting "
105                        "with the most recent and working backwards."
106            },
107            "daterange": {
108                "type": UserInput.OPTION_DATERANGE,
109                "help": "Date range"
110            },
111        }
112
113        return options
114
115    def get_items(self, query):
116        """
117        Use the VK API
118
119        :param query:
120        :return:
121        """
122        if self.parameters.get("username") is None or self.parameters.get("password") is None:
123            self.dataset.update_status(
124                "VK query failed or was interrupted; please create new query in order to provide username and password again.",
125                is_final=True)
126            return []
127
128        self.dataset.update_status("Logging in to VK")
129        try:
130            vk_session = self.login(self.parameters.get("username"), self.parameters.get("password"))
131        except vk_api.exceptions.AuthError as e:
132            self.log.warning(f"VK Auth Issues: {e}")
133            self.dataset.update_status(f"VK unable to authorize user: {e}", is_final=True)
134            return []
135
136        query_type = self.parameters.get("query_type")
137        query = self.parameters.get("query")
138        include_comments = self.parameters.get("include_comments", False)
139
140        if query_type == "newsfeed":
141            query_parameters = {"query": query,
142                                "max_amount": self.parameters.get("amount")}
143
144            # Add start and end dates if provided
145            if self.parameters.get("min_date"):
146                query_parameters['start_time'] = self.parameters.get("min_date")
147            if self.parameters.get("max_date"):
148                query_parameters['end_time'] = self.parameters.get("max_date")
149
150            vk_helper = vk_session.get_api()
151
152            # Collect Newsfeed results
153            num_results = 0
154            self.dataset.update_status("Submitting query...")
155            for i, result_batch in enumerate(self.search_newsfeed(vk_helper, **query_parameters)):
156                if self.interrupted:
157                    raise ProcessorInterruptedException("Interrupted while fetching newsfeed data from the VK API")
158
159                self.dataset.update_status(f"Processing results batch {i+1}")
160                for result in result_batch:
161                    result.update({'4cat_item_type': 'post'})
162                    yield result
163                    num_results += 1
164
165                    if include_comments:
166                        for comment in self.collect_all_comments(vk_helper, owner_id=result.get("owner_id"), post_id=result.get("id")):
167                            comment.update({'4cat_item_type': 'comment'})
168                            yield comment
169
170                    self.dataset.update_status(f"Received {num_results} results of max {self.parameters.get('amount')} from the VK API")
171                    self.dataset.update_progress(num_results / self.parameters.get('amount'))
172
173    def login(self, username, password):
174        """
175        Login and authenticate user
176        """
177        vk_session = vk_api.VkApi(username,
178                                  password,
179                                  config_filename=self.config.get("PATH_SESSIONS").joinpath(username+"-vk_config.json"))
180        vk_session.auth()
181
182        return vk_session
183
184    def search_newsfeed(self, vk_helper, query, max_amount, num_collected=0, start_time=None, end_time=None, **kwargs):
185        """
186        Collects all newsfeed posts
187
188        :param Object vk_helper:    Authorized vk_api.VkApi
189        :param str query:           String representing the search query
190        :param int max_amount:      Max number of posts to collect
191        :param int num_collected:   Number of previously collected results
192        :param int start_time:      Timestamp for earliest post
193        :param int end_time:        Timestamp for latest post
194        :return generator:          Yields groups of posts
195        """
196        remaining = max_amount - num_collected
197        parameters = {
198            "q": query,
199            "extended": 1,
200            "count": remaining if remaining < 200 else 200,
201            "fields": self.expanded_profile_fields,
202        }
203        if start_time:
204            parameters["start_time"] = start_time
205        if end_time:
206            parameters["end_time"] = end_time
207
208        response = vk_helper.newsfeed.search(**parameters)
209        news_feed_results = response.get("items", [])
210        num_collected = num_collected + len(news_feed_results)
211
212        # Flesh out profiles and groups
213        author_profiles = self.expand_profile_fields({"profiles": response.get("profiles", []), "groups": response.get("groups", [])})
214        [result.update({"author_profile": author_profiles.get(result.get("from_id"), {})}) for result in news_feed_results]
215
216        yield news_feed_results
217
218        # Collect additional results
219        if response.get("next_from") and num_collected < max_amount:
220            parameters.update({"start_from": response.get("next_from")})
221            for additional_results in self.search_newsfeed(vk_helper, query, max_amount, num_collected=num_collected, **parameters):
222                yield additional_results
223
224    def collect_all_comments(self, vk_helper, owner_id, post_id):
225        """
226        Collects all comments and replies to a VK post
227
228        :param Object vk_helper:           Authorized vk_api.VkApi
229        :param int owner_id:            Owner ID provided by post/comment/etc
230        :param int post_id:             ID of post from which to collect comments
231        :return generator:              Yields comments and replies
232        """
233        # Collect top level comments from post
234        comments = self.get_comments(vk_helper, owner_id, post_id=post_id)
235
236        # Extract replies and collect more if needed
237        for comment in comments:
238            yield comment
239
240            reply_count = comment.get("thread", {}).get("count", 0)
241            replies = comment.get("thread", {}).get("items", [])
242            if reply_count > 10 and len(replies) == 10:
243                # Collect additional replies
244                replies += self.get_comments(vk_helper, owner_id, comment_id=comment.get("id"), last_collected_id=replies[-1].get("id"))[1:]
245
246            for reply in replies:
247                yield reply
248                if reply.get("thread"):
249                    self.log.warning("VK Datasource issue with replies: additional depth needs to be handled; contact 4CAT devs")
250                    # TODO: this will need modification if reply threads gain depth
251
252    def get_comments(self, vk_helper, owner_id, post_id=None, comment_id=None, last_collected_id=None, **kwargs):
253        """
254        Collect comments from either a post or another comment (i.e., replies to another comment). Must provide either
255        post_id or comment_id, but not both.
256
257        More information can be found here:
258        https://vk.com/dev/wall.getComments
259
260        :param Object vk_helper:       Authorized vk_api.VkApi
261        :param int owner_id:            Owner ID provided by post/comment/etc
262        :param int post_id:             ID of post from which to collect comments
263        :param int comment_id:          ID of comment from which to collect comments
264        :param int last_collected_id:   ID of the last comment to collected; used as start to continue collecting comments
265        :return list:                   List of comments
266        """
267        if self.interrupted:
268            raise ProcessorInterruptedException("Interrupted while fetching comments from the VK API")
269
270        if post_id is None and comment_id is None:
271            raise ProcessorException("Must provide either post_id or comment_id to collect comments from VK")
272
273        parameters = {
274            "owner_id": owner_id,
275            "need_likes": 1,
276            "preview_length": 0,
277            "extended": 1,
278            "count": 100,
279            "thread_items_count": 10,
280            "fields": self.expanded_profile_fields,
281        }
282        if post_id:
283            parameters.update({"post_id": post_id})
284        if comment_id:
285            parameters.update({"comment_id": comment_id})
286        if last_collected_id:
287            parameters.update({"start_comment_id": last_collected_id})
288
289        # Collect comments from VK
290        try:
291            response = vk_helper.wall.getComments(**parameters)
292        except vk_api.exceptions.ApiError as e:
293            self.dataset.log(f"Unable to collect comments for owner_id {owner_id} and {'post_id' if post_id is not None else 'comment_id'} {post_id if post_id is not None else comment_id}: {e}")
294            return []
295        comments = response.get("items", [])
296
297        # Flesh out profiles and groups
298        author_profiles = self.expand_profile_fields({"profiles": response.get("profiles", []), "groups": response.get("groups", [])})
299        [comment.update({"author_profile": author_profiles.get(comment.get("from_id"), {})}) for comment in comments]
300        # Also expand replies
301        [reply.update({"author_profile": author_profiles.get(reply.get("from_id"), {})}) for replies in [comment.get("thread", {}).get("items", []) for comment in comments if comment.get("thread")] for reply in replies]
302
303        # Check if there are potentially additional comments
304        if response.get("count") > 100 and len(comments) == 100:
305            # Update params with last collected comment
306            parameters.update({"start_comment_id": comments[-1].get("id")})
307            # Collect additional comments from VK and remove first comment (which is duplicate)
308            comments += self.get_comments(vk_helper=vk_helper, **parameters)[1:]
309
310        return comments
311
312    @ staticmethod
313    def expand_profile_fields(dict_of_profile_types):
314        """
315        Combine various VK profile and group author information for easy lookup. Add 4CAT_author_profile_type field to
316        differentiate source of data later.
317        """
318        author_types = {}
319        for profile_type, profiles in dict_of_profile_types.items():
320            for profile in profiles:
321                if "id" not in profile:
322                    raise ProcessorException("Profile missing id field; VK data format incorrect/changed")
323                elif profile.get("id") in author_types:
324                    raise ProcessorException("Profile id duplicated across profile types; unable to combine profiles")
325                profile.update({"4CAT_author_profile_type": profile_type})
326                author_types[profile.get("id")] = profile
327        return author_types
328
329    @staticmethod
330    def validate_query(query, request, config):
331        """
332        Validate input for a dataset query on the VK data source.
333
334        Will raise a QueryParametersException if invalid parameters are
335        encountered. Parameters are additionally sanitised.
336
337        :param dict query:  Query parameters, from client-side.
338        :param request:  Flask request
339        :param ConfigManager|None config:  Configuration reader (context-aware)
340        :return dict:  Safe query parameters
341        """
342        # Please provide something...
343        if not query.get("query", None):
344            raise QueryParametersException("Please provide a query.")
345
346        # the dates need to make sense as a range to search within
347        # but, on VK, you can also specify before *or* after only
348        after, before = query.get("daterange")
349        if before and after and before < after:
350            raise QueryParametersException("Date range must start before it ends")
351
352        # TODO: test username and password?
353
354        # if we made it this far, the query can be executed
355        params = {
356            "query":  query.get("query"),
357            "query_type": query.get("query_type"),
358            "amount": query.get("amount"),
359            "include_comments": query.get("include_comments"),
360            "min_date": after,
361            "max_date": before,
362            "username": query.get("username"),
363            "password": query.get("password"),
364        }
365
366        return params
367
368    @staticmethod
369    def map_item(item):
370        """
371        Map a nested VK object to a flat dictionary
372
373        :param item:  VK object as originally returned by the VK API
374        :return dict:  Dictionary in the format expected by 4CAT
375        """
376        vk_item_time = datetime.datetime.fromtimestamp(item.get('date'))
377
378        # Process attachments
379        photos = []
380        videos = []
381        audio = []
382        links = []
383        docs = []
384        for attachment in item.get("attachments", []):
385            attachment_type = attachment.get("type")
386            attachment = attachment.get(attachment_type)
387            if attachment_type == "photo":
388                if attachment.get("sizes"):
389                    photos.append(sorted(attachment.get("sizes"), key=lambda d: d['width'], reverse=True)[0].get('url'))
390                else:
391                    photos.append(str(attachment))
392            elif attachment_type == "video":
393                # TODO: can I get the actual URL? Does not seem like it...
394                videos.append(f"https://vk.com/video{attachment.get('owner_id')}_{attachment.get('id')}")
395            elif attachment_type == "audio":
396                # TODO: Seem unable to create the URL with provided information...
397                audio.append(f"{attachment.get('artist')} - {attachment.get('title')}")
398            elif attachment_type == "link":
399                links.append(attachment.get('url', str(attachment)))
400            elif attachment_type == "doc":
401                docs.append(attachment.get('url', str(attachment)))
402
403        # Use 4cat_item_type to populate different fields
404        tread_id = ""
405        in_reply_to_user = ""
406        in_reply_to_comment_id = ""
407        if item.get("4cat_item_type") == "post":
408            tread_id = item.get("id")
409        elif item.get("4cat_item_type") == "comment":
410            tread_id = item.get("post_id")
411            in_reply_to_user = item.get("reply_to_user")
412            in_reply_to_comment_id = item.get("reply_to_comment")
413
414        author_profile = item.get("author_profile", {})
415        profile_source = "user" if author_profile.get("4CAT_author_profile_type") == "profile" else "community" if author_profile.get("4CAT_author_profile_type") == "group" else "N/A"
416        # Use source of author profile if "type" not present (e.g., in users profiles do not seem to have type)
417        author_type = author_profile.get("type", profile_source)
418
419        return MappedItem({
420            "id": item.get("id"),
421            "thread_id": tread_id,
422            "timestamp": vk_item_time.strftime("%Y-%m-%d %H:%M:%S"),
423            "unix_timestamp": int(vk_item_time.timestamp()),
424            "link": f"https://vk.com/wall{item.get('owner_id')}_{item.get('id')}",
425            "item_type": item.get("4cat_item_type"),
426            "body": item.get("text"),
427            "author_id": item.get("from_id"),
428            "author_type": author_type,
429            "author_screen_name": author_profile.get("screen_name"),
430            "author_name": author_profile.get("name", " ".join([author_profile.get("first_name", ""), author_profile.get("last_name", "")])),
431            "author_sex": "F" if author_profile.get("sex") == 1 else "M" if author_profile.get("sex") == 2 else "Not Specified" if author_profile.get("sex") == 0 else author_profile.get("sex", "N/A"),
432            "author_city": author_profile.get("city", {}).get("title", ""),
433            "author_country": author_profile.get("country", {}).get("title", ""),
434            "author_photo": author_profile.get("photo_200",
435                                               author_profile.get("photo_100", author_profile.get("photo_50", ""))),
436            "author_is_admin": True if author_profile.get("is_admin") == 1 else False if author_profile.get("is_admin") == 0 else author_profile.get("is_admin", "N/A"),
437            "author_is_advertiser": True if author_profile.get("is_advertiser") == 1 else False if author_profile.get(
438                "is_advertiser") == 0 else author_profile.get("is_advertiser", "N/A"),
439            "author_deactivated": author_profile.get("is_deactivated", False),
440            "author_privacy_is_closed": 'closed' if author_profile.get("is_closed") == 1 else 'open' if author_profile.get("is_closed") == 0 else 'private' if author_profile.get("is_closed") == 2 else author_profile.get("is_closed", "N/A"),
441            "author_followers": author_profile.get("followers_count", author_profile.get("members_count", "N/A")),
442            "in_reply_to_user": in_reply_to_user,
443            "in_reply_to_comment_id": in_reply_to_comment_id,
444            "source": item.get("post_source", {}).get("type"),
445            "views": item.get("views", {}).get("count"),
446            "likes": item.get("likes", {}).get("count"),
447            "post_comments": item.get("comments", {}).get("count"),
448            "edited": datetime.datetime.fromtimestamp(item.get("edited")).strftime("%Y-%m-%d %H:%M:%S") if item.get("edited", False) else False,
449            "photos": ", ".join(photos),
450            "videos": ", ".join(videos),
451            "audio": ", ".join(audio),
452            "links": ", ".join(links),
453            "docs": ", ".join(docs),
454            "subject": "",
455        })

Get posts via the VK API

type = 'vk-search'
title = 'VK'
extension = 'ndjson'
is_local = False
is_static = False
previous_request = 0
import_issues = True
references = ['[VK API documentation](https://vk.com/dev/first_guide)', '[Python API wrapper](https://github.com/python273/vk_api)']
expanded_profile_fields = 'id,screen_name,first_name,last_name,name,deactivated,is_closed,is_admin,sex,city,country,photo_200,photo_100,photo_50,followers_count,members_count'
@classmethod
def get_options(cls, parent_dataset=None, config=None):
 35    @classmethod
 36    def get_options(cls, parent_dataset=None, config=None):
 37        """
 38        Get VK data source options
 39
 40        :param config:
 41        :param parent_dataset:  Should always be None
 42        :return dict:  Data source options
 43        """
 44
 45        intro_text = ("This data source uses VK's [API](https://vk.com/dev/first_guide) and a python "
 46                      "[wrapper](https://github.com/python273/vk_api) to request information from VK using your "
 47                      "username and password.")
 48
 49        options = {
 50            "intro-1": {
 51                "type": UserInput.OPTION_INFO,
 52                "help": intro_text
 53            },
 54            "query_type": {
 55                "type": UserInput.OPTION_CHOICE,
 56                "help": "Query Type",
 57                "options": {
 58                    "newsfeed": "News Feed search",
 59                },
 60                "default": "newsfeed"
 61            },
 62            "intro-2": {
 63                "type": UserInput.OPTION_INFO,
 64                "help": "Your username and password will be deleted after your query is complete."
 65            },
 66            "username": {
 67                "type": UserInput.OPTION_TEXT,
 68                "sensitive": True,
 69                "cache": True,
 70                "help": "VK Username"
 71            },
 72            "password": {
 73                "type": UserInput.OPTION_TEXT,
 74                "sensitive": True,
 75                "cache": True,
 76                "help": "VK Password"
 77            },
 78            "intro-3": {
 79                "type": UserInput.OPTION_INFO,
 80                "help": "Enter the text to search for below."
 81            },
 82            "query": {
 83                "type": UserInput.OPTION_TEXT_LARGE,
 84                "help": "Query"
 85            },
 86            "amount": {
 87                "type": UserInput.OPTION_TEXT,
 88                "help": "Max items to retrieve",
 89                "min": 0,
 90                "max": 1000,
 91                "default": 100
 92            },
 93            "include_comments": {
 94                "type": UserInput.OPTION_TOGGLE,
 95                "help": "Include post comments",
 96                "default": False,
 97                "tooltip": ""
 98            },
 99            "divider-2": {
100                "type": UserInput.OPTION_DIVIDER
101            },
102            "daterange-info": {
103                "type": UserInput.OPTION_INFO,
104                "help": "VK daterange defaults vary by type of query. For the News Feed, posts are returned starting "
105                        "with the most recent and working backwards."
106            },
107            "daterange": {
108                "type": UserInput.OPTION_DATERANGE,
109                "help": "Date range"
110            },
111        }
112
113        return options

Get VK data source options

Parameters
  • config:
  • parent_dataset: Should always be None
Returns

Data source options

def get_items(self, query):
115    def get_items(self, query):
116        """
117        Use the VK API
118
119        :param query:
120        :return:
121        """
122        if self.parameters.get("username") is None or self.parameters.get("password") is None:
123            self.dataset.update_status(
124                "VK query failed or was interrupted; please create new query in order to provide username and password again.",
125                is_final=True)
126            return []
127
128        self.dataset.update_status("Logging in to VK")
129        try:
130            vk_session = self.login(self.parameters.get("username"), self.parameters.get("password"))
131        except vk_api.exceptions.AuthError as e:
132            self.log.warning(f"VK Auth Issues: {e}")
133            self.dataset.update_status(f"VK unable to authorize user: {e}", is_final=True)
134            return []
135
136        query_type = self.parameters.get("query_type")
137        query = self.parameters.get("query")
138        include_comments = self.parameters.get("include_comments", False)
139
140        if query_type == "newsfeed":
141            query_parameters = {"query": query,
142                                "max_amount": self.parameters.get("amount")}
143
144            # Add start and end dates if provided
145            if self.parameters.get("min_date"):
146                query_parameters['start_time'] = self.parameters.get("min_date")
147            if self.parameters.get("max_date"):
148                query_parameters['end_time'] = self.parameters.get("max_date")
149
150            vk_helper = vk_session.get_api()
151
152            # Collect Newsfeed results
153            num_results = 0
154            self.dataset.update_status("Submitting query...")
155            for i, result_batch in enumerate(self.search_newsfeed(vk_helper, **query_parameters)):
156                if self.interrupted:
157                    raise ProcessorInterruptedException("Interrupted while fetching newsfeed data from the VK API")
158
159                self.dataset.update_status(f"Processing results batch {i+1}")
160                for result in result_batch:
161                    result.update({'4cat_item_type': 'post'})
162                    yield result
163                    num_results += 1
164
165                    if include_comments:
166                        for comment in self.collect_all_comments(vk_helper, owner_id=result.get("owner_id"), post_id=result.get("id")):
167                            comment.update({'4cat_item_type': 'comment'})
168                            yield comment
169
170                    self.dataset.update_status(f"Received {num_results} results of max {self.parameters.get('amount')} from the VK API")
171                    self.dataset.update_progress(num_results / self.parameters.get('amount'))

Use the VK API

Parameters
  • query:
Returns
def login(self, username, password):
173    def login(self, username, password):
174        """
175        Login and authenticate user
176        """
177        vk_session = vk_api.VkApi(username,
178                                  password,
179                                  config_filename=self.config.get("PATH_SESSIONS").joinpath(username+"-vk_config.json"))
180        vk_session.auth()
181
182        return vk_session

Login and authenticate user

def search_newsfeed( self, vk_helper, query, max_amount, num_collected=0, start_time=None, end_time=None, **kwargs):
184    def search_newsfeed(self, vk_helper, query, max_amount, num_collected=0, start_time=None, end_time=None, **kwargs):
185        """
186        Collects all newsfeed posts
187
188        :param Object vk_helper:    Authorized vk_api.VkApi
189        :param str query:           String representing the search query
190        :param int max_amount:      Max number of posts to collect
191        :param int num_collected:   Number of previously collected results
192        :param int start_time:      Timestamp for earliest post
193        :param int end_time:        Timestamp for latest post
194        :return generator:          Yields groups of posts
195        """
196        remaining = max_amount - num_collected
197        parameters = {
198            "q": query,
199            "extended": 1,
200            "count": remaining if remaining < 200 else 200,
201            "fields": self.expanded_profile_fields,
202        }
203        if start_time:
204            parameters["start_time"] = start_time
205        if end_time:
206            parameters["end_time"] = end_time
207
208        response = vk_helper.newsfeed.search(**parameters)
209        news_feed_results = response.get("items", [])
210        num_collected = num_collected + len(news_feed_results)
211
212        # Flesh out profiles and groups
213        author_profiles = self.expand_profile_fields({"profiles": response.get("profiles", []), "groups": response.get("groups", [])})
214        [result.update({"author_profile": author_profiles.get(result.get("from_id"), {})}) for result in news_feed_results]
215
216        yield news_feed_results
217
218        # Collect additional results
219        if response.get("next_from") and num_collected < max_amount:
220            parameters.update({"start_from": response.get("next_from")})
221            for additional_results in self.search_newsfeed(vk_helper, query, max_amount, num_collected=num_collected, **parameters):
222                yield additional_results

Collects all newsfeed posts

Parameters
  • Object vk_helper: Authorized vk_api.VkApi
  • str query: String representing the search query
  • int max_amount: Max number of posts to collect
  • int num_collected: Number of previously collected results
  • int start_time: Timestamp for earliest post
  • int end_time: Timestamp for latest post
Returns
      Yields groups of posts
def collect_all_comments(self, vk_helper, owner_id, post_id):
224    def collect_all_comments(self, vk_helper, owner_id, post_id):
225        """
226        Collects all comments and replies to a VK post
227
228        :param Object vk_helper:           Authorized vk_api.VkApi
229        :param int owner_id:            Owner ID provided by post/comment/etc
230        :param int post_id:             ID of post from which to collect comments
231        :return generator:              Yields comments and replies
232        """
233        # Collect top level comments from post
234        comments = self.get_comments(vk_helper, owner_id, post_id=post_id)
235
236        # Extract replies and collect more if needed
237        for comment in comments:
238            yield comment
239
240            reply_count = comment.get("thread", {}).get("count", 0)
241            replies = comment.get("thread", {}).get("items", [])
242            if reply_count > 10 and len(replies) == 10:
243                # Collect additional replies
244                replies += self.get_comments(vk_helper, owner_id, comment_id=comment.get("id"), last_collected_id=replies[-1].get("id"))[1:]
245
246            for reply in replies:
247                yield reply
248                if reply.get("thread"):
249                    self.log.warning("VK Datasource issue with replies: additional depth needs to be handled; contact 4CAT devs")
250                    # TODO: this will need modification if reply threads gain depth

Collects all comments and replies to a VK post

Parameters
  • Object vk_helper: Authorized vk_api.VkApi
  • int owner_id: Owner ID provided by post/comment/etc
  • int post_id: ID of post from which to collect comments
Returns
          Yields comments and replies
def get_comments( self, vk_helper, owner_id, post_id=None, comment_id=None, last_collected_id=None, **kwargs):
252    def get_comments(self, vk_helper, owner_id, post_id=None, comment_id=None, last_collected_id=None, **kwargs):
253        """
254        Collect comments from either a post or another comment (i.e., replies to another comment). Must provide either
255        post_id or comment_id, but not both.
256
257        More information can be found here:
258        https://vk.com/dev/wall.getComments
259
260        :param Object vk_helper:       Authorized vk_api.VkApi
261        :param int owner_id:            Owner ID provided by post/comment/etc
262        :param int post_id:             ID of post from which to collect comments
263        :param int comment_id:          ID of comment from which to collect comments
264        :param int last_collected_id:   ID of the last comment to collected; used as start to continue collecting comments
265        :return list:                   List of comments
266        """
267        if self.interrupted:
268            raise ProcessorInterruptedException("Interrupted while fetching comments from the VK API")
269
270        if post_id is None and comment_id is None:
271            raise ProcessorException("Must provide either post_id or comment_id to collect comments from VK")
272
273        parameters = {
274            "owner_id": owner_id,
275            "need_likes": 1,
276            "preview_length": 0,
277            "extended": 1,
278            "count": 100,
279            "thread_items_count": 10,
280            "fields": self.expanded_profile_fields,
281        }
282        if post_id:
283            parameters.update({"post_id": post_id})
284        if comment_id:
285            parameters.update({"comment_id": comment_id})
286        if last_collected_id:
287            parameters.update({"start_comment_id": last_collected_id})
288
289        # Collect comments from VK
290        try:
291            response = vk_helper.wall.getComments(**parameters)
292        except vk_api.exceptions.ApiError as e:
293            self.dataset.log(f"Unable to collect comments for owner_id {owner_id} and {'post_id' if post_id is not None else 'comment_id'} {post_id if post_id is not None else comment_id}: {e}")
294            return []
295        comments = response.get("items", [])
296
297        # Flesh out profiles and groups
298        author_profiles = self.expand_profile_fields({"profiles": response.get("profiles", []), "groups": response.get("groups", [])})
299        [comment.update({"author_profile": author_profiles.get(comment.get("from_id"), {})}) for comment in comments]
300        # Also expand replies
301        [reply.update({"author_profile": author_profiles.get(reply.get("from_id"), {})}) for replies in [comment.get("thread", {}).get("items", []) for comment in comments if comment.get("thread")] for reply in replies]
302
303        # Check if there are potentially additional comments
304        if response.get("count") > 100 and len(comments) == 100:
305            # Update params with last collected comment
306            parameters.update({"start_comment_id": comments[-1].get("id")})
307            # Collect additional comments from VK and remove first comment (which is duplicate)
308            comments += self.get_comments(vk_helper=vk_helper, **parameters)[1:]
309
310        return comments

Collect comments from either a post or another comment (i.e., replies to another comment). Must provide either post_id or comment_id, but not both.

More information can be found here: https://vk.com/dev/wall.getComments

Parameters
  • Object vk_helper: Authorized vk_api.VkApi
  • int owner_id: Owner ID provided by post/comment/etc
  • int post_id: ID of post from which to collect comments
  • int comment_id: ID of comment from which to collect comments
  • int last_collected_id: ID of the last comment collected; used as the starting point to continue collecting comments
Returns
               List of comments
@staticmethod
def expand_profile_fields(dict_of_profile_types):
312    @ staticmethod
313    def expand_profile_fields(dict_of_profile_types):
314        """
315        Combine various VK profile and group author information for easy lookup. Add 4CAT_author_profile_type field to
316        differentiate source of data later.
317        """
318        author_types = {}
319        for profile_type, profiles in dict_of_profile_types.items():
320            for profile in profiles:
321                if "id" not in profile:
322                    raise ProcessorException("Profile missing id field; VK data format incorrect/changed")
323                elif profile.get("id") in author_types:
324                    raise ProcessorException("Profile id duplicated across profile types; unable to combine profiles")
325                profile.update({"4CAT_author_profile_type": profile_type})
326                author_types[profile.get("id")] = profile
327        return author_types

Combine various VK profile and group author information for easy lookup. Add 4CAT_author_profile_type field to differentiate source of data later.

@staticmethod
def validate_query(query, request, config):
329    @staticmethod
330    def validate_query(query, request, config):
331        """
332        Validate input for a dataset query on the VK data source.
333
334        Will raise a QueryParametersException if invalid parameters are
335        encountered. Parameters are additionally sanitised.
336
337        :param dict query:  Query parameters, from client-side.
338        :param request:  Flask request
339        :param ConfigManager|None config:  Configuration reader (context-aware)
340        :return dict:  Safe query parameters
341        """
342        # Please provide something...
343        if not query.get("query", None):
344            raise QueryParametersException("Please provide a query.")
345
346        # the dates need to make sense as a range to search within
347        # but, on VK, you can also specify before *or* after only
348        after, before = query.get("daterange")
349        if before and after and before < after:
350            raise QueryParametersException("Date range must start before it ends")
351
352        # TODO: test username and password?
353
354        # if we made it this far, the query can be executed
355        params = {
356            "query":  query.get("query"),
357            "query_type": query.get("query_type"),
358            "amount": query.get("amount"),
359            "include_comments": query.get("include_comments"),
360            "min_date": after,
361            "max_date": before,
362            "username": query.get("username"),
363            "password": query.get("password"),
364        }
365
366        return params

Validate input for a dataset query on the VK data source.

Will raise a QueryParametersException if invalid parameters are encountered. Parameters are additionally sanitised.

Parameters
  • dict query: Query parameters, from client-side.
  • request: Flask request
  • ConfigManager|None config: Configuration reader (context-aware)
Returns

Safe query parameters

@staticmethod
def map_item(item):
368    @staticmethod
369    def map_item(item):
370        """
371        Map a nested VK object to a flat dictionary
372
373        :param item:  VK object as originally returned by the VK API
374        :return dict:  Dictionary in the format expected by 4CAT
375        """
376        vk_item_time = datetime.datetime.fromtimestamp(item.get('date'))
377
378        # Process attachments
379        photos = []
380        videos = []
381        audio = []
382        links = []
383        docs = []
384        for attachment in item.get("attachments", []):
385            attachment_type = attachment.get("type")
386            attachment = attachment.get(attachment_type)
387            if attachment_type == "photo":
388                if attachment.get("sizes"):
389                    photos.append(sorted(attachment.get("sizes"), key=lambda d: d['width'], reverse=True)[0].get('url'))
390                else:
391                    photos.append(str(attachment))
392            elif attachment_type == "video":
393                # TODO: can I get the actual URL? Does not seem like it...
394                videos.append(f"https://vk.com/video{attachment.get('owner_id')}_{attachment.get('id')}")
395            elif attachment_type == "audio":
396                # TODO: Seem unable to create the URL with provided information...
397                audio.append(f"{attachment.get('artist')} - {attachment.get('title')}")
398            elif attachment_type == "link":
399                links.append(attachment.get('url', str(attachment)))
400            elif attachment_type == "doc":
401                docs.append(attachment.get('url', str(attachment)))
402
403        # Use 4cat_item_type to populate different fields
404        tread_id = ""
405        in_reply_to_user = ""
406        in_reply_to_comment_id = ""
407        if item.get("4cat_item_type") == "post":
408            tread_id = item.get("id")
409        elif item.get("4cat_item_type") == "comment":
410            tread_id = item.get("post_id")
411            in_reply_to_user = item.get("reply_to_user")
412            in_reply_to_comment_id = item.get("reply_to_comment")
413
414        author_profile = item.get("author_profile", {})
415        profile_source = "user" if author_profile.get("4CAT_author_profile_type") == "profile" else "community" if author_profile.get("4CAT_author_profile_type") == "group" else "N/A"
416        # Use source of author profile if "type" not present (e.g., in users profiles do not seem to have type)
417        author_type = author_profile.get("type", profile_source)
418
419        return MappedItem({
420            "id": item.get("id"),
421            "thread_id": tread_id,
422            "timestamp": vk_item_time.strftime("%Y-%m-%d %H:%M:%S"),
423            "unix_timestamp": int(vk_item_time.timestamp()),
424            "link": f"https://vk.com/wall{item.get('owner_id')}_{item.get('id')}",
425            "item_type": item.get("4cat_item_type"),
426            "body": item.get("text"),
427            "author_id": item.get("from_id"),
428            "author_type": author_type,
429            "author_screen_name": author_profile.get("screen_name"),
430            "author_name": author_profile.get("name", " ".join([author_profile.get("first_name", ""), author_profile.get("last_name", "")])),
431            "author_sex": "F" if author_profile.get("sex") == 1 else "M" if author_profile.get("sex") == 2 else "Not Specified" if author_profile.get("sex") == 0 else author_profile.get("sex", "N/A"),
432            "author_city": author_profile.get("city", {}).get("title", ""),
433            "author_country": author_profile.get("country", {}).get("title", ""),
434            "author_photo": author_profile.get("photo_200",
435                                               author_profile.get("photo_100", author_profile.get("photo_50", ""))),
436            "author_is_admin": True if author_profile.get("is_admin") == 1 else False if author_profile.get("is_admin") == 0 else author_profile.get("is_admin", "N/A"),
437            "author_is_advertiser": True if author_profile.get("is_advertiser") == 1 else False if author_profile.get(
438                "is_advertiser") == 0 else author_profile.get("is_advertiser", "N/A"),
439            "author_deactivated": author_profile.get("is_deactivated", False),
440            "author_privacy_is_closed": 'closed' if author_profile.get("is_closed") == 1 else 'open' if author_profile.get("is_closed") == 0 else 'private' if author_profile.get("is_closed") == 2 else author_profile.get("is_closed", "N/A"),
441            "author_followers": author_profile.get("followers_count", author_profile.get("members_count", "N/A")),
442            "in_reply_to_user": in_reply_to_user,
443            "in_reply_to_comment_id": in_reply_to_comment_id,
444            "source": item.get("post_source", {}).get("type"),
445            "views": item.get("views", {}).get("count"),
446            "likes": item.get("likes", {}).get("count"),
447            "post_comments": item.get("comments", {}).get("count"),
448            "edited": datetime.datetime.fromtimestamp(item.get("edited")).strftime("%Y-%m-%d %H:%M:%S") if item.get("edited", False) else False,
449            "photos": ", ".join(photos),
450            "videos": ", ".join(videos),
451            "audio": ", ".join(audio),
452            "links": ", ".join(links),
453            "docs": ", ".join(docs),
454            "subject": "",
455        })

Map a nested VK object to a flat dictionary

Parameters
  • item: VK object as originally returned by the VK API
Returns

Dictionary in the format expected by 4CAT