datasources.vk.search_vk
VK keyword search
1""" 2VK keyword search 3""" 4import datetime 5 6import vk_api 7 8from backend.lib.search import Search 9from common.lib.exceptions import QueryParametersException, ProcessorInterruptedException, ProcessorException 10from common.lib.helpers import UserInput 11from common.lib.item_mapping import MappedItem 12 13 14class SearchVK(Search): 15 """ 16 Get posts via the VK API 17 """ 18 type = "vk-search" # job ID 19 title = "VK" 20 extension = "ndjson" 21 is_local = False # Whether this datasource is locally scraped 22 is_static = False # Whether this datasource is still updated 23 24 previous_request = 0 25 import_issues = True 26 27 references = [ 28 "[VK API documentation](https://vk.com/dev/first_guide)", 29 "[Python API wrapper](https://github.com/python273/vk_api)" 30 ] 31 32 expanded_profile_fields = "id,screen_name,first_name,last_name,name,deactivated,is_closed,is_admin,sex,city,country,photo_200,photo_100,photo_50,followers_count,members_count" # https://vk.com/dev/objects/user & https://vk.com/dev/objects/group 33 34 @classmethod 35 def get_options(cls, parent_dataset=None, config=None): 36 """ 37 Get VK data source options 38 39 :param config: 40 :param parent_dataset: Should always be None 41 :return dict: Data source options 42 """ 43 44 intro_text = ("This data source uses VK's [API](https://vk.com/dev/first_guide) and a python " 45 "[wrapper](https://github.com/python273/vk_api) to request information from VK using your " 46 "username and password.") 47 48 options = { 49 "intro-1": { 50 "type": UserInput.OPTION_INFO, 51 "help": intro_text 52 }, 53 "query_type": { 54 "type": UserInput.OPTION_CHOICE, 55 "help": "Query Type", 56 "options": { 57 "newsfeed": "News Feed search", 58 }, 59 "default": "newsfeed" 60 }, 61 "intro-2": { 62 "type": UserInput.OPTION_INFO, 63 "help": "Your username and password will be deleted after your query is complete." 
64 }, 65 "username": { 66 "type": UserInput.OPTION_TEXT, 67 "sensitive": True, 68 "cache": True, 69 "help": "VK Username" 70 }, 71 "password": { 72 "type": UserInput.OPTION_TEXT, 73 "sensitive": True, 74 "cache": True, 75 "help": "VK Password" 76 }, 77 "intro-3": { 78 "type": UserInput.OPTION_INFO, 79 "help": "Enter the text to search for below." 80 }, 81 "query": { 82 "type": UserInput.OPTION_TEXT_LARGE, 83 "help": "Query" 84 }, 85 "amount": { 86 "type": UserInput.OPTION_TEXT, 87 "help": "Max items to retrieve", 88 "min": 0, 89 "max": 1000, 90 "default": 100 91 }, 92 "include_comments": { 93 "type": UserInput.OPTION_TOGGLE, 94 "help": "Include post comments", 95 "default": False, 96 "tooltip": "" 97 }, 98 "divider-2": { 99 "type": UserInput.OPTION_DIVIDER 100 }, 101 "daterange-info": { 102 "type": UserInput.OPTION_INFO, 103 "help": "VK daterange defaults vary by type of query. For the News Feed, posts are returned starting " 104 "with the most recent and working backwards." 105 }, 106 "daterange": { 107 "type": UserInput.OPTION_DATERANGE, 108 "help": "Date range" 109 }, 110 } 111 112 return options 113 114 def get_items(self, query): 115 """ 116 Use the VK API 117 118 :param query: 119 :return: 120 """ 121 if self.parameters.get("username") is None or self.parameters.get("password") is None: 122 self.dataset.update_status( 123 "VK query failed or was interrupted; please create new query in order to provide username and password again.", 124 is_final=True) 125 return [] 126 127 self.dataset.update_status("Logging in to VK") 128 try: 129 vk_session = self.login(self.parameters.get("username"), self.parameters.get("password")) 130 except vk_api.exceptions.AuthError as e: 131 self.log.warning(f"VK Auth Issues: {e}") 132 self.dataset.update_status(f"VK unable to authorize user: {e}", is_final=True) 133 return [] 134 135 query_type = self.parameters.get("query_type") 136 query = self.parameters.get("query") 137 include_comments = self.parameters.get("include_comments", 
False) 138 139 if query_type == "newsfeed": 140 query_parameters = {"query": query, 141 "max_amount": self.parameters.get("amount")} 142 143 # Add start and end dates if provided 144 if self.parameters.get("min_date"): 145 query_parameters['start_time'] = self.parameters.get("min_date") 146 if self.parameters.get("max_date"): 147 query_parameters['end_time'] = self.parameters.get("max_date") 148 149 vk_helper = vk_session.get_api() 150 151 # Collect Newsfeed results 152 num_results = 0 153 self.dataset.update_status("Submitting query...") 154 for i, result_batch in enumerate(self.search_newsfeed(vk_helper, **query_parameters)): 155 if self.interrupted: 156 raise ProcessorInterruptedException("Interrupted while fetching newsfeed data from the VK API") 157 158 self.dataset.update_status(f"Processing results batch {i+1}") 159 for result in result_batch: 160 result.update({'4cat_item_type': 'post'}) 161 yield result 162 num_results += 1 163 164 if include_comments: 165 for comment in self.collect_all_comments(vk_helper, owner_id=result.get("owner_id"), post_id=result.get("id")): 166 comment.update({'4cat_item_type': 'comment'}) 167 yield comment 168 169 self.dataset.update_status(f"Received {num_results} results of max {self.parameters.get('amount')} from the VK API") 170 self.dataset.update_progress(num_results / self.parameters.get('amount')) 171 172 def login(self, username, password): 173 """ 174 Login and authenticate user 175 """ 176 vk_session = vk_api.VkApi(username, 177 password, 178 config_filename=self.config.get("PATH_SESSIONS").joinpath(username+"-vk_config.json")) 179 vk_session.auth() 180 181 return vk_session 182 183 def search_newsfeed(self, vk_helper, query, max_amount, num_collected=0, start_time=None, end_time=None, **kwargs): 184 """ 185 Collects all newsfeed posts 186 187 :param Object vk_helper: Authorized vk_api.VkApi 188 :param str query: String representing the search query 189 :param int max_amount: Max number of posts to collect 190 :param 
int num_collected: Number of previously collected results 191 :param int start_time: Timestamp for earliest post 192 :param int end_time: Timestamp for latest post 193 :return generator: Yields groups of posts 194 """ 195 remaining = max_amount - num_collected 196 parameters = { 197 "q": query, 198 "extended": 1, 199 "count": remaining if remaining < 200 else 200, 200 "fields": self.expanded_profile_fields, 201 } 202 if start_time: 203 parameters["start_time"] = start_time 204 if end_time: 205 parameters["end_time"] = end_time 206 207 response = vk_helper.newsfeed.search(**parameters) 208 news_feed_results = response.get("items", []) 209 num_collected = num_collected + len(news_feed_results) 210 211 # Flesh out profiles and groups 212 author_profiles = self.expand_profile_fields({"profiles": response.get("profiles", []), "groups": response.get("groups", [])}) 213 [result.update({"author_profile": author_profiles.get(result.get("from_id"), {})}) for result in news_feed_results] 214 215 yield news_feed_results 216 217 # Collect additional results 218 if response.get("next_from") and num_collected < max_amount: 219 parameters.update({"start_from": response.get("next_from")}) 220 for additional_results in self.search_newsfeed(vk_helper, query, max_amount, num_collected=num_collected, **parameters): 221 yield additional_results 222 223 def collect_all_comments(self, vk_helper, owner_id, post_id): 224 """ 225 Collects all comments and replies to a VK post 226 227 :param Object vk_helper: Authorized vk_api.VkApi 228 :param int owner_id: Owner ID provided by post/comment/etc 229 :param int post_id: ID of post from which to collect comments 230 :return generator: Yields comments and replies 231 """ 232 # Collect top level comments from post 233 comments = self.get_comments(vk_helper, owner_id, post_id=post_id) 234 235 # Extract replies and collect more if needed 236 for comment in comments: 237 yield comment 238 239 reply_count = comment.get("thread", {}).get("count", 0) 
240 replies = comment.get("thread", {}).get("items", []) 241 if reply_count > 10 and len(replies) == 10: 242 # Collect additional replies 243 replies += self.get_comments(vk_helper, owner_id, comment_id=comment.get("id"), last_collected_id=replies[-1].get("id"))[1:] 244 245 for reply in replies: 246 yield reply 247 if reply.get("thread"): 248 self.log.warning("VK Datasource issue with replies: additional depth needs to be handled; contact 4CAT devs") 249 # TODO: this will need modification if reply threads gain depth 250 251 def get_comments(self, vk_helper, owner_id, post_id=None, comment_id=None, last_collected_id=None, **kwargs): 252 """ 253 Collect comments from either a post or another comment (i.e., replies to another comment). Must provide either 254 post_id or comment_id, but not both. 255 256 More information can be found here: 257 https://vk.com/dev/wall.getComments 258 259 :param Object vk_helper: Authorized vk_api.VkApi 260 :param int owner_id: Owner ID provided by post/comment/etc 261 :param int post_id: ID of post from which to collect comments 262 :param int comment_id: ID of comment from which to collect comments 263 :param int last_collected_id: ID of the last comment to collected; used as start to continue collecting comments 264 :return list: List of comments 265 """ 266 if self.interrupted: 267 raise ProcessorInterruptedException("Interrupted while fetching comments from the VK API") 268 269 if post_id is None and comment_id is None: 270 raise ProcessorException("Must provide either post_id or comment_id to collect comments from VK") 271 272 parameters = { 273 "owner_id": owner_id, 274 "need_likes": 1, 275 "preview_length": 0, 276 "extended": 1, 277 "count": 100, 278 "thread_items_count": 10, 279 "fields": self.expanded_profile_fields, 280 } 281 if post_id: 282 parameters.update({"post_id": post_id}) 283 if comment_id: 284 parameters.update({"comment_id": comment_id}) 285 if last_collected_id: 286 parameters.update({"start_comment_id": 
last_collected_id}) 287 288 # Collect comments from VK 289 try: 290 response = vk_helper.wall.getComments(**parameters) 291 except vk_api.exceptions.ApiError as e: 292 self.dataset.log(f"Unable to collect comments for owner_id {owner_id} and {'post_id' if post_id is not None else 'comment_id'} {post_id if post_id is not None else comment_id}: {e}") 293 return [] 294 comments = response.get("items", []) 295 296 # Flesh out profiles and groups 297 author_profiles = self.expand_profile_fields({"profiles": response.get("profiles", []), "groups": response.get("groups", [])}) 298 [comment.update({"author_profile": author_profiles.get(comment.get("from_id"), {})}) for comment in comments] 299 # Also expand replies 300 [reply.update({"author_profile": author_profiles.get(reply.get("from_id"), {})}) for replies in [comment.get("thread", {}).get("items", []) for comment in comments if comment.get("thread")] for reply in replies] 301 302 # Check if there are potentially additional comments 303 if response.get("count") > 100 and len(comments) == 100: 304 # Update params with last collected comment 305 parameters.update({"start_comment_id": comments[-1].get("id")}) 306 # Collect additional comments from VK and remove first comment (which is duplicate) 307 comments += self.get_comments(vk_helper=vk_helper, **parameters)[1:] 308 309 return comments 310 311 @ staticmethod 312 def expand_profile_fields(dict_of_profile_types): 313 """ 314 Combine various VK profile and group author information for easy lookup. Add 4CAT_author_profile_type field to 315 differentiate source of data later. 
316 """ 317 author_types = {} 318 for profile_type, profiles in dict_of_profile_types.items(): 319 for profile in profiles: 320 if "id" not in profile: 321 raise ProcessorException("Profile missing id field; VK data format incorrect/changed") 322 elif profile.get("id") in author_types: 323 raise ProcessorException("Profile id duplicated across profile types; unable to combine profiles") 324 profile.update({"4CAT_author_profile_type": profile_type}) 325 author_types[profile.get("id")] = profile 326 return author_types 327 328 @staticmethod 329 def validate_query(query, request, config): 330 """ 331 Validate input for a dataset query on the VK data source. 332 333 Will raise a QueryParametersException if invalid parameters are 334 encountered. Parameters are additionally sanitised. 335 336 :param dict query: Query parameters, from client-side. 337 :param request: Flask request 338 :param ConfigManager|None config: Configuration reader (context-aware) 339 :return dict: Safe query parameters 340 """ 341 # Please provide something... 342 if not query.get("query", None): 343 raise QueryParametersException("Please provide a query.") 344 345 # the dates need to make sense as a range to search within 346 # but, on VK, you can also specify before *or* after only 347 after, before = query.get("daterange") 348 if before and after and before < after: 349 raise QueryParametersException("Date range must start before it ends") 350 351 # TODO: test username and password? 
352 353 # if we made it this far, the query can be executed 354 params = { 355 "query": query.get("query"), 356 "query_type": query.get("query_type"), 357 "amount": query.get("amount"), 358 "include_comments": query.get("include_comments"), 359 "min_date": after, 360 "max_date": before, 361 "username": query.get("username"), 362 "password": query.get("password"), 363 } 364 365 return params 366 367 @staticmethod 368 def map_item(item): 369 """ 370 Map a nested VK object to a flat dictionary 371 372 :param item: VK object as originally returned by the VK API 373 :return dict: Dictionary in the format expected by 4CAT 374 """ 375 vk_item_time = datetime.datetime.fromtimestamp(item.get('date')) 376 377 # Process attachments 378 photos = [] 379 videos = [] 380 audio = [] 381 links = [] 382 docs = [] 383 for attachment in item.get("attachments", []): 384 attachment_type = attachment.get("type") 385 attachment = attachment.get(attachment_type) 386 if attachment_type == "photo": 387 if attachment.get("sizes"): 388 photos.append(sorted(attachment.get("sizes"), key=lambda d: d['width'], reverse=True)[0].get('url')) 389 else: 390 photos.append(str(attachment)) 391 elif attachment_type == "video": 392 # TODO: can I get the actual URL? Does not seem like it... 393 videos.append(f"https://vk.com/video{attachment.get('owner_id')}_{attachment.get('id')}") 394 elif attachment_type == "audio": 395 # TODO: Seem unable to create the URL with provided information... 
396 audio.append(f"{attachment.get('artist')} - {attachment.get('title')}") 397 elif attachment_type == "link": 398 links.append(attachment.get('url', str(attachment))) 399 elif attachment_type == "doc": 400 docs.append(attachment.get('url', str(attachment))) 401 402 # Use 4cat_item_type to populate different fields 403 tread_id = "" 404 in_reply_to_user = "" 405 in_reply_to_comment_id = "" 406 if item.get("4cat_item_type") == "post": 407 tread_id = item.get("id") 408 elif item.get("4cat_item_type") == "comment": 409 tread_id = item.get("post_id") 410 in_reply_to_user = item.get("reply_to_user") 411 in_reply_to_comment_id = item.get("reply_to_comment") 412 413 author_profile = item.get("author_profile", {}) 414 profile_source = "user" if author_profile.get("4CAT_author_profile_type") == "profile" else "community" if author_profile.get("4CAT_author_profile_type") == "group" else "N/A" 415 # Use source of author profile if "type" not present (e.g., in users profiles do not seem to have type) 416 author_type = author_profile.get("type", profile_source) 417 418 return MappedItem({ 419 "id": item.get("id"), 420 "thread_id": tread_id, 421 "timestamp": vk_item_time.strftime("%Y-%m-%d %H:%M:%S"), 422 "unix_timestamp": int(vk_item_time.timestamp()), 423 "link": f"https://vk.com/wall{item.get('owner_id')}_{item.get('id')}", 424 "item_type": item.get("4cat_item_type"), 425 "body": item.get("text"), 426 "author_id": item.get("from_id"), 427 "author_type": author_type, 428 "author_screen_name": author_profile.get("screen_name"), 429 "author_name": author_profile.get("name", " ".join([author_profile.get("first_name", ""), author_profile.get("last_name", "")])), 430 "author_sex": "F" if author_profile.get("sex") == 1 else "M" if author_profile.get("sex") == 2 else "Not Specified" if author_profile.get("sex") == 0 else author_profile.get("sex", "N/A"), 431 "author_city": author_profile.get("city", {}).get("title", ""), 432 "author_country": author_profile.get("country", 
{}).get("title", ""), 433 "author_photo": author_profile.get("photo_200", 434 author_profile.get("photo_100", author_profile.get("photo_50", ""))), 435 "author_is_admin": True if author_profile.get("is_admin") == 1 else False if author_profile.get("is_admin") == 0 else author_profile.get("is_admin", "N/A"), 436 "author_is_advertiser": True if author_profile.get("is_advertiser") == 1 else False if author_profile.get( 437 "is_advertiser") == 0 else author_profile.get("is_advertiser", "N/A"), 438 "author_deactivated": author_profile.get("is_deactivated", False), 439 "author_privacy_is_closed": 'closed' if author_profile.get("is_closed") == 1 else 'open' if author_profile.get("is_closed") == 0 else 'private' if author_profile.get("is_closed") == 2 else author_profile.get("is_closed", "N/A"), 440 "author_followers": author_profile.get("followers_count", author_profile.get("members_count", "N/A")), 441 "in_reply_to_user": in_reply_to_user, 442 "in_reply_to_comment_id": in_reply_to_comment_id, 443 "source": item.get("post_source", {}).get("type"), 444 "views": item.get("views", {}).get("count"), 445 "likes": item.get("likes", {}).get("count"), 446 "post_comments": item.get("comments", {}).get("count"), 447 "edited": datetime.datetime.fromtimestamp(item.get("edited")).strftime("%Y-%m-%d %H:%M:%S") if item.get("edited", False) else False, 448 "photos": ", ".join(photos), 449 "videos": ", ".join(videos), 450 "audio": ", ".join(audio), 451 "links": ", ".join(links), 452 "docs": ", ".join(docs), 453 "subject": "", 454 })
class SearchVK(Search):
    """
    Get posts via the VK API

    Logs in with a user-supplied username and password, runs a News Feed
    keyword search and optionally collects the comments (and replies) of
    every post that was found.
    """
    type = "vk-search"  # job ID
    title = "VK"
    extension = "ndjson"
    is_local = False  # Whether this datasource is locally scraped
    is_static = False  # Whether this datasource is still updated

    previous_request = 0
    import_issues = True

    references = [
        "[VK API documentation](https://vk.com/dev/first_guide)",
        "[Python API wrapper](https://github.com/python273/vk_api)"
    ]

    # Profile/group fields requested alongside posts and comments
    # https://vk.com/dev/objects/user & https://vk.com/dev/objects/group
    expanded_profile_fields = "id,screen_name,first_name,last_name,name,deactivated,is_closed,is_admin,sex,city,country,photo_200,photo_100,photo_50,followers_count,members_count"

    @classmethod
    def get_options(cls, parent_dataset=None, config=None):
        """
        Get VK data source options

        :param config:  Configuration reader (not used for these options)
        :param parent_dataset:  Should always be None
        :return dict:  Data source options
        """

        intro_text = ("This data source uses VK's [API](https://vk.com/dev/first_guide) and a python "
                      "[wrapper](https://github.com/python273/vk_api) to request information from VK using your "
                      "username and password.")

        options = {
            "intro-1": {
                "type": UserInput.OPTION_INFO,
                "help": intro_text
            },
            "query_type": {
                "type": UserInput.OPTION_CHOICE,
                "help": "Query Type",
                "options": {
                    "newsfeed": "News Feed search",
                },
                "default": "newsfeed"
            },
            "intro-2": {
                "type": UserInput.OPTION_INFO,
                "help": "Your username and password will be deleted after your query is complete."
            },
            "username": {
                "type": UserInput.OPTION_TEXT,
                "sensitive": True,
                "cache": True,
                "help": "VK Username"
            },
            "password": {
                "type": UserInput.OPTION_TEXT,
                "sensitive": True,
                "cache": True,
                "help": "VK Password"
            },
            "intro-3": {
                "type": UserInput.OPTION_INFO,
                "help": "Enter the text to search for below."
            },
            "query": {
                "type": UserInput.OPTION_TEXT_LARGE,
                "help": "Query"
            },
            "amount": {
                "type": UserInput.OPTION_TEXT,
                "help": "Max items to retrieve",
                "min": 0,
                "max": 1000,
                "default": 100
            },
            "include_comments": {
                "type": UserInput.OPTION_TOGGLE,
                "help": "Include post comments",
                "default": False,
                "tooltip": ""
            },
            "divider-2": {
                "type": UserInput.OPTION_DIVIDER
            },
            "daterange-info": {
                "type": UserInput.OPTION_INFO,
                "help": "VK daterange defaults vary by type of query. For the News Feed, posts are returned starting "
                        "with the most recent and working backwards."
            },
            "daterange": {
                "type": UserInput.OPTION_DATERANGE,
                "help": "Date range"
            },
        }

        return options

    def get_items(self, query):
        """
        Use the VK API

        Generator that yields every collected post, each immediately followed
        by its comments when `include_comments` is enabled.

        :param query:  Unused; all search parameters are read from
                       `self.parameters`
        :return generator:  Yields post and comment dictionaries, tagged with
                            a `4cat_item_type` of `post` or `comment`
        :raises ProcessorInterruptedException:  If the worker is interrupted
        """
        username = self.parameters.get("username")
        password = self.parameters.get("password")
        if username is None or password is None:
            self.dataset.update_status(
                "VK query failed or was interrupted; please create new query in order to provide username and password again.",
                is_final=True)
            # this is a generator: a bare return simply ends it without items
            return

        self.dataset.update_status("Logging in to VK")
        try:
            vk_session = self.login(username, password)
        except vk_api.exceptions.AuthError as e:
            self.log.warning(f"VK Auth Issues: {e}")
            self.dataset.update_status(f"VK unable to authorize user: {e}", is_final=True)
            return

        query_type = self.parameters.get("query_type")
        include_comments = self.parameters.get("include_comments", False)
        max_amount = self.parameters.get("amount")

        if query_type != "newsfeed":
            # previously an unknown type silently produced an empty dataset
            self.dataset.update_status(f"VK query type '{query_type}' is not supported", is_final=True)
            return

        query_parameters = {"query": self.parameters.get("query"),
                            "max_amount": max_amount}

        # Add start and end dates if provided
        if self.parameters.get("min_date"):
            query_parameters['start_time'] = self.parameters.get("min_date")
        if self.parameters.get("max_date"):
            query_parameters['end_time'] = self.parameters.get("max_date")

        vk_helper = vk_session.get_api()

        # Collect Newsfeed results
        num_results = 0
        self.dataset.update_status("Submitting query...")
        for i, result_batch in enumerate(self.search_newsfeed(vk_helper, **query_parameters)):
            if self.interrupted:
                raise ProcessorInterruptedException("Interrupted while fetching newsfeed data from the VK API")

            self.dataset.update_status(f"Processing results batch {i+1}")
            for result in result_batch:
                result.update({'4cat_item_type': 'post'})
                yield result
                num_results += 1

                if include_comments:
                    for comment in self.collect_all_comments(vk_helper, owner_id=result.get("owner_id"), post_id=result.get("id")):
                        comment.update({'4cat_item_type': 'comment'})
                        yield comment

            self.dataset.update_status(f"Received {num_results} results of max {self.parameters.get('amount')} from the VK API")
            # the option allows an amount of 0, which must not be divided by
            if max_amount:
                self.dataset.update_progress(min(1, num_results / max_amount))

    def login(self, username, password):
        """
        Login and authenticate user

        The session configuration is stored in a per-user file inside the
        configured `PATH_SESSIONS` folder.

        :param str username:  VK username
        :param str password:  VK password
        :return:  Authenticated `vk_api.VkApi` session
        """
        vk_session = vk_api.VkApi(username,
                                  password,
                                  config_filename=self.config.get("PATH_SESSIONS").joinpath(username+"-vk_config.json"))
        vk_session.auth()

        return vk_session

    def search_newsfeed(self, vk_helper, query, max_amount, num_collected=0, start_time=None, end_time=None, **kwargs):
        """
        Collects all newsfeed posts

        Requests pages of at most 200 posts until `max_amount` posts have been
        collected or VK reports no further page.

        :param Object vk_helper:    Authorized vk_api.VkApi
        :param str query:           String representing the search query
        :param int max_amount:      Max number of posts to collect
        :param int num_collected:   Number of previously collected results
        :param int start_time:      Timestamp for earliest post
        :param int end_time:        Timestamp for latest post
        :param kwargs:              May contain `start_from`, a pagination
                                    cursor from a previous response; other
                                    keys are ignored
        :return generator:          Yields groups of posts
        """
        # The cursor used to be dropped between pages, so the first page was
        # requested over and over again; it is now carried explicitly.
        start_from = kwargs.get("start_from")

        while True:
            remaining = max_amount - num_collected
            parameters = {
                "q": query,
                "extended": 1,
                "count": min(remaining, 200),
                "fields": self.expanded_profile_fields,
            }
            if start_time:
                parameters["start_time"] = start_time
            if end_time:
                parameters["end_time"] = end_time
            if start_from:
                parameters["start_from"] = start_from

            response = vk_helper.newsfeed.search(**parameters)
            news_feed_results = response.get("items", [])
            num_collected += len(news_feed_results)

            # Flesh out profiles and groups
            author_profiles = self.expand_profile_fields({"profiles": response.get("profiles", []), "groups": response.get("groups", [])})
            for result in news_feed_results:
                result["author_profile"] = author_profiles.get(result.get("from_id"), {})

            yield news_feed_results

            # Stop when there is no next page, the limit is reached, or VK
            # returned an empty page (which would otherwise loop forever)
            start_from = response.get("next_from")
            if not start_from or not news_feed_results or num_collected >= max_amount:
                return

    def collect_all_comments(self, vk_helper, owner_id, post_id):
        """
        Collects all comments and replies to a VK post

        :param Object vk_helper:           Authorized vk_api.VkApi
        :param int owner_id:            Owner ID provided by post/comment/etc
        :param int post_id:             ID of post from which to collect comments
        :return generator:              Yields comments and replies
        """
        # Collect top level comments from post
        comments = self.get_comments(vk_helper, owner_id, post_id=post_id)

        # Extract replies and collect more if needed
        for comment in comments:
            yield comment

            reply_count = comment.get("thread", {}).get("count", 0)
            replies = comment.get("thread", {}).get("items", [])
            if reply_count > 10 and len(replies) == 10:
                # Collect additional replies; the first one returned is the
                # reply we started from, so it is skipped
                replies += self.get_comments(vk_helper, owner_id, comment_id=comment.get("id"), last_collected_id=replies[-1].get("id"))[1:]

            for reply in replies:
                yield reply
                if reply.get("thread"):
                    self.log.warning("VK Datasource issue with replies: additional depth needs to be handled; contact 4CAT devs")
                    # TODO: this will need modification if reply threads gain depth

    def get_comments(self, vk_helper, owner_id, post_id=None, comment_id=None, last_collected_id=None, **kwargs):
        """
        Collect comments from either a post or another comment (i.e., replies to another comment). Must provide either
        post_id or comment_id, but not both.

        More information can be found here:
        https://vk.com/dev/wall.getComments

        :param Object vk_helper:       Authorized vk_api.VkApi
        :param int owner_id:           Owner ID provided by post/comment/etc
        :param int post_id:            ID of post from which to collect comments
        :param int comment_id:         ID of comment from which to collect comments
        :param int last_collected_id:  ID of the last comment collected; used as start to continue collecting
                                       comments. The comment with this ID is the first item of the returned list.
        :param kwargs:                 May contain `start_comment_id` as an alias of `last_collected_id`; other
                                       keys are ignored
        :return list:                  List of comments; comments collected before an API error are still returned
        :raises ProcessorInterruptedException:  If the worker is interrupted
        :raises ProcessorException:  If neither post_id nor comment_id is given
        """
        if post_id is None and comment_id is None:
            raise ProcessorException("Must provide either post_id or comment_id to collect comments from VK")

        parameters = {
            "owner_id": owner_id,
            "need_likes": 1,
            "preview_length": 0,
            "extended": 1,
            "count": 100,
            "thread_items_count": 10,
            "fields": self.expanded_profile_fields,
        }
        if post_id:
            parameters.update({"post_id": post_id})
        if comment_id:
            parameters.update({"comment_id": comment_id})

        # The start ID used to be lost between pages (it ended up in **kwargs
        # of the recursive call), which re-requested the first page without
        # end; pages are now requested in a loop that carries it explicitly.
        start_comment_id = last_collected_id or kwargs.get("start_comment_id")
        comments = []
        first_page = True

        while True:
            if self.interrupted:
                raise ProcessorInterruptedException("Interrupted while fetching comments from the VK API")

            if start_comment_id:
                parameters.update({"start_comment_id": start_comment_id})

            # Collect comments from VK
            try:
                response = vk_helper.wall.getComments(**parameters)
            except vk_api.exceptions.ApiError as e:
                self.dataset.log(f"Unable to collect comments for owner_id {owner_id} and {'post_id' if post_id is not None else 'comment_id'} {post_id if post_id is not None else comment_id}: {e}")
                return comments
            batch = response.get("items", [])

            # Flesh out profiles and groups
            author_profiles = self.expand_profile_fields({"profiles": response.get("profiles", []), "groups": response.get("groups", [])})
            for comment in batch:
                comment["author_profile"] = author_profiles.get(comment.get("from_id"), {})
                # Also expand replies
                for reply in (comment.get("thread") or {}).get("items", []):
                    reply["author_profile"] = author_profiles.get(reply.get("from_id"), {})

            # On follow-up pages the first comment duplicates the last one
            # of the previous page
            comments += batch if first_page else batch[1:]
            first_page = False

            # Check if there are potentially additional comments
            if (response.get("count") or 0) <= 100 or len(batch) != 100:
                break
            next_start = batch[-1].get("id")
            if next_start == start_comment_id:
                # no progress; avoid requesting the same page forever
                break
            start_comment_id = next_start

        return comments

    @staticmethod
    def expand_profile_fields(dict_of_profile_types):
        """
        Combine various VK profile and group author information for easy lookup. Add 4CAT_author_profile_type field to
        differentiate source of data later.

        :param dict dict_of_profile_types:  Lists of profile dictionaries, keyed by profile type (the callers in this
                                            class use `profiles` and `groups`)
        :return dict:  Profiles keyed by their `id`
        :raises ProcessorException:  If a profile has no `id` or an `id` occurs more than once
        """
        # NOTE(review): users and groups appear to be numbered independently
        # by VK, and posts seem to reference groups by negative `from_id`;
        # if so, the duplicate check may fire on valid data and group lookups
        # by `from_id` may never match - confirm against the VK API docs.
        author_types = {}
        for profile_type, profiles in dict_of_profile_types.items():
            for profile in profiles:
                if "id" not in profile:
                    raise ProcessorException("Profile missing id field; VK data format incorrect/changed")
                elif profile.get("id") in author_types:
                    raise ProcessorException("Profile id duplicated across profile types; unable to combine profiles")
                profile.update({"4CAT_author_profile_type": profile_type})
                author_types[profile.get("id")] = profile
        return author_types

    @staticmethod
    def validate_query(query, request, config):
        """
        Validate input for a dataset query on the VK data source.

        Will raise a QueryParametersException if invalid parameters are
        encountered. Parameters are additionally sanitised.

        :param dict query:  Query parameters, from client-side.
        :param request:  Flask request
        :param ConfigManager|None config:  Configuration reader (context-aware)
        :return dict:  Safe query parameters
        :raises QueryParametersException:  If no query is given or the date
                                           range ends before it starts
        """
        # Please provide something...
        if not query.get("query", None):
            raise QueryParametersException("Please provide a query.")

        # the dates need to make sense as a range to search within
        # but, on VK, you can also specify before *or* after only
        # (a missing date range is treated as "no limits" rather than
        # failing on unpacking None)
        after, before = query.get("daterange") or (None, None)
        if before and after and before < after:
            raise QueryParametersException("Date range must start before it ends")

        # TODO: test username and password?

        # if we made it this far, the query can be executed
        params = {
            "query":  query.get("query"),
            "query_type": query.get("query_type"),
            "amount": query.get("amount"),
            "include_comments": query.get("include_comments"),
            "min_date": after,
            "max_date": before,
            "username": query.get("username"),
            "password": query.get("password"),
        }

        return params

    @staticmethod
    def map_item(item):
        """
        Map a nested VK object to a flat dictionary

        :param item:  VK object as originally returned by the VK API
        :return dict:  Dictionary in the format expected by 4CAT
        """
        vk_item_time = datetime.datetime.fromtimestamp(item.get('date'))

        # Process attachments
        photos = []
        videos = []
        audio = []
        links = []
        docs = []
        for attachment in item.get("attachments", []):
            attachment_type = attachment.get("type")
            # the payload is stored under a key named after the type
            attachment = attachment.get(attachment_type)
            if not attachment:
                # nothing to extract; previously this raised AttributeError
                continue
            if attachment_type == "photo":
                if attachment.get("sizes"):
                    # use the widest available rendition
                    photos.append(max(attachment.get("sizes"), key=lambda d: d['width']).get('url'))
                else:
                    photos.append(str(attachment))
            elif attachment_type == "video":
                # TODO: can I get the actual URL? Does not seem like it...
                videos.append(f"https://vk.com/video{attachment.get('owner_id')}_{attachment.get('id')}")
            elif attachment_type == "audio":
                # TODO: Seem unable to create the URL with provided information...
                audio.append(f"{attachment.get('artist')} - {attachment.get('title')}")
            elif attachment_type == "link":
                links.append(attachment.get('url', str(attachment)))
            elif attachment_type == "doc":
                docs.append(attachment.get('url', str(attachment)))

        # Use 4cat_item_type to populate different fields
        thread_id = ""
        in_reply_to_user = ""
        in_reply_to_comment_id = ""
        if item.get("4cat_item_type") == "post":
            thread_id = item.get("id")
        elif item.get("4cat_item_type") == "comment":
            thread_id = item.get("post_id")
            in_reply_to_user = item.get("reply_to_user")
            in_reply_to_comment_id = item.get("reply_to_comment")

        author_profile = item.get("author_profile", {})
        # expand_profile_fields() stores the plural keys it is called with
        # ("profiles"/"groups"); the singular forms are kept for data that
        # may have been stored with them
        profile_type = author_profile.get("4CAT_author_profile_type")
        if profile_type in ("profile", "profiles"):
            profile_source = "user"
        elif profile_type in ("group", "groups"):
            profile_source = "community"
        else:
            profile_source = "N/A"
        # Use source of author profile if "type" not present (e.g., in users profiles do not seem to have type)
        author_type = author_profile.get("type", profile_source)

        return MappedItem({
            "id": item.get("id"),
            "thread_id": thread_id,
            "timestamp": vk_item_time.strftime("%Y-%m-%d %H:%M:%S"),
            "unix_timestamp": int(vk_item_time.timestamp()),
            "link": f"https://vk.com/wall{item.get('owner_id')}_{item.get('id')}",
            "item_type": item.get("4cat_item_type"),
            "body": item.get("text"),
            "author_id": item.get("from_id"),
            "author_type": author_type,
            "author_screen_name": author_profile.get("screen_name"),
            "author_name": author_profile.get("name", " ".join([author_profile.get("first_name", ""), author_profile.get("last_name", "")])),
            "author_sex": "F" if author_profile.get("sex") == 1 else "M" if author_profile.get("sex") == 2 else "Not Specified" if author_profile.get("sex") == 0 else author_profile.get("sex", "N/A"),
            "author_city": author_profile.get("city", {}).get("title", ""),
            "author_country": author_profile.get("country", {}).get("title", ""),
            "author_photo": author_profile.get("photo_200",
                                               author_profile.get("photo_100", author_profile.get("photo_50", ""))),
            "author_is_admin": True if author_profile.get("is_admin") == 1 else False if author_profile.get("is_admin") == 0 else author_profile.get("is_admin", "N/A"),
            "author_is_advertiser": True if author_profile.get("is_advertiser") == 1 else False if author_profile.get(
                "is_advertiser") == 0 else author_profile.get("is_advertiser", "N/A"),
            # the field requested from VK is "deactivated"; "is_deactivated"
            # is kept as a fallback
            "author_deactivated": author_profile.get("deactivated", author_profile.get("is_deactivated", False)),
            "author_privacy_is_closed": 'closed' if author_profile.get("is_closed") == 1 else 'open' if author_profile.get("is_closed") == 0 else 'private' if author_profile.get("is_closed") == 2 else author_profile.get("is_closed", "N/A"),
            "author_followers": author_profile.get("followers_count", author_profile.get("members_count", "N/A")),
            "in_reply_to_user": in_reply_to_user,
            "in_reply_to_comment_id": in_reply_to_comment_id,
            "source": item.get("post_source", {}).get("type"),
            "views": item.get("views", {}).get("count"),
            "likes": item.get("likes", {}).get("count"),
            "post_comments": item.get("comments", {}).get("count"),
            "edited": datetime.datetime.fromtimestamp(item.get("edited")).strftime("%Y-%m-%d %H:%M:%S") if item.get("edited", False) else False,
            "photos": ", ".join(photos),
            "videos": ", ".join(videos),
            "audio": ", ".join(audio),
            "links": ", ".join(links),
            "docs": ", ".join(docs),
            "subject": "",
        })
Get posts via the VK API
@classmethod
def get_options(cls, parent_dataset=None, config=None):
    """
    Build the option definitions shown for the VK data source

    :param config:  Not used when building these options
    :param parent_dataset:  Should always be None
    :return dict:  Data source options, in the order they are displayed
    """

    def info(text):
        # informational (non-input) block
        return {"type": UserInput.OPTION_INFO, "help": text}

    def credential(label):
        # sensitive, cached text field used for the login details
        return {
            "type": UserInput.OPTION_TEXT,
            "sensitive": True,
            "cache": True,
            "help": label
        }

    intro_text = ("This data source uses VK's [API](https://vk.com/dev/first_guide) and a python "
                  "[wrapper](https://github.com/python273/vk_api) to request information from VK using your "
                  "username and password.")
    daterange_text = ("VK daterange defaults vary by type of query. For the News Feed, posts are returned starting "
                      "with the most recent and working backwards.")

    return {
        "intro-1": info(intro_text),
        "query_type": {
            "type": UserInput.OPTION_CHOICE,
            "help": "Query Type",
            "options": {
                "newsfeed": "News Feed search",
            },
            "default": "newsfeed"
        },
        "intro-2": info("Your username and password will be deleted after your query is complete."),
        "username": credential("VK Username"),
        "password": credential("VK Password"),
        "intro-3": info("Enter the text to search for below."),
        "query": {
            "type": UserInput.OPTION_TEXT_LARGE,
            "help": "Query"
        },
        "amount": {
            "type": UserInput.OPTION_TEXT,
            "help": "Max items to retrieve",
            "min": 0,
            "max": 1000,
            "default": 100
        },
        "include_comments": {
            "type": UserInput.OPTION_TOGGLE,
            "help": "Include post comments",
            "default": False,
            "tooltip": ""
        },
        "divider-2": {
            "type": UserInput.OPTION_DIVIDER
        },
        "daterange-info": info(daterange_text),
        "daterange": {
            "type": UserInput.OPTION_DATERANGE,
            "help": "Date range"
        },
    }
Get VK data source options
Parameters
- config: Configuration reader (not used when building these options)
- parent_dataset: Should always be None
Returns
Data source options
def get_items(self, query):
    """
    Use the VK API

    Generator that yields every collected post, each immediately followed by
    its comments when `include_comments` is enabled.

    :param query:  Unused; all search parameters are read from
                   `self.parameters`
    :return generator:  Yields post and comment dictionaries, tagged with a
                        `4cat_item_type` of `post` or `comment`
    :raises ProcessorInterruptedException:  If the worker is interrupted
    """
    username = self.parameters.get("username")
    password = self.parameters.get("password")
    if username is None or password is None:
        self.dataset.update_status(
            "VK query failed or was interrupted; please create new query in order to provide username and password again.",
            is_final=True)
        # this is a generator: a bare return simply ends it without items
        return

    self.dataset.update_status("Logging in to VK")
    try:
        vk_session = self.login(username, password)
    except vk_api.exceptions.AuthError as e:
        self.log.warning(f"VK Auth Issues: {e}")
        self.dataset.update_status(f"VK unable to authorize user: {e}", is_final=True)
        return

    query_type = self.parameters.get("query_type")
    include_comments = self.parameters.get("include_comments", False)
    max_amount = self.parameters.get("amount")

    if query_type != "newsfeed":
        # previously an unknown type silently produced an empty dataset
        self.dataset.update_status(f"VK query type '{query_type}' is not supported", is_final=True)
        return

    query_parameters = {"query": self.parameters.get("query"),
                        "max_amount": max_amount}

    # Add start and end dates if provided
    if self.parameters.get("min_date"):
        query_parameters['start_time'] = self.parameters.get("min_date")
    if self.parameters.get("max_date"):
        query_parameters['end_time'] = self.parameters.get("max_date")

    vk_helper = vk_session.get_api()

    # Collect Newsfeed results
    num_results = 0
    self.dataset.update_status("Submitting query...")
    for i, result_batch in enumerate(self.search_newsfeed(vk_helper, **query_parameters)):
        if self.interrupted:
            raise ProcessorInterruptedException("Interrupted while fetching newsfeed data from the VK API")

        self.dataset.update_status(f"Processing results batch {i+1}")
        for result in result_batch:
            result.update({'4cat_item_type': 'post'})
            yield result
            num_results += 1

            if include_comments:
                for comment in self.collect_all_comments(vk_helper, owner_id=result.get("owner_id"), post_id=result.get("id")):
                    comment.update({'4cat_item_type': 'comment'})
                    yield comment

        self.dataset.update_status(f"Received {num_results} results of max {self.parameters.get('amount')} from the VK API")
        # the option allows an amount of 0, which must not be divided by
        if max_amount:
            self.dataset.update_progress(min(1, num_results / max_amount))
Use the VK API
Parameters
- query:
Returns
def login(self, username, password):
    """
    Login and authenticate user

    The vk_api session configuration is kept in a per-username file inside
    the configured PATH_SESSIONS folder.

    :param str username:  VK username
    :param str password:  VK password
    :return:  The `vk_api.VkApi` session object after `auth()` has been called
    """
    # NOTE(review): get_items() catches vk_api.exceptions.AuthError around
    # this call; presumably it is raised by auth() below - confirm.
    session_file = self.config.get("PATH_SESSIONS").joinpath(username + "-vk_config.json")
    session = vk_api.VkApi(username, password, config_filename=session_file)
    session.auth()
    return session
Login and authenticate user
def search_newsfeed(self, vk_helper, query, max_amount, num_collected=0, start_time=None, end_time=None, **kwargs):
    """
    Collects all newsfeed posts

    Requests pages of up to 200 posts until `max_amount` posts have been
    collected or VK stops returning a `next_from` cursor. Each post gets an
    `author_profile` key holding the matching profile or group (or an empty
    dict when none matches its `from_id`).

    :param Object vk_helper:  Authorized vk_api.VkApi
    :param str query:  String representing the search query
    :param int max_amount:  Max number of posts to collect
    :param int num_collected:  Number of previously collected results
    :param int start_time:  Timestamp for earliest post
    :param int end_time:  Timestamp for latest post
    :param kwargs:  Only `start_from` (a pagination cursor to resume from) is
      used; any other keyword is ignored
    :return generator:  Yields groups of posts
    """
    parameters = {
        "q": query,
        "extended": 1,
        "fields": self.expanded_profile_fields,
    }
    if start_time:
        parameters["start_time"] = start_time
    if end_time:
        parameters["end_time"] = end_time
    if kwargs.get("start_from"):
        parameters["start_from"] = kwargs["start_from"]

    while True:
        # Never request more than is still needed, nor more than 200 per page
        parameters["count"] = min(max_amount - num_collected, 200)

        response = vk_helper.newsfeed.search(**parameters)
        news_feed_results = response.get("items", [])
        num_collected += len(news_feed_results)

        # Flesh out profiles and groups
        author_profiles = self.expand_profile_fields({"profiles": response.get("profiles", []), "groups": response.get("groups", [])})
        for result in news_feed_results:
            result["author_profile"] = author_profiles.get(result.get("from_id"), {})

        yield news_feed_results

        # Continue only while there is a cursor, the last page was not empty
        # (avoids looping on a cursor that yields nothing) and more is wanted
        next_from = response.get("next_from")
        if not next_from or not news_feed_results or num_collected >= max_amount:
            return

        # The cursor must be part of the next request; without it VK would
        # return the first page again.
        parameters["start_from"] = next_from
Collects all newsfeed posts
Parameters
- Object vk_helper: Authorized vk_api.VkApi
- str query: String representing the search query
- int max_amount: Max number of posts to collect
- int num_collected: Number of previously collected results
- int start_time: Timestamp for earliest post
- int end_time: Timestamp for latest post
Returns
Yields groups of posts
def collect_all_comments(self, vk_helper, owner_id, post_id):
    """
    Collects all comments and replies to a VK post

    Every top-level comment is yielded first, directly followed by its
    replies. Comments arrive with at most 10 replies attached; when a thread
    reports more than that, the remainder is requested separately.

    :param Object vk_helper:  Authorized vk_api.VkApi
    :param int owner_id:  Owner ID provided by post/comment/etc
    :param int post_id:  ID of post from which to collect comments
    :return generator:  Yields comments and replies
    """
    for top_level in self.get_comments(vk_helper, owner_id, post_id=post_id):
        yield top_level

        total_replies = top_level.get("thread", {}).get("count", 0)
        thread_items = top_level.get("thread", {}).get("items", [])

        # A full preview (10 items) with a higher reported total means there
        # are replies that were not included with the comment
        if total_replies > 10 and len(thread_items) == 10:
            more = self.get_comments(vk_helper, owner_id, comment_id=top_level.get("id"), last_collected_id=thread_items[-1].get("id"))
            # first item repeats the reply the request started from
            thread_items.extend(more[1:])

        for thread_item in thread_items:
            yield thread_item
            if thread_item.get("thread"):
                # TODO: this will need modification if reply threads gain depth
                self.log.warning("VK Datasource issue with replies: additional depth needs to be handled; contact 4CAT devs")
Collects all comments and replies to a VK post
Parameters
- Object vk_helper: Authorized vk_api.VkApi
- int owner_id: Owner ID provided by post/comment/etc
- int post_id: ID of post from which to collect comments
Returns
Yields comments and replies
def get_comments(self, vk_helper, owner_id, post_id=None, comment_id=None, last_collected_id=None, **kwargs):
    """
    Collect comments from either a post or another comment (i.e., replies to another comment). Must provide either
    post_id or comment_id, but not both.

    Comments are requested 100 at a time. While a full page is returned and
    the reported total exceeds 100, the next page is requested starting at
    the last collected comment; that comment is returned again by VK and is
    dropped from the follow-up page. If the API reports an error, the error
    is written to the dataset log and whatever was collected so far is
    returned.

    More information can be found here:
    https://vk.com/dev/wall.getComments

    :param Object vk_helper:  Authorized vk_api.VkApi
    :param int owner_id:  Owner ID provided by post/comment/etc
    :param int post_id:  ID of post from which to collect comments
    :param int comment_id:  ID of comment from which to collect comments
    :param int last_collected_id:  ID of the last collected comment; used as start to continue collecting comments.
      The first returned comment is then this comment itself.
    :param kwargs:  Ignored
    :return list:  List of comments
    :raises ProcessorInterruptedException:  If the worker is interrupted
    :raises ProcessorException:  If neither post_id nor comment_id is given
    """
    if self.interrupted:
        raise ProcessorInterruptedException("Interrupted while fetching comments from the VK API")

    if post_id is None and comment_id is None:
        raise ProcessorException("Must provide either post_id or comment_id to collect comments from VK")

    page_size = 100
    parameters = {
        "owner_id": owner_id,
        "need_likes": 1,
        "preview_length": 0,
        "extended": 1,
        "count": page_size,
        "thread_items_count": 10,
        "fields": self.expanded_profile_fields,
    }
    if post_id:
        parameters.update({"post_id": post_id})
    if comment_id:
        parameters.update({"comment_id": comment_id})
    if last_collected_id:
        parameters.update({"start_comment_id": last_collected_id})

    comments = []
    is_follow_up_page = False
    while True:
        # Collect comments from VK
        try:
            response = vk_helper.wall.getComments(**parameters)
        except vk_api.exceptions.ApiError as e:
            self.dataset.log(f"Unable to collect comments for owner_id {owner_id} and {'post_id' if post_id is not None else 'comment_id'} {post_id if post_id is not None else comment_id}: {e}")
            break
        page = response.get("items", [])

        # Flesh out profiles and groups, for comments as well as their replies
        author_profiles = self.expand_profile_fields({"profiles": response.get("profiles", []), "groups": response.get("groups", [])})
        for comment in page:
            comment["author_profile"] = author_profiles.get(comment.get("from_id"), {})
            if comment.get("thread"):
                for reply in comment.get("thread", {}).get("items", []):
                    reply["author_profile"] = author_profiles.get(reply.get("from_id"), {})

        # Follow-up pages start at the last comment of the previous page, so
        # their first comment is a duplicate
        comments.extend(page[1:] if is_follow_up_page else page)

        # Check if there are potentially additional comments
        if (response.get("count") or 0) <= page_size or len(page) != page_size:
            break

        if self.interrupted:
            raise ProcessorInterruptedException("Interrupted while fetching comments from the VK API")

        # Continue from the last collected comment
        parameters["start_comment_id"] = page[-1].get("id")
        is_follow_up_page = True

    return comments
Collect comments from either a post or another comment (i.e., replies to another comment). Must provide either post_id or comment_id, but not both.
More information can be found here: https://vk.com/dev/wall.getComments
Parameters
- Object vk_helper: Authorized vk_api.VkApi
- int owner_id: Owner ID provided by post/comment/etc
- int post_id: ID of post from which to collect comments
- int comment_id: ID of comment from which to collect comments
- int last_collected_id: ID of the last collected comment; used as start to continue collecting comments
Returns
List of comments
@staticmethod
def expand_profile_fields(dict_of_profile_types):
    """
    Combine various VK profile and group author information for easy lookup. Add 4CAT_author_profile_type field to
    differentiate source of data later.

    The result is keyed by the ID under which an author shows up as `from_id`
    on posts and comments: users by their `id`, groups by their *negated*
    `id` (VK reports communities as authors with a negative ID, while group
    objects themselves carry a positive `id`). This also keeps a user and a
    group that share the same numeric ID apart.

    :param dict dict_of_profile_types:  Lists of profile objects, keyed by the
      kind of profile (e.g. "profiles" and "groups"). Each profile object is
      updated in place with the 4CAT_author_profile_type field.
    :return dict:  Profile objects, keyed by author ID
    :raises ProcessorException:  If a profile has no `id`, or if two profiles
      end up with the same lookup key
    """
    author_types = {}
    for profile_type, profiles in dict_of_profile_types.items():
        is_group = profile_type in ("groups", "group")
        for profile in profiles:
            if "id" not in profile:
                raise ProcessorException("Profile missing id field; VK data format incorrect/changed")

            lookup_id = -abs(profile.get("id")) if is_group else profile.get("id")
            if lookup_id in author_types:
                raise ProcessorException("Profile id duplicated across profile types; unable to combine profiles")

            profile.update({"4CAT_author_profile_type": profile_type})
            author_types[lookup_id] = profile
    return author_types
Combine various VK profile and group author information for easy lookup. Add 4CAT_author_profile_type field to differentiate source of data later.
@staticmethod
def validate_query(query, request, config):
    """
    Validate input for a dataset query on the VK data source.

    Will raise a QueryParametersException if invalid parameters are
    encountered. Only the known parameters are carried over to the result.

    :param dict query:  Query parameters, from client-side.
    :param request:  Flask request
    :param ConfigManager|None config:  Configuration reader (context-aware)
    :return dict:  Safe query parameters
    :raises QueryParametersException:  If no query text was given, or if the
      date range ends before it starts
    """
    # Please provide something...
    if not query.get("query", None):
        raise QueryParametersException("Please provide a query.")

    # the dates need to make sense as a range to search within
    # but, on VK, you can also specify before *or* after only
    # a missing date range is treated as an open one
    after, before = query.get("daterange") or (None, None)
    if before and after and before < after:
        raise QueryParametersException("Date range must start before it ends")

    # TODO: test username and password?

    # if we made it this far, the query can be executed
    return {
        "query": query.get("query"),
        "query_type": query.get("query_type"),
        "amount": query.get("amount"),
        "include_comments": query.get("include_comments"),
        "min_date": after,
        "max_date": before,
        "username": query.get("username"),
        "password": query.get("password"),
    }
Validate input for a dataset query on the VK data source.
Will raise a QueryParametersException if invalid parameters are encountered. Parameters are additionally sanitised.
Parameters
- dict query: Query parameters, from client-side.
- request: Flask request
- ConfigManager|None config: Configuration reader (context-aware)
Returns
Safe query parameters
@staticmethod
def map_item(item):
    """
    Map a nested VK object to a flat dictionary

    :param item:  VK object as originally returned by the VK API, with the
      `4cat_item_type` and `author_profile` keys added during collection
    :return dict:  Dictionary in the format expected by 4CAT
    """
    vk_item_time = datetime.datetime.fromtimestamp(item.get('date'))

    # Process attachments
    photos = []
    videos = []
    audio = []
    links = []
    docs = []
    for attachment in item.get("attachments", []):
        attachment_type = attachment.get("type")
        # the attachment's data is stored under a key named after its type
        attachment = attachment.get(attachment_type)
        if attachment_type == "photo":
            if attachment.get("sizes"):
                # use the URL of the widest available version
                photos.append(max(attachment.get("sizes"), key=lambda d: d['width']).get('url'))
            else:
                photos.append(str(attachment))
        elif attachment_type == "video":
            # TODO: can I get the actual URL? Does not seem like it...
            videos.append(f"https://vk.com/video{attachment.get('owner_id')}_{attachment.get('id')}")
        elif attachment_type == "audio":
            # TODO: Seem unable to create the URL with provided information...
            audio.append(f"{attachment.get('artist')} - {attachment.get('title')}")
        elif attachment_type == "link":
            links.append(attachment.get('url', str(attachment)))
        elif attachment_type == "doc":
            docs.append(attachment.get('url', str(attachment)))

    # Use 4cat_item_type to populate different fields
    thread_id = ""
    in_reply_to_user = ""
    in_reply_to_comment_id = ""
    if item.get("4cat_item_type") == "post":
        thread_id = item.get("id")
    elif item.get("4cat_item_type") == "comment":
        thread_id = item.get("post_id")
        in_reply_to_user = item.get("reply_to_user")
        in_reply_to_comment_id = item.get("reply_to_comment")

    author_profile = item.get("author_profile", {})

    # The profile type is the key of the list the profile was found in, i.e.
    # "profiles" or "groups"; the singular forms are accepted as well
    profile_type = author_profile.get("4CAT_author_profile_type")
    if profile_type in ("profile", "profiles"):
        profile_source = "user"
    elif profile_type in ("group", "groups"):
        profile_source = "community"
    else:
        profile_source = "N/A"
    # Use source of author profile if "type" not present (e.g., in users profiles do not seem to have type)
    author_type = author_profile.get("type", profile_source)

    # Groups have a "name"; users have a first and last name instead
    full_name = " ".join([author_profile.get("first_name", ""), author_profile.get("last_name", "")]).strip()

    return MappedItem({
        "id": item.get("id"),
        "thread_id": thread_id,
        "timestamp": vk_item_time.strftime("%Y-%m-%d %H:%M:%S"),
        "unix_timestamp": int(vk_item_time.timestamp()),
        "link": f"https://vk.com/wall{item.get('owner_id')}_{item.get('id')}",
        "item_type": item.get("4cat_item_type"),
        "body": item.get("text"),
        "author_id": item.get("from_id"),
        "author_type": author_type,
        "author_screen_name": author_profile.get("screen_name"),
        "author_name": author_profile.get("name", full_name),
        "author_sex": "F" if author_profile.get("sex") == 1 else "M" if author_profile.get("sex") == 2 else "Not Specified" if author_profile.get("sex") == 0 else author_profile.get("sex", "N/A"),
        "author_city": author_profile.get("city", {}).get("title", ""),
        "author_country": author_profile.get("country", {}).get("title", ""),
        "author_photo": author_profile.get("photo_200",
                                           author_profile.get("photo_100", author_profile.get("photo_50", ""))),
        "author_is_admin": True if author_profile.get("is_admin") == 1 else False if author_profile.get("is_admin") == 0 else author_profile.get("is_admin", "N/A"),
        "author_is_advertiser": True if author_profile.get("is_advertiser") == 1 else False if author_profile.get(
            "is_advertiser") == 0 else author_profile.get("is_advertiser", "N/A"),
        # the profile field requested from VK is "deactivated"; the old
        # "is_deactivated" key is still read as a fallback
        "author_deactivated": author_profile.get("deactivated", author_profile.get("is_deactivated", False)),
        "author_privacy_is_closed": 'closed' if author_profile.get("is_closed") == 1 else 'open' if author_profile.get("is_closed") == 0 else 'private' if author_profile.get("is_closed") == 2 else author_profile.get("is_closed", "N/A"),
        "author_followers": author_profile.get("followers_count", author_profile.get("members_count", "N/A")),
        "in_reply_to_user": in_reply_to_user,
        "in_reply_to_comment_id": in_reply_to_comment_id,
        "source": item.get("post_source", {}).get("type"),
        "views": item.get("views", {}).get("count"),
        "likes": item.get("likes", {}).get("count"),
        "post_comments": item.get("comments", {}).get("count"),
        "edited": datetime.datetime.fromtimestamp(item.get("edited")).strftime("%Y-%m-%d %H:%M:%S") if item.get("edited", False) else False,
        "photos": ", ".join(photos),
        "videos": ", ".join(videos),
        "audio": ", ".join(audio),
        "links": ", ".join(links),
        "docs": ", ".join(docs),
        "subject": "",
    })
Map a nested VK object to a flat dictionary
Parameters
- item: VK object as originally returned by the VK API
Returns
Dictionary in the format expected by 4CAT
Inherited Members
- backend.lib.worker.BasicWorker
- BasicWorker
- INTERRUPT_NONE
- INTERRUPT_RETRY
- INTERRUPT_CANCEL
- queue
- log
- manager
- interrupted
- modules
- init_time
- name
- run
- clean_up
- request_interrupt
- is_4cat_class
- backend.lib.search.Search
- max_workers
- prefix
- return_cols
- import_error_count
- import_warning_count
- process
- search
- import_from_file
- items_to_csv
- items_to_ndjson
- items_to_archive
- backend.lib.processor.BasicProcessor
- db
- job
- dataset
- owner
- source_dataset
- source_file
- description
- category
- config
- is_running_in_preset
- filepath
- work
- after_process
- remove_files
- abort
- iterate_proxied_requests
- push_proxied_request
- flush_proxied_requests
- iterate_archive_contents
- unpack_archive_contents
- extract_archived_file_by_name
- write_csv_items_and_finish
- write_archive_and_finish
- create_standalone
- save_annotations
- map_item_method_available
- get_mapped_item
- is_filter
- get_status
- is_top_dataset
- is_from_collector
- get_extension
- is_rankable
- exclude_followup_processors
- is_4cat_processor