"""
Search Douban groups
"""
import requests
import datetime
import time
import re

from bs4 import BeautifulSoup

from backend.lib.search import Search
from common.lib.helpers import convert_to_int, strip_tags, UserInput
from common.lib.exceptions import QueryParametersException, ProcessorInterruptedException


class SearchDouban(Search):
    """
    Search Douban groups

    Defines methods that are used to query Douban data from the site directly
    """
    type = "douban-search"  # job ID
    category = "Search"  # category
    title = "Douban Search"  # title displayed in UI
    description = "Scrapes group posts from Douban for a given set of groups"  # description displayed in UI
    extension = "csv"  # extension of result file, used internally and in UI
    is_local = False  # Whether this datasource is locally scraped
    is_static = False  # Whether this datasource is still updated

    # not available as a processor for existing datasets
    accepts = [None]

    max_workers = 1

    @classmethod
    def get_options(cls, parent_dataset=None, config=None) -> dict:
        """
        Get processor options

        :param parent_dataset DataSet: An object representing the dataset that
        the processor would be or was run on. Can be used, in conjunction with
        config, to show some options only to privileged users.
        :param config ConfigManager|None config: Configuration reader (context-aware)
        :return dict: Options for this processor
        """
        return {
            "intro": {
                "type": UserInput.OPTION_INFO,
                "help": "You can enter the groups to scrape as group IDs (e.g. `687802`) or URLs (e.g. "
                        "`https://www.douban.com/group/687802/`). Separate multiple groups with commas or new lines. "
                        "If you enter more than 25 groups, only the first 25 will be scraped."
            },
            "groups": {
                "type": UserInput.OPTION_TEXT_LARGE,
                "help": "Groups",
                "tooltip": "Enter group IDs or URLs, separate with commas or new lines"
            },
            "divider": {
                "type": UserInput.OPTION_DIVIDER
            },
            "amount": {
                "type": UserInput.OPTION_TEXT,
                "help": "Threads per group",
                "min": 1,
                "max": 200,
                "default": 10
            },
            "strip": {
                "type": UserInput.OPTION_TOGGLE,
                "help": "Strip HTML?",
                "default": True
            },
            "divider-2": {
                "type": UserInput.OPTION_DIVIDER
            },
            "daterange-info": {
                "type": UserInput.OPTION_INFO,
                "help": "Note that Douban severely limits the retrieval of older content. Therefore this data source "
                        "can only scrape the most recent topics in a given group. You can optionally limit the scraped "
                        "topics to a given date range, but note that typically only the 500 or so most recent topics in a "
                        "group will be available for scraping."
            },
            "daterange": {
                "type": UserInput.OPTION_DATERANGE,
                "help": "Last post between"
            }
        }

    def get_items(self, query):
        """
        Get Douban posts

        In the case of Douban, there is no need for multiple pathways, so we
        can route it all to the one post query method. Will scrape posts from the
        most recent topics for a given list of groups. Douban prevents scraping
        old content, so this is mostly useful to get a sense of what a given
        group is talking about at the moment.

        :param query: Filtered query parameters
        :return: Yields posts (topic openers and their comments) as dictionaries
        """
        groups = query["groups"].split(",")
        # validate_query caps the amount at 200; cap again defensively here
        max_topics = min(convert_to_int(query["amount"], 10), 500)
        start = query["min_date"]
        end = query["max_date"]
        strip = bool(query["strip"])
        posts_processed = 0

        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0"}

        for group in groups:
            # get URL for group index
            group = str(group)
            group_url = "https://www.douban.com/group/%s/discussion?start=" % group

            # the "Threads per group" limit applies per group, so reset the
            # counter for each group rather than sharing it across all groups
            topics_processed = 0
            offset = 0
            while True:
                # get list of topics in group, for the given offset
                fetch_url = group_url + str(offset)
                request = self.get_douban_url(fetch_url, headers=headers)

                # this would usually mean the group doesn't exist, or we hit some rate limit
                if request.status_code != 200:
                    self.dataset.update_status(
                        "Got response code %i for group %s. Continuing with next group..." % (request.status_code, group))
                    break

                self.dataset.update_status("Scraping group %s..." % group)

                # parse the HTML and get links to individual topics, as well as group name
                overview_page = BeautifulSoup(request.text, 'html.parser')
                group_name = overview_page.select_one(".group-item .title a").text

                for topic in overview_page.select("table.olt tr:not(.th)"):
                    if self.interrupted:
                        raise ProcessorInterruptedException("Interrupted while scraping Douban topics")

                    if topics_processed >= max_topics:
                        break

                    # get topic URL, and whether it is an 'elite' topic
                    topic_url = topic.find("a").get("href")
                    # "lable" (sic) is the class name Douban actually uses
                    topic_is_elite = "yes" if bool(topic.select_one(".elite_topic_lable")) else "no"
                    topic_id = topic_url.split("/topic/").pop().split("/")[0]

                    # date can be in either of two formats, with or without a
                    # time; the short format omits the year, which strptime
                    # defaults to 1900, so substitute the current year (Douban
                    # only uses the short format for recent topics; this may be
                    # off by one year right around New Year's)
                    time_text = topic.select_one(".time").text
                    try:
                        topic_updated = int(datetime.datetime.strptime(time_text, "%m-%d %H:%M").replace(
                            year=datetime.datetime.now().year).timestamp())
                    except ValueError:
                        topic_updated = int(datetime.datetime.strptime(time_text, "%Y-%m-%d").timestamp())

                    # if a date range is given, ignore topics outside of it; we
                    # cannot rely on the listing's sort order here, so skip
                    # out-of-range topics instead of aborting the whole listing
                    if start and topic_updated < start:
                        continue

                    if end and topic_updated > end:
                        continue

                    self.dataset.update_status("%i posts scraped. Scraping topics %i-%i from group %s" % (
                        posts_processed, offset, min(max_topics, offset + 50), group_name))

                    # request topic page - fortunately all comments are on a single page
                    topic_request = self.get_douban_url(topic_url, headers=headers)
                    time.sleep(5)  # don't hit rate limits
                    topic_page = BeautifulSoup(topic_request.text, 'html.parser')
                    topic = topic_page.select_one("#topic-content")

                    topics_processed += 1

                    # include original post as the first item
                    try:
                        first_post = {
                            "id": topic_id,
                            "group_id": group,
                            "thread_id": topic_id,
                            "group_name": group_name,
                            "subject": topic_page.select_one("h1").text.strip(),
                            "body": topic_page.select_one(".topic-richtext").decode_contents(formatter="html").strip(),
                            "author": topic.select_one(".user-face img").get("alt"),
                            "author_id": topic.select_one(".user-face a").get("href").split("/people/").pop().split("/")[0],
                            # "/u" -> "/ul" requests the larger avatar variant
                            "author_avatar": topic.select_one(".user-face img").get("src").replace("/u", "/ul"),
                            "timestamp": int(datetime.datetime.strptime(topic.select_one(".create-time").text,
                                                                        "%Y-%m-%d %H:%M:%S").timestamp()),
                            "likes": 0,
                            "is_highlighted": "no",
                            "is_reply": "no",
                            "is_topic_elite": topic_is_elite,
                            "image_urls": ",".join([img.get("src") for img in topic.select(".topic-richtext img")])
                        }
                    except (AttributeError, ValueError):
                        self.dataset.log("Unexpected data format when parsing topic %s/%s, skipping" % (group_name, topic_id))
                        continue

                    if strip:
                        first_post["body"] = strip_tags(first_post["body"])

                    posts_processed += 1
                    yield first_post

                    # now loop through all comments on the page
                    for comment in topic_page.select("ul#comments > li"):
                        comment_data = {
                            "id": comment.get("data-cid"),
                            "group_id": group,
                            "thread_id": topic_id,
                            "group_name": group_name,
                            "subject": "",
                            "body": comment.select_one(".reply-content").decode_contents(formatter="html").strip(),
                            "author": comment.select_one(".user-face img").get("alt"),
                            "author_id":
                                comment.select_one(".user-face a").get("href").split("/people/").pop().split("/")[0],
                            "author_avatar": comment.select_one(".user-face img").get("src").replace("/u", "/ul"),
                            # pubtime can carry a trailing location, so only
                            # parse the first 19 characters (the datetime part)
                            "timestamp": int(datetime.datetime.strptime(comment.select_one(".pubtime").text.strip()[:19],
                                                                        "%Y-%m-%d %H:%M:%S").timestamp()),
                            # note: "Unknown" (a string) when no vote element exists,
                            # an int otherwise - kept for output compatibility
                            "likes": convert_to_int(
                                re.sub(r"[^0-9]", "", comment.select_one(".comment-vote.lnk-fav").text), 0) if comment.select_one(".comment-vote.lnk-fav") else "Unknown",
                            # NOTE(review): this searches for ul#popular-comments
                            # *inside* the comment node; if that list is a sibling
                            # section of ul#comments this never matches - verify
                            # against the live page markup
                            "is_highlighted": "yes" if comment.get("data-cid") in [hl.get("data-cid") for hl in
                                                                                   comment.select(
                                                                                       "ul#popular-comments li")] else "no",
                            "is_reply": "yes" if comment.select_one(".reply-quote-content") else "no",
                            "is_topic_elite": topic_is_elite,
                            "image_urls": ",".join([img.get("src") for img in comment.select(".reply-content img")])
                        }

                        if strip:
                            comment_data["body"] = strip_tags(comment_data["body"])

                        posts_processed += 1
                        yield comment_data

                # each listing page holds 50 topics; stop paginating once the
                # next page could not contain any topic under the limit
                if offset < max_topics - 50:
                    offset += 50
                else:
                    break

    def get_douban_url(self, url, **kwargs):
        """
        Get Douban page with requests

        Abstracted away like this so we can easily implement measures to
        circumvent rate limiting later.

        :param str url: URL to request; may be absolute, protocol-relative
        ("//...") or host-relative ("/...")
        :param kwargs: Passed through to `requests.get()`
        :return: Response object
        """
        # normalise protocol- and host-relative URLs; startswith() is also
        # safe for empty strings, unlike indexing
        if url.startswith("//"):
            url = "https:" + url
        elif url.startswith("/"):
            # douban.com redirects to www.douban.com, so this resolves fine
            url = "https://douban.com" + url

        return requests.get(url, **kwargs)

    @staticmethod
    def validate_query(query, request, config):
        """
        Validate input for a dataset query on the Douban data source.

        :param dict query: Query parameters, from client-side.
        :param request: Flask request
        :param ConfigManager|None config: Configuration reader (context-aware)
        :return dict: Safe query parameters
        :raises QueryParametersException: If the date range is inverted or no
        valid group could be parsed from the input
        """
        filtered_query = {}

        # the dates need to make sense as a range to search within
        after, before = query.get("daterange", (None, None))
        if before and after and before < after:
            raise QueryParametersException("Date range must start before it ends")

        filtered_query["min_date"], filtered_query["max_date"] = (after, before)

        # normalize groups to just their IDs, even if a URL was provided, and
        # limit to 25
        groups = [group.split("/group/").pop().split("/")[0].strip() for group in
                  query["groups"].replace("\n", ",").split(",")]
        groups = [group for group in groups if group][:25]
        if not groups:
            raise QueryParametersException("No valid groups were provided.")

        filtered_query["groups"] = ",".join(groups)

        # max amount of topics is 200 because after that Douban starts throwing 429s
        filtered_query["amount"] = max(min(convert_to_int(query["amount"], 10), 200), 1)

        # strip HTML from posts?
        filtered_query["strip"] = bool(query.get("strip", False))

        return filtered_query
Search Douban groups
Defines methods that are used to query Douban data from the site directly
@classmethod
def
get_options(cls, parent_dataset=None, config=None) -> dict:
@classmethod
def get_options(cls, parent_dataset=None, config=None) -> dict:
    """
    Get processor options

    :param parent_dataset DataSet: An object representing the dataset that
    the processor would be or was run on. Can be used, in conjunction with
    config, to show some options only to privileged users.
    :param config ConfigManager|None config: Configuration reader (context-aware)
    :return dict: Options for this processor
    """
    # key order matters: it is the order in which options render in the UI
    return {
        "intro": {
            "type": UserInput.OPTION_INFO,
            # fixed the previously unbalanced parenthesis after the example URL
            "help": "You can enter the groups to scrape as group IDs (e.g. `687802`) or URLs (e.g. "
                    "`https://www.douban.com/group/687802/`). Separate multiple groups with commas or new lines. "
                    "If you enter more than 25 groups, only the first 25 will be scraped."
        },
        "groups": {
            "type": UserInput.OPTION_TEXT_LARGE,
            "help": "Groups",
            "tooltip": "Enter group IDs or URLs, separate with commas or new lines"
        },
        "divider": {
            "type": UserInput.OPTION_DIVIDER
        },
        "amount": {
            "type": UserInput.OPTION_TEXT,
            "help": "Threads per group",
            "min": 1,
            "max": 200,
            "default": 10
        },
        "strip": {
            "type": UserInput.OPTION_TOGGLE,
            "help": "Strip HTML?",
            "default": True
        },
        "divider-2": {
            "type": UserInput.OPTION_DIVIDER
        },
        "daterange-info": {
            "type": UserInput.OPTION_INFO,
            "help": "Note that Douban severely limits the retrieval of older content. Therefore this data source "
                    "can only scrape the most recent topics in a given group. You can optionally limit the scraped "
                    "topics to a given date range, but note that typically only the 500 or so most recent topics in a "
                    "group will be available for scraping."
        },
        "daterange": {
            "type": UserInput.OPTION_DATERANGE,
            "help": "Last post between"
        }
    }
Get processor options
Parameters
- parent_dataset DataSet: An object representing the dataset that the processor would be or was run on. Can be used, in conjunction with config, to show some options only to privileged users.
- config ConfigManager|None config: Configuration reader (context-aware)
Returns
Options for this processor
def
get_items(self, query):
def get_items(self, query):
    """
    Get Douban posts

    In the case of Douban, there is no need for multiple pathways, so we
    can route it all to the one post query method. Will scrape posts from the
    most recent topics for a given list of groups. Douban prevents scraping
    old content, so this is mostly useful to get a sense of what a given
    group is talking about at the moment.

    :param query: Filtered query parameters
    :return: Yields posts (topic openers and their comments) as dictionaries
    """
    groups = query["groups"].split(",")
    # validate_query caps the amount at 200; cap again defensively here
    max_topics = min(convert_to_int(query["amount"], 10), 500)
    start = query["min_date"]
    end = query["max_date"]
    strip = bool(query["strip"])
    posts_processed = 0

    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0"}

    for group in groups:
        # get URL for group index
        group = str(group)
        group_url = "https://www.douban.com/group/%s/discussion?start=" % group

        # the "Threads per group" limit applies per group, so reset the
        # counter for each group rather than sharing it across all groups
        topics_processed = 0
        offset = 0
        while True:
            # get list of topics in group, for the given offset
            fetch_url = group_url + str(offset)
            request = self.get_douban_url(fetch_url, headers=headers)

            # this would usually mean the group doesn't exist, or we hit some rate limit
            if request.status_code != 200:
                self.dataset.update_status(
                    "Got response code %i for group %s. Continuing with next group..." % (request.status_code, group))
                break

            self.dataset.update_status("Scraping group %s..." % group)

            # parse the HTML and get links to individual topics, as well as group name
            overview_page = BeautifulSoup(request.text, 'html.parser')
            group_name = overview_page.select_one(".group-item .title a").text

            for topic in overview_page.select("table.olt tr:not(.th)"):
                if self.interrupted:
                    raise ProcessorInterruptedException("Interrupted while scraping Douban topics")

                if topics_processed >= max_topics:
                    break

                # get topic URL, and whether it is an 'elite' topic
                topic_url = topic.find("a").get("href")
                # "lable" (sic) is the class name Douban actually uses
                topic_is_elite = "yes" if bool(topic.select_one(".elite_topic_lable")) else "no"
                topic_id = topic_url.split("/topic/").pop().split("/")[0]

                # date can be in either of two formats, with or without a
                # time; the short format omits the year, which strptime
                # defaults to 1900, so substitute the current year (Douban
                # only uses the short format for recent topics; this may be
                # off by one year right around New Year's)
                time_text = topic.select_one(".time").text
                try:
                    topic_updated = int(datetime.datetime.strptime(time_text, "%m-%d %H:%M").replace(
                        year=datetime.datetime.now().year).timestamp())
                except ValueError:
                    topic_updated = int(datetime.datetime.strptime(time_text, "%Y-%m-%d").timestamp())

                # if a date range is given, ignore topics outside of it; we
                # cannot rely on the listing's sort order here, so skip
                # out-of-range topics instead of aborting the whole listing
                if start and topic_updated < start:
                    continue

                if end and topic_updated > end:
                    continue

                self.dataset.update_status("%i posts scraped. Scraping topics %i-%i from group %s" % (
                    posts_processed, offset, min(max_topics, offset + 50), group_name))

                # request topic page - fortunately all comments are on a single page
                topic_request = self.get_douban_url(topic_url, headers=headers)
                time.sleep(5)  # don't hit rate limits
                topic_page = BeautifulSoup(topic_request.text, 'html.parser')
                topic = topic_page.select_one("#topic-content")

                topics_processed += 1

                # include original post as the first item
                try:
                    first_post = {
                        "id": topic_id,
                        "group_id": group,
                        "thread_id": topic_id,
                        "group_name": group_name,
                        "subject": topic_page.select_one("h1").text.strip(),
                        "body": topic_page.select_one(".topic-richtext").decode_contents(formatter="html").strip(),
                        "author": topic.select_one(".user-face img").get("alt"),
                        "author_id": topic.select_one(".user-face a").get("href").split("/people/").pop().split("/")[0],
                        # "/u" -> "/ul" requests the larger avatar variant
                        "author_avatar": topic.select_one(".user-face img").get("src").replace("/u", "/ul"),
                        "timestamp": int(datetime.datetime.strptime(topic.select_one(".create-time").text,
                                                                    "%Y-%m-%d %H:%M:%S").timestamp()),
                        "likes": 0,
                        "is_highlighted": "no",
                        "is_reply": "no",
                        "is_topic_elite": topic_is_elite,
                        "image_urls": ",".join([img.get("src") for img in topic.select(".topic-richtext img")])
                    }
                except (AttributeError, ValueError):
                    self.dataset.log("Unexpected data format when parsing topic %s/%s, skipping" % (group_name, topic_id))
                    continue

                if strip:
                    first_post["body"] = strip_tags(first_post["body"])

                posts_processed += 1
                yield first_post

                # now loop through all comments on the page
                for comment in topic_page.select("ul#comments > li"):
                    comment_data = {
                        "id": comment.get("data-cid"),
                        "group_id": group,
                        "thread_id": topic_id,
                        "group_name": group_name,
                        "subject": "",
                        "body": comment.select_one(".reply-content").decode_contents(formatter="html").strip(),
                        "author": comment.select_one(".user-face img").get("alt"),
                        "author_id":
                            comment.select_one(".user-face a").get("href").split("/people/").pop().split("/")[0],
                        "author_avatar": comment.select_one(".user-face img").get("src").replace("/u", "/ul"),
                        # pubtime can carry a trailing location, so only
                        # parse the first 19 characters (the datetime part)
                        "timestamp": int(datetime.datetime.strptime(comment.select_one(".pubtime").text.strip()[:19],
                                                                    "%Y-%m-%d %H:%M:%S").timestamp()),
                        # note: "Unknown" (a string) when no vote element exists,
                        # an int otherwise - kept for output compatibility
                        "likes": convert_to_int(
                            re.sub(r"[^0-9]", "", comment.select_one(".comment-vote.lnk-fav").text), 0) if comment.select_one(".comment-vote.lnk-fav") else "Unknown",
                        # NOTE(review): this searches for ul#popular-comments
                        # *inside* the comment node; if that list is a sibling
                        # section of ul#comments this never matches - verify
                        # against the live page markup
                        "is_highlighted": "yes" if comment.get("data-cid") in [hl.get("data-cid") for hl in
                                                                               comment.select(
                                                                                   "ul#popular-comments li")] else "no",
                        "is_reply": "yes" if comment.select_one(".reply-quote-content") else "no",
                        "is_topic_elite": topic_is_elite,
                        "image_urls": ",".join([img.get("src") for img in comment.select(".reply-content img")])
                    }

                    if strip:
                        comment_data["body"] = strip_tags(comment_data["body"])

                    posts_processed += 1
                    yield comment_data

            # each listing page holds 50 topics; stop paginating once the
            # next page could not contain any topic under the limit
            if offset < max_topics - 50:
                offset += 50
            else:
                break
Get Douban posts
In the case of Douban, there is no need for multiple pathways, so we can route it all to the one post query method. Will scrape posts from the most recent topics for a given list of groups. Douban prevents scraping old content, so this is mostly useful to get a sense of what a given group is talking about at the moment.
Parameters
- query: Filtered query parameters
Returns
A generator yielding scraped posts (each topic's opening post followed by its comments) as dictionaries.
def
get_douban_url(self, url, **kwargs):
def get_douban_url(self, url, **kwargs):
    """
    Get Douban page with requests

    Abstracted away like this so we can easily implement measures to
    circumvent rate limiting later.

    :param str url: URL to request; may be absolute, protocol-relative
    ("//...") or host-relative ("/...")
    :param kwargs: Passed through to `requests.get()`
    :return: Response object
    """
    # normalise protocol- and host-relative URLs; startswith() is also
    # safe for empty strings, unlike indexing url[0]
    if url.startswith("//"):
        url = "https:" + url
    elif url.startswith("/"):
        # douban.com redirects to www.douban.com, so this resolves fine
        url = "https://douban.com" + url

    return requests.get(url, **kwargs)
Get Douban page with requests
Abstracted away like this so we can easily implement measures to circumvent rate limiting later.
Parameters
- str url: URL to request
Returns
Response object
def
validate_query(query, request, config):
@staticmethod
def validate_query(query, request, config):
    """
    Validate input for a dataset query on the Douban data source.

    Declared static because it takes no `self`; this keeps it callable on
    both the class and an instance.

    :param dict query: Query parameters, from client-side.
    :param request: Flask request
    :param ConfigManager|None config: Configuration reader (context-aware)
    :return dict: Safe query parameters
    :raises QueryParametersException: If the date range is inverted or no
    valid group could be parsed from the input
    """
    filtered_query = {}

    # the dates need to make sense as a range to search within; default to
    # an open range if the client sent no daterange at all
    after, before = query.get("daterange", (None, None))
    if before and after and before < after:
        raise QueryParametersException("Date range must start before it ends")

    filtered_query["min_date"], filtered_query["max_date"] = (after, before)

    # normalize groups to just their IDs, even if a URL was provided, and
    # limit to 25
    groups = [group.split("/group/").pop().split("/")[0].strip() for group in
              query["groups"].replace("\n", ",").split(",")]
    groups = [group for group in groups if group][:25]
    if not groups:
        raise QueryParametersException("No valid groups were provided.")

    filtered_query["groups"] = ",".join(groups)

    # max amount of topics is 200 because after that Douban starts throwing 429s
    filtered_query["amount"] = max(min(convert_to_int(query["amount"], 10), 200), 1)

    # strip HTML from posts?
    filtered_query["strip"] = bool(query.get("strip", False))

    return filtered_query
Validate input for a dataset query on the Douban data source.
Parameters
- dict query: Query parameters, from client-side.
- request: Flask request
- ConfigManager|None config: Configuration reader (context-aware)
Returns
Safe query parameters
Inherited Members
- backend.lib.worker.BasicWorker
- BasicWorker
- INTERRUPT_NONE
- INTERRUPT_RETRY
- INTERRUPT_CANCEL
- queue
- log
- manager
- interrupted
- modules
- init_time
- name
- run
- clean_up
- request_interrupt
- run_interruptable_process
- get_queue_id
- is_4cat_class
- backend.lib.search.Search
- prefix
- return_cols
- import_error_count
- import_warning_count
- process
- search
- import_from_file
- items_to_csv
- items_to_ndjson
- items_to_archive
- backend.lib.processor.BasicProcessor
- db
- job
- dataset
- owner
- source_dataset
- source_file
- config
- is_running_in_preset
- filepath
- for_cleanup
- work
- after_process
- clean_up_on_error
- abort
- iterate_proxied_requests
- push_proxied_request
- flush_proxied_requests
- unpack_archive_contents
- extract_archived_file_by_name
- write_csv_items_and_finish
- write_archive_and_finish
- create_standalone
- save_annotations
- map_item_method_available
- get_mapped_item
- is_filter
- get_status
- is_top_dataset
- is_from_collector
- get_extension
- is_rankable
- exclude_followup_processors
- is_4cat_processor