datasources.douban.search_douban
Search Douban groups
1""" 2Search Douban groups 3""" 4import requests 5import datetime 6import time 7import re 8 9from bs4 import BeautifulSoup 10 11from backend.lib.search import Search 12from common.lib.helpers import convert_to_int, strip_tags, UserInput 13from common.lib.exceptions import QueryParametersException, ProcessorInterruptedException 14 15 16class SearchDouban(Search): 17 """ 18 Search Douban groups 19 20 Defines methods that are used to query Douban data from the site directly 21 """ 22 type = "douban-search" # job ID 23 category = "Search" # category 24 title = "Douban Search" # title displayed in UI 25 description = "Scrapes group posts from Douban for a given set of groups" # description displayed in UI 26 extension = "csv" # extension of result file, used internally and in UI 27 is_local = False # Whether this datasource is locally scraped 28 is_static = False # Whether this datasource is still updated 29 30 # not available as a processor for existing datasets 31 accepts = [None] 32 33 max_workers = 1 34 35 options = { 36 "intro": { 37 "type": UserInput.OPTION_INFO, 38 "help": "You can enter the groups to scrape as group IDs (e.g. `687802`) or URLs (e.g. " 39 "`https://www.douban.com/group/687802/`. Separate multiple groups with commas or new lines. If " 40 "you enter more than 25 groups, only the first 25 will be scraped." 41 }, 42 "groups": { 43 "type": UserInput.OPTION_TEXT_LARGE, 44 "help": "Groups", 45 "tooltip": "Enter group IDs or URLs, separate with commas or new lines" 46 }, 47 "divider": { 48 "type": UserInput.OPTION_DIVIDER 49 }, 50 "amount": { 51 "type": UserInput.OPTION_TEXT, 52 "help": "Threads per group", 53 "min": 1, 54 "max": 200, 55 "default": 10 56 }, 57 "strip": { 58 "type": UserInput.OPTION_TOGGLE, 59 "help": "Strip HTML?", 60 "default": True 61 }, 62 "divider-2": { 63 "type": UserInput.OPTION_DIVIDER 64 }, 65 "daterange-info": { 66 "type": UserInput.OPTION_INFO, 67 "help": "Note that Douban severely limits the retrieval of older content. Therefore this data source " 68 "can only scrape the most recent topics in a given group. You can optionally limit the scraped " 69 "topics to a given date range, but note that typically only the 500 or so most recent topics in a " 70 "group will be available for scraping." 71 }, 72 "daterange": { 73 "type": UserInput.OPTION_DATERANGE, 74 "help": "Last post between" 75 } 76 } 77 78 def get_items(self, query): 79 """ 80 Get Douban posts 81 82 In the case of Douban, there is no need for multiple pathways, so we 83 can route it all to the one post query method. Will scrape posts from the 84 most recent topics for a given list of groups. Douban prevents scraping 85 old content, so this is mostly useful to get a sense of what a given 86 group is talking about at the moment. 
87 88 :param query: Filtered query parameters 89 :return: 90 """ 91 groups = query["groups"].split(",") 92 max_topics = min(convert_to_int(query["amount"], 100), 500) 93 start = query["min_date"] 94 end = query["max_date"] 95 strip = bool(query["strip"]) 96 topics_processed = 0 97 posts_processed = 0 98 99 headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0"} 100 101 for group in groups: 102 # get URL for group index 103 group = str(group) 104 group_url = "https://www.douban.com/group/%s/discussion?start=" % group 105 106 offset = 0 107 while True: 108 # get list of topics in group, for the given offset 109 fetch_url = group_url + str(offset) 110 request = self.get_douban_url(fetch_url, headers=headers) 111 112 # this would usually mean the group doesn't exist, or we hit some rate limit 113 if request.status_code != 200: 114 self.dataset.update_status( 115 "Got response code %i for group %s. Continuing with next group..." % (request.status_code, group)) 116 break 117 118 self.dataset.update_status("Scraping group %s..." % group) 119 120 # parse the HTML and get links to individual topics, as well as group name 121 overview_page = BeautifulSoup(request.text, 'html.parser') 122 group_name = overview_page.select_one(".group-item .title a").text 123 124 for topic in overview_page.select("table.olt tr:not(.th)"): 125 if self.interrupted: 126 raise ProcessorInterruptedException("Interrupted while scraping Douban topics") 127 128 if topics_processed >= max_topics: 129 break 130 131 # get topic URL, and whether it is an 'elite' topic 132 topic_url = topic.find("a").get("href") 133 topic_is_elite = "yes" if bool(topic.select_one(".elite_topic_lable")) else "no" 134 topic_id = topic_url.split("/topic/").pop().split("/")[0] 135 136 # date can be in either of two formats, with or without time 137 try: 138 topic_updated = int( 139 datetime.datetime.strptime(topic.select_one(".time").text, "%m-%d %H:%M").timestamp()) 140 except ValueError: 141 topic_updated = int( 142 datetime.datetime.strptime(topic.select_one(".time").text, "%Y-%m-%d").timestamp()) 143 144 # if a date range is given, ignore topics outside of it 145 if start and topic_updated < start: 146 continue 147 148 if end and topic_updated > end: 149 break 150 151 self.dataset.update_status("%i posts scraped. 
Scraping topics %i-%i from group %s" % ( 152 posts_processed, offset, min(max_topics, offset + 50), group_name)) 153 154 # request topic page - fortunately all comments are on a single page 155 topic_request = self.get_douban_url(topic_url, headers=headers) 156 time.sleep(5) # don't hit rate limits 157 topic_page = BeautifulSoup(topic_request.text, 'html.parser') 158 topic = topic_page.select_one("#topic-content") 159 160 topics_processed += 1 161 162 # include original post as the first item 163 try: 164 first_post = { 165 "id": topic_id, 166 "group_id": group, 167 "thread_id": topic_id, 168 "group_name": group_name, 169 "subject": topic_page.select_one("h1").text.strip(), 170 "body": topic_page.select_one(".topic-richtext").decode_contents(formatter="html").strip(), 171 "author": topic.select_one(".user-face img").get("alt"), 172 "author_id": topic.select_one(".user-face a").get("href").split("/people/").pop().split("/")[0], 173 "author_avatar": topic.select_one(".user-face img").get("src").replace("/u", "/ul"), 174 "timestamp": int(datetime.datetime.strptime(topic.select_one(".create-time").text, 175 "%Y-%m-%d %H:%M:%S").timestamp()), 176 "likes": 0, 177 "is_highlighted": "no", 178 "is_reply": "no", 179 "is_topic_elite": topic_is_elite, 180 "image_urls": ",".join([img.get("src") for img in topic.select(".topic-richtext img")]) 181 } 182 except (AttributeError, ValueError): 183 self.dataset.log("Unexpected data format when parsing topic %s/%s, skipping" % (group_name, topic_id)) 184 continue 185 186 if strip: 187 first_post["body"] = strip_tags(first_post["body"]) 188 189 posts_processed += 1 190 yield first_post 191 192 # now loop through all comments on the page 193 for comment in topic_page.select("ul#comments > li"): 194 comment_data = { 195 "id": comment.get("data-cid"), 196 "group_id": group, 197 "thread_id": topic_id, 198 "group_name": group_name, 199 "subject": "", 200 "body": comment.select_one(".reply-content").decode_contents(formatter="html").strip(), 201 "author": comment.select_one(".user-face img").get("alt"), 202 "author_id": 203 comment.select_one(".user-face a").get("href").split("/people/").pop().split("/")[0], 204 "author_avatar": comment.select_one(".user-face img").get("src").replace("/u", "/ul"), 205 "timestamp": int(datetime.datetime.strptime(comment.select_one(".pubtime").text.strip()[:19], 206 "%Y-%m-%d %H:%M:%S").timestamp()), 207 "likes": convert_to_int( 208 re.sub(r"[^0-9]", "", comment.select_one(".comment-vote.lnk-fav").text), 0) if comment.select_one(".comment-vote.lnk-fav") else "Unknown", 209 "is_highlighted": "yes" if comment.get("data-cid") in [hl.get("data-cid") for hl in 210 comment.select( 211 "ul#popular-comments li")] else "no", 212 "is_reply": "yes" if comment.select_one(".reply-quote-content") else "no", 213 "is_topic_elite": topic_is_elite, 214 "image_urls": ",".join([img.get("src") for img in comment.select(".reply-content img")]) 215 } 216 217 if strip: 218 comment_data["body"] = strip_tags(comment_data["body"]) 219 220 posts_processed += 1 221 yield comment_data 222 223 if offset < max_topics - 50: 224 offset += 50 225 else: 226 break 227 228 def get_douban_url(self, url, **kwargs): 229 """ 230 Get Douban page with requests 231 232 Abstracted away like this so we can easily implement measures to 233 circumvent rate limiting later. 
234 235 :param str url: URL to request 236 :return: Response object 237 """ 238 if url[0:2] == "//": 239 url = "https:" + url 240 elif url[0] == "/": 241 url = "https://douban.com" + url 242 243 return requests.get(url, **kwargs) 244 245 def validate_query(query, request, user): 246 """ 247 Validate input for a dataset query on the Douban data source. 248 249 :param dict query: Query parameters, from client-side. 250 :param request: Flask request 251 :param User user: User object of user who has submitted the query 252 :return dict: Safe query parameters 253 """ 254 filtered_query = {} 255 256 # the dates need to make sense as a range to search within 257 after, before = query.get("daterange") 258 if before and after and before < after: 259 raise QueryParametersException("Date range must start before it ends") 260 261 filtered_query["min_date"], filtered_query["max_date"] = (after, before) 262 263 # normalize groups to just their IDs, even if a URL was provided, and 264 # limit to 25 265 groups = [group.split("/group/").pop().split("/")[0].strip() for group in 266 query["groups"].replace("\n", ",").split(",")] 267 groups = [group for group in groups if group][:25] 268 if not any(groups): 269 raise QueryParametersException("No valid groups were provided.") 270 271 filtered_query["groups"] = ",".join(groups) 272 273 # max amount of topics is 200 because after that Douban starts throwing 429s 274 filtered_query["amount"] = max(min(convert_to_int(query["amount"], 10), 200), 1) 275 276 # strip HTML from posts? 277 filtered_query["strip"] = bool(query.get("strip", False)) 278 279 return filtered_query
class
SearchDouban(Search):
Search Douban groups
Defines methods that are used to query Douban data from the site directly
options =
{'intro': {'type': 'info', 'help': 'You can enter the groups to scrape as group IDs (e.g. `687802`) or URLs (e.g. `https://www.douban.com/group/687802/`. Separate multiple groups with commas or new lines. If you enter more than 25 groups, only the first 25 will be scraped.'}, 'groups': {'type': 'textarea', 'help': 'Groups', 'tooltip': 'Enter group IDs or URLs, separate with commas or new lines'}, 'divider': {'type': 'divider'}, 'amount': {'type': 'string', 'help': 'Threads per group', 'min': 1, 'max': 200, 'default': 10}, 'strip': {'type': 'toggle', 'help': 'Strip HTML?', 'default': True}, 'divider-2': {'type': 'divider'}, 'daterange-info': {'type': 'info', 'help': 'Note that Douban severely limits the retrieval of older content. Therefore this data source can only scrape the most recent topics in a given group. You can optionally limit the scraped topics to a given date range, but note that typically only the 500 or so most recent topics in a group will be available for scraping.'}, 'daterange': {'type': 'daterange', 'help': 'Last post between'}}
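To make the mapping from these options to an actual query concrete, the sketch below shows a hypothetical set of raw parameters of the kind the 4CAT web form would submit for this datasource before validation; all values are illustrative and not taken from a real query.

# Hypothetical raw parameters submitted for this datasource; values are
# illustrative only. "groups" may mix group IDs and URLs, and "daterange"
# is an optional (after, before) pair of Unix timestamps.
raw_parameters = {
    "groups": "687802, https://www.douban.com/group/687802/",
    "amount": 50,        # threads per group; validate_query clamps this to 1-200
    "strip": True,       # strip HTML from post bodies
    "daterange": (1609459200, 1612137600)
}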
def
get_items(self, query):
Get Douban posts
In the case of Douban, there is no need for multiple pathways, so we can route it all to the one post query method. Will scrape posts from the most recent topics for a given list of groups. Douban prevents scraping old content, so this is mostly useful to get a sense of what a given group is talking about at the moment.
Parameters
- query: Filtered query parameters
Returns
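get_items is a generator: it yields one flat dictionary per scraped item, first the opening post of a topic and then each of its comments. The example below shows the shape of such an item; the field names come from the code above, while all values are invented for illustration.

# Hypothetical example of a single item yielded by get_items; values invented.
example_post = {
    "id": "123456789",             # topic ID (comments carry their own comment ID here)
    "group_id": "687802",
    "thread_id": "123456789",      # always the topic ID, for posts and comments alike
    "group_name": "Example group",
    "subject": "Example topic title",  # empty string for comments
    "body": "Post text, HTML-stripped if the 'strip' option is enabled",
    "author": "Example user",
    "author_id": "example-user",
    "author_avatar": "https://example.com/avatar.jpg",
    "timestamp": 1609459200,       # Unix timestamp parsed from the post's creation time
    "likes": 0,                    # always 0 for opening posts; vote count (or "Unknown") for comments
    "is_highlighted": "no",        # "yes" for comments that appear among the popular comments
    "is_reply": "no",              # "yes" for comments that quote another comment
    "is_topic_elite": "no",        # whether the topic carries Douban's 'elite' label
    "image_urls": ""               # comma-separated URLs of images found in the body
}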
def
get_douban_url(self, url, **kwargs):
Get Douban page with requests
Abstracted away like this so we can easily implement measures to circumvent rate limiting later.
Parameters
- str url: URL to request
Returns
Response object
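The only processing this helper adds on top of requests.get is making protocol-relative and root-relative URLs absolute. A minimal standalone sketch of that normalisation, with the function name made up for illustration:

def normalise_douban_url(url):
    # Mirrors the normalisation inside get_douban_url: protocol-relative URLs
    # get an https scheme, root-relative URLs are resolved against douban.com,
    # and fully qualified URLs are left untouched.
    if url[0:2] == "//":
        return "https:" + url
    elif url[0] == "/":
        return "https://douban.com" + url
    return url

assert normalise_douban_url("//img.example.com/avatar.jpg") == "https://img.example.com/avatar.jpg"
assert normalise_douban_url("/group/687802/") == "https://douban.com/group/687802/"
assert normalise_douban_url("https://www.douban.com/group/687802/") == "https://www.douban.com/group/687802/"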
def
validate_query(query, request, user):
Validate input for a dataset query on the Douban data source.
Parameters
- dict query: Query parameters, from client-side.
- request: Flask request
- User user: User object of user who has submitted the query
Returns
Safe query parameters
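In practice 4CAT calls this during query submission with a real Flask request and user object; the sketch below simply illustrates, with invented input values, how groups, amount and the date range come out the other end according to the code above.

# Illustrative call to validate_query; input values are invented.
raw = {
    "groups": "https://www.douban.com/group/687802/\n687802, 732764",
    "amount": "999",                       # will be clamped to the 1-200 range
    "strip": True,
    "daterange": (1609459200, 1612137600)  # (after, before) Unix timestamps
}
safe = SearchDouban.validate_query(raw, request=None, user=None)
# safe == {
#     "min_date": 1609459200,
#     "max_date": 1612137600,
#     "groups": "687802,687802,732764",    # normalised to IDs and capped at 25; duplicates are kept
#     "amount": 200,
#     "strip": True
# }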
Inherited Members
- backend.lib.worker.BasicWorker
- BasicWorker
- INTERRUPT_NONE
- INTERRUPT_RETRY
- INTERRUPT_CANCEL
- queue
- log
- manager
- interrupted
- modules
- init_time
- name
- run
- clean_up
- request_interrupt
- is_4cat_class
- backend.lib.search.Search
- prefix
- return_cols
- import_error_count
- import_warning_count
- process
- search
- import_from_file
- items_to_csv
- items_to_ndjson
- items_to_archive
- backend.lib.processor.BasicProcessor
- db
- job
- dataset
- owner
- source_dataset
- source_file
- config
- is_running_in_preset
- filepath
- work
- after_process
- remove_files
- abort
- add_field_to_parent
- iterate_archive_contents
- unpack_archive_contents
- extract_archived_file_by_name
- write_csv_items_and_finish
- write_archive_and_finish
- create_standalone
- map_item_method_available
- get_mapped_item
- is_filter
- get_options
- get_status
- is_top_dataset
- is_from_collector
- get_extension
- is_rankable
- exclude_followup_processors
- is_4cat_processor