Edit on GitHub

datasources.douban.search_douban

Search Douban groups

  1"""
  2Search Douban groups
  3"""
  4import requests
  5import datetime
  6import time
  7import re
  8
  9from bs4 import BeautifulSoup
 10
 11from backend.lib.search import Search
 12from common.lib.helpers import convert_to_int, strip_tags, UserInput
 13from common.lib.exceptions import QueryParametersException, ProcessorInterruptedException
 14
 15
 16class SearchDouban(Search):
 17    """
 18    Search Douban groups
 19
 20    Defines methods that are used to query Douban data from the site directly
 21    """
 22    type = "douban-search"  # job ID
 23    category = "Search"  # category
 24    title = "Douban Search"  # title displayed in UI
 25    description = "Scrapes group posts from Douban for a given set of groups"  # description displayed in UI
 26    extension = "csv"  # extension of result file, used internally and in UI
 27    is_local = False    # Whether this datasource is locally scraped
 28    is_static = False   # Whether this datasource is still updated
 29
 30    # not available as a processor for existing datasets
 31    accepts = [None]
 32
 33    max_workers = 1
 34
 35    @classmethod
 36    def get_options(cls, parent_dataset=None, config=None) -> dict:
 37        """
 38        Get processor options
 39
 40        :param parent_dataset DataSet:  An object representing the dataset that
 41            the processor would be or was run on. Can be used, in conjunction with
 42            config, to show some options only to privileged users.
 43        :param config ConfigManager|None config:  Configuration reader (context-aware)
 44        :return dict:   Options for this processor
 45        """
 46        return {
 47            "intro": {
 48                "type": UserInput.OPTION_INFO,
 49                "help": "You can enter the groups to scrape as group IDs (e.g. `687802`) or URLs (e.g. "
 50                        "`https://www.douban.com/group/687802/`. Separate multiple groups with commas or new lines. If "
 51                        "you enter more than 25 groups, only the first 25 will be scraped."
 52            },
 53            "groups": {
 54                "type": UserInput.OPTION_TEXT_LARGE,
 55                "help": "Groups",
 56                "tooltip": "Enter group IDs or URLs, separate with commas or new lines"
 57            },
 58            "divider": {
 59                "type": UserInput.OPTION_DIVIDER
 60            },
 61            "amount": {
 62                "type": UserInput.OPTION_TEXT,
 63                "help": "Threads per group",
 64                "min": 1,
 65                "max": 200,
 66                "default": 10
 67            },
 68            "strip": {
 69                "type": UserInput.OPTION_TOGGLE,
 70                "help": "Strip HTML?",
 71                "default": True
 72            },
 73            "divider-2": {
 74                "type": UserInput.OPTION_DIVIDER
 75            },
 76            "daterange-info": {
 77                "type": UserInput.OPTION_INFO,
 78                "help": "Note that Douban severely limits the retrieval of older content. Therefore this data source "
 79                        "can only scrape the most recent topics in a given group. You can optionally limit the scraped "
 80                        "topics to a given date range, but note that typically only the 500 or so most recent topics in a "
 81                        "group will be available for scraping."
 82            },
 83            "daterange": {
 84                "type": UserInput.OPTION_DATERANGE,
 85                "help": "Last post between"
 86            }
 87        }
 88
 89    def get_items(self, query):
 90        """
 91        Get Douban posts
 92
 93        In the case of Douban, there is no need for multiple pathways, so we
 94        can route it all to the one post query method. Will scrape posts from the
 95        most recent topics for a given list of groups. Douban prevents scraping
 96        old content, so this is mostly useful to get a sense of what a given
 97        group is talking about at the moment.
 98
 99        :param query:  Filtered query parameters
100        :return:
101        """
102        groups = query["groups"].split(",")
103        max_topics = min(convert_to_int(query["amount"], 100), 500)
104        start = query["min_date"]
105        end = query["max_date"]
106        strip = bool(query["strip"])
107        topics_processed = 0
108        posts_processed = 0
109
110        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0"}
111
112        for group in groups:
113            # get URL for group index
114            group = str(group)
115            group_url = "https://www.douban.com/group/%s/discussion?start=" % group
116
117            offset = 0
118            while True:
119                # get list of topics in group, for the given offset
120                fetch_url = group_url + str(offset)
121                request = self.get_douban_url(fetch_url, headers=headers)
122
123                # this would usually mean the group doesn't exist, or we hit some rate limit
124                if request.status_code != 200:
125                    self.dataset.update_status(
126                        "Got response code %i for group %s. Continuing with next group..." % (request.status_code, group))
127                    break
128
129                self.dataset.update_status("Scraping group %s..." % group)
130
131                # parse the HTML and get links to individual topics, as well as group name
132                overview_page = BeautifulSoup(request.text, 'html.parser')
133                group_name = overview_page.select_one(".group-item .title a").text
134
135                for topic in overview_page.select("table.olt tr:not(.th)"):
136                    if self.interrupted:
137                        raise ProcessorInterruptedException("Interrupted while scraping Douban topics")
138
139                    if topics_processed >= max_topics:
140                        break
141
142                    # get topic URL, and whether it is an 'elite' topic
143                    topic_url = topic.find("a").get("href")
144                    topic_is_elite = "yes" if bool(topic.select_one(".elite_topic_lable")) else "no"
145                    topic_id = topic_url.split("/topic/").pop().split("/")[0]
146
147                    # date can be in either of two formats, with or without time
148                    try:
149                        topic_updated = int(
150                            datetime.datetime.strptime(topic.select_one(".time").text, "%m-%d %H:%M").timestamp())
151                    except ValueError:
152                        topic_updated = int(
153                            datetime.datetime.strptime(topic.select_one(".time").text, "%Y-%m-%d").timestamp())
154
155                    # if a date range is given, ignore topics outside of it
156                    if start and topic_updated < start:
157                        continue
158
159                    if end and topic_updated > end:
160                        break
161
162                    self.dataset.update_status("%i posts scraped. Scraping topics %i-%i from group %s" % (
163                    posts_processed, offset, min(max_topics, offset + 50), group_name))
164
165                    # request topic page - fortunately all comments are on a single page
166                    topic_request = self.get_douban_url(topic_url, headers=headers)
167                    time.sleep(5)  # don't hit rate limits
168                    topic_page = BeautifulSoup(topic_request.text, 'html.parser')
169                    topic = topic_page.select_one("#topic-content")
170
171                    topics_processed += 1
172
173                    # include original post as the first item
174                    try:
175                        first_post = {
176                            "id": topic_id,
177                            "group_id": group,
178                            "thread_id": topic_id,
179                            "group_name": group_name,
180                            "subject": topic_page.select_one("h1").text.strip(),
181                            "body": topic_page.select_one(".topic-richtext").decode_contents(formatter="html").strip(),
182                            "author": topic.select_one(".user-face img").get("alt"),
183                            "author_id": topic.select_one(".user-face a").get("href").split("/people/").pop().split("/")[0],
184                            "author_avatar": topic.select_one(".user-face img").get("src").replace("/u", "/ul"),
185                            "timestamp": int(datetime.datetime.strptime(topic.select_one(".create-time").text,
186                                                                        "%Y-%m-%d %H:%M:%S").timestamp()),
187                            "likes": 0,
188                            "is_highlighted": "no",
189                            "is_reply": "no",
190                            "is_topic_elite": topic_is_elite,
191                            "image_urls": ",".join([img.get("src") for img in topic.select(".topic-richtext img")])
192                        }
193                    except (AttributeError, ValueError):
194                        self.dataset.log("Unexpected data format when parsing topic %s/%s, skipping" % (group_name, topic_id))
195                        continue
196
197                    if strip:
198                        first_post["body"] = strip_tags(first_post["body"])
199
200                    posts_processed += 1
201                    yield first_post
202
203                    # now loop through all comments on the page
204                    for comment in topic_page.select("ul#comments > li"):
205                        comment_data = {
206                            "id": comment.get("data-cid"),
207                            "group_id": group,
208                            "thread_id": topic_id,
209                            "group_name": group_name,
210                            "subject": "",
211                            "body": comment.select_one(".reply-content").decode_contents(formatter="html").strip(),
212                            "author": comment.select_one(".user-face img").get("alt"),
213                            "author_id":
214                                comment.select_one(".user-face a").get("href").split("/people/").pop().split("/")[0],
215                            "author_avatar": comment.select_one(".user-face img").get("src").replace("/u", "/ul"),
216                            "timestamp": int(datetime.datetime.strptime(comment.select_one(".pubtime").text.strip()[:19],
217                                                                        "%Y-%m-%d %H:%M:%S").timestamp()),
218                            "likes": convert_to_int(
219                                re.sub(r"[^0-9]", "", comment.select_one(".comment-vote.lnk-fav").text), 0) if comment.select_one(".comment-vote.lnk-fav") else "Unknown",
220                            "is_highlighted": "yes" if comment.get("data-cid") in [hl.get("data-cid") for hl in
221                                                                                   comment.select(
222                                                                                       "ul#popular-comments li")] else "no",
223                            "is_reply": "yes" if comment.select_one(".reply-quote-content") else "no",
224                            "is_topic_elite": topic_is_elite,
225                            "image_urls": ",".join([img.get("src") for img in comment.select(".reply-content img")])
226                        }
227
228                        if strip:
229                            comment_data["body"] = strip_tags(comment_data["body"])
230
231                        posts_processed += 1
232                        yield comment_data
233
234                if offset < max_topics - 50:
235                    offset += 50
236                else:
237                    break
238
239    def get_douban_url(self, url, **kwargs):
240        """
241        Get Douban page with requests
242
243        Abstracted away like this so we can easily implement measures to
244        circumvent rate limiting later.
245
246        :param str url:  URL to request
247        :return:  Response object
248        """
249        if url[0:2] == "//":
250            url = "https:" + url
251        elif url[0] == "/":
252            url = "https://douban.com" + url
253
254        return requests.get(url, **kwargs)
255
256    def validate_query(query, request, config):
257        """
258        Validate input for a dataset query on the Douban data source.
259
260        :param dict query:  Query parameters, from client-side.
261        :param request:  Flask request
262        :param ConfigManager|None config:  Configuration reader (context-aware)
263        :return dict:  Safe query parameters
264        """
265        filtered_query = {}
266
267        # the dates need to make sense as a range to search within
268        after, before = query.get("daterange")
269        if before and after and before < after:
270            raise QueryParametersException("Date range must start before it ends")
271
272        filtered_query["min_date"], filtered_query["max_date"] = (after, before)
273
274        # normalize groups to just their IDs, even if a URL was provided, and
275        # limit to 25
276        groups = [group.split("/group/").pop().split("/")[0].strip() for group in
277                  query["groups"].replace("\n", ",").split(",")]
278        groups = [group for group in groups if group][:25]
279        if not any(groups):
280            raise QueryParametersException("No valid groups were provided.")
281
282        filtered_query["groups"] = ",".join(groups)
283
284        # max amount of topics is 200 because after that Douban starts throwing 429s
285        filtered_query["amount"] = max(min(convert_to_int(query["amount"], 10), 200), 1)
286
287        # strip HTML from posts?
288        filtered_query["strip"] = bool(query.get("strip", False))
289
290        return filtered_query
class SearchDouban(Search):
    """
    Search Douban groups

    Queries Douban group discussion boards directly from the website and
    yields topics plus their comments as individual posts.
    """
    type = "douban-search"  # job ID
    category = "Search"  # category
    title = "Douban Search"  # title displayed in UI
    description = "Scrapes group posts from Douban for a given set of groups"  # description displayed in UI
    extension = "csv"  # extension of result file, used internally and in UI
    is_local = False    # Whether this datasource is locally scraped
    is_static = False   # Whether this datasource is still updated

    # not available as a processor for existing datasets
    accepts = [None]

    max_workers = 1

    @classmethod
    def get_options(cls, parent_dataset=None, config=None) -> dict:
        """
        Get processor options

        :param parent_dataset DataSet:  An object representing the dataset that
            the processor would be or was run on. Can be used, in conjunction with
            config, to show some options only to privileged users.
        :param config ConfigManager|None config:  Configuration reader (context-aware)
        :return dict:   Options for this processor
        """
        options = {}

        options["intro"] = {
            "type": UserInput.OPTION_INFO,
            "help": "You can enter the groups to scrape as group IDs (e.g. `687802`) or URLs (e.g. "
                    "`https://www.douban.com/group/687802/`. Separate multiple groups with commas or new lines. If "
                    "you enter more than 25 groups, only the first 25 will be scraped."
        }
        options["groups"] = {
            "type": UserInput.OPTION_TEXT_LARGE,
            "help": "Groups",
            "tooltip": "Enter group IDs or URLs, separate with commas or new lines"
        }
        options["divider"] = {
            "type": UserInput.OPTION_DIVIDER
        }
        options["amount"] = {
            "type": UserInput.OPTION_TEXT,
            "help": "Threads per group",
            "min": 1,
            "max": 200,
            "default": 10
        }
        options["strip"] = {
            "type": UserInput.OPTION_TOGGLE,
            "help": "Strip HTML?",
            "default": True
        }
        options["divider-2"] = {
            "type": UserInput.OPTION_DIVIDER
        }
        options["daterange-info"] = {
            "type": UserInput.OPTION_INFO,
            "help": "Note that Douban severely limits the retrieval of older content. Therefore this data source "
                    "can only scrape the most recent topics in a given group. You can optionally limit the scraped "
                    "topics to a given date range, but note that typically only the 500 or so most recent topics in a "
                    "group will be available for scraping."
        }
        options["daterange"] = {
            "type": UserInput.OPTION_DATERANGE,
            "help": "Last post between"
        }

        return options

    def get_items(self, query):
        """
        Get Douban posts

        Everything is routed through this single method. The most recent
        topics per group are scraped; Douban blocks access to older content,
        so this mostly reflects what a group is discussing right now.

        :param query:  Filtered query parameters
        :return:
        """
        group_ids = query["groups"].split(",")
        topic_limit = min(convert_to_int(query["amount"], 100), 500)
        date_from = query["min_date"]
        date_to = query["max_date"]
        do_strip = bool(query["strip"])
        num_topics = 0
        num_posts = 0

        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0"}

        for group_id in group_ids:
            # build the URL for this group's discussion index
            group_id = str(group_id)
            base_url = "https://www.douban.com/group/%s/discussion?start=" % group_id

            offset = 0
            while True:
                # fetch one page of the topic listing at the current offset
                response = self.get_douban_url(base_url + str(offset), headers=headers)

                # non-200 usually means a nonexistent group or a rate limit
                if response.status_code != 200:
                    self.dataset.update_status(
                        "Got response code %i for group %s. Continuing with next group..." % (response.status_code, group_id))
                    break

                self.dataset.update_status("Scraping group %s..." % group_id)

                # extract topic links and the group's display name
                listing = BeautifulSoup(response.text, 'html.parser')
                group_name = listing.select_one(".group-item .title a").text

                for row in listing.select("table.olt tr:not(.th)"):
                    if self.interrupted:
                        raise ProcessorInterruptedException("Interrupted while scraping Douban topics")

                    if num_topics >= topic_limit:
                        break

                    # topic URL, ID, and 'elite' status
                    topic_link = row.find("a").get("href")
                    elite_flag = "no" if row.select_one(".elite_topic_lable") is None else "yes"
                    topic_id = topic_link.split("/topic/").pop().split("/")[0]

                    # the timestamp comes with or without a time component
                    raw_time = row.select_one(".time").text
                    try:
                        last_active = int(datetime.datetime.strptime(raw_time, "%m-%d %H:%M").timestamp())
                    except ValueError:
                        last_active = int(datetime.datetime.strptime(raw_time, "%Y-%m-%d").timestamp())

                    # honour the optional date range
                    if date_from and last_active < date_from:
                        continue

                    if date_to and last_active > date_to:
                        break

                    self.dataset.update_status("%i posts scraped. Scraping topics %i-%i from group %s" % (
                        num_posts, offset, min(topic_limit, offset + 50), group_name))

                    # all comments of a topic live on a single page
                    topic_response = self.get_douban_url(topic_link, headers=headers)
                    time.sleep(5)  # stay clear of rate limits
                    topic_page = BeautifulSoup(topic_response.text, 'html.parser')
                    topic_body = topic_page.select_one("#topic-content")

                    num_topics += 1

                    # the opening post comes first
                    try:
                        opening_post = {
                            "id": topic_id,
                            "group_id": group_id,
                            "thread_id": topic_id,
                            "group_name": group_name,
                            "subject": topic_page.select_one("h1").text.strip(),
                            "body": topic_page.select_one(".topic-richtext").decode_contents(formatter="html").strip(),
                            "author": topic_body.select_one(".user-face img").get("alt"),
                            "author_id": topic_body.select_one(".user-face a").get("href").split("/people/").pop().split("/")[0],
                            "author_avatar": topic_body.select_one(".user-face img").get("src").replace("/u", "/ul"),
                            "timestamp": int(datetime.datetime.strptime(topic_body.select_one(".create-time").text,
                                                                        "%Y-%m-%d %H:%M:%S").timestamp()),
                            "likes": 0,
                            "is_highlighted": "no",
                            "is_reply": "no",
                            "is_topic_elite": elite_flag,
                            "image_urls": ",".join([img.get("src") for img in topic_body.select(".topic-richtext img")])
                        }
                    except (AttributeError, ValueError):
                        self.dataset.log("Unexpected data format when parsing topic %s/%s, skipping" % (group_name, topic_id))
                        continue

                    if do_strip:
                        opening_post["body"] = strip_tags(opening_post["body"])

                    num_posts += 1
                    yield opening_post

                    # then all of its comments
                    for reply in topic_page.select("ul#comments > li"):
                        vote_link = reply.select_one(".comment-vote.lnk-fav")
                        popular_ids = [hl.get("data-cid") for hl in reply.select("ul#popular-comments li")]

                        reply_data = {
                            "id": reply.get("data-cid"),
                            "group_id": group_id,
                            "thread_id": topic_id,
                            "group_name": group_name,
                            "subject": "",
                            "body": reply.select_one(".reply-content").decode_contents(formatter="html").strip(),
                            "author": reply.select_one(".user-face img").get("alt"),
                            "author_id": reply.select_one(".user-face a").get("href").split("/people/").pop().split("/")[0],
                            "author_avatar": reply.select_one(".user-face img").get("src").replace("/u", "/ul"),
                            "timestamp": int(datetime.datetime.strptime(reply.select_one(".pubtime").text.strip()[:19],
                                                                        "%Y-%m-%d %H:%M:%S").timestamp()),
                            "likes": convert_to_int(re.sub(r"[^0-9]", "", vote_link.text), 0) if vote_link else "Unknown",
                            "is_highlighted": "yes" if reply.get("data-cid") in popular_ids else "no",
                            "is_reply": "yes" if reply.select_one(".reply-quote-content") else "no",
                            "is_topic_elite": elite_flag,
                            "image_urls": ",".join([img.get("src") for img in reply.select(".reply-content img")])
                        }

                        if do_strip:
                            reply_data["body"] = strip_tags(reply_data["body"])

                        num_posts += 1
                        yield reply_data

                if offset < topic_limit - 50:
                    offset += 50
                else:
                    break

    def get_douban_url(self, url, **kwargs):
        """
        Get Douban page with requests

        Kept as a separate method so rate-limit countermeasures can later be
        added in a single place.

        :param str url:  URL to request
        :return:  Response object
        """
        # make protocol- and site-relative URLs absolute
        if url[0:2] == "//":
            full_url = "https:" + url
        elif url[0] == "/":
            full_url = "https://douban.com" + url
        else:
            full_url = url

        return requests.get(full_url, **kwargs)

    def validate_query(query, request, config):
        """
        Validate input for a dataset query on the Douban data source.

        :param dict query:  Query parameters, from client-side.
        :param request:  Flask request
        :param ConfigManager|None config:  Configuration reader (context-aware)
        :return dict:  Safe query parameters
        """
        sanitized = {}

        # the dates need to make sense as a range to search within
        after, before = query.get("daterange")
        if before and after and before < after:
            raise QueryParametersException("Date range must start before it ends")

        sanitized["min_date"] = after
        sanitized["max_date"] = before

        # reduce group input to bare IDs (URLs are allowed too), keeping at
        # most 25 groups
        group_ids = []
        for entry in query["groups"].replace("\n", ",").split(","):
            candidate = entry.split("/group/").pop().split("/")[0].strip()
            if candidate:
                group_ids.append(candidate)
        group_ids = group_ids[:25]

        if not any(group_ids):
            raise QueryParametersException("No valid groups were provided.")

        sanitized["groups"] = ",".join(group_ids)

        # max amount of topics is 200 because after that Douban starts throwing 429s
        sanitized["amount"] = max(min(convert_to_int(query["amount"], 10), 200), 1)

        # strip HTML from posts?
        sanitized["strip"] = bool(query.get("strip", False))

        return sanitized

Search Douban groups

Defines methods that are used to query Douban data from the site directly

type = 'douban-search'
category = 'Search'
title = 'Douban Search'
description = 'Scrapes group posts from Douban for a given set of groups'
extension = 'csv'
is_local = False
is_static = False
accepts = [None]
max_workers = 1
@classmethod
def get_options(cls, parent_dataset=None, config=None) -> dict:
36    @classmethod
37    def get_options(cls, parent_dataset=None, config=None) -> dict:
38        """
39        Get processor options
40
41        :param parent_dataset DataSet:  An object representing the dataset that
42            the processor would be or was run on. Can be used, in conjunction with
43            config, to show some options only to privileged users.
44        :param config ConfigManager|None config:  Configuration reader (context-aware)
45        :return dict:   Options for this processor
46        """
47        return {
48            "intro": {
49                "type": UserInput.OPTION_INFO,
50                "help": "You can enter the groups to scrape as group IDs (e.g. `687802`) or URLs (e.g. "
51                        "`https://www.douban.com/group/687802/`. Separate multiple groups with commas or new lines. If "
52                        "you enter more than 25 groups, only the first 25 will be scraped."
53            },
54            "groups": {
55                "type": UserInput.OPTION_TEXT_LARGE,
56                "help": "Groups",
57                "tooltip": "Enter group IDs or URLs, separate with commas or new lines"
58            },
59            "divider": {
60                "type": UserInput.OPTION_DIVIDER
61            },
62            "amount": {
63                "type": UserInput.OPTION_TEXT,
64                "help": "Threads per group",
65                "min": 1,
66                "max": 200,
67                "default": 10
68            },
69            "strip": {
70                "type": UserInput.OPTION_TOGGLE,
71                "help": "Strip HTML?",
72                "default": True
73            },
74            "divider-2": {
75                "type": UserInput.OPTION_DIVIDER
76            },
77            "daterange-info": {
78                "type": UserInput.OPTION_INFO,
79                "help": "Note that Douban severely limits the retrieval of older content. Therefore this data source "
80                        "can only scrape the most recent topics in a given group. You can optionally limit the scraped "
81                        "topics to a given date range, but note that typically only the 500 or so most recent topics in a "
82                        "group will be available for scraping."
83            },
84            "daterange": {
85                "type": UserInput.OPTION_DATERANGE,
86                "help": "Last post between"
87            }
88        }

Get processor options

Parameters
  • parent_dataset DataSet: An object representing the dataset that the processor would be or was run on. Can be used, in conjunction with config, to show some options only to privileged users.
  • config ConfigManager|None config: Configuration reader (context-aware)
Returns

Options for this processor

def get_items(self, query):
 90    def get_items(self, query):
 91        """
 92        Get Douban posts
 93
 94        In the case of Douban, there is no need for multiple pathways, so we
 95        can route it all to the one post query method. Will scrape posts from the
 96        most recent topics for a given list of groups. Douban prevents scraping
 97        old content, so this is mostly useful to get a sense of what a given
 98        group is talking about at the moment.
 99
100        :param query:  Filtered query parameters
101        :return:
102        """
103        groups = query["groups"].split(",")
104        max_topics = min(convert_to_int(query["amount"], 100), 500)
105        start = query["min_date"]
106        end = query["max_date"]
107        strip = bool(query["strip"])
108        topics_processed = 0
109        posts_processed = 0
110
111        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0"}
112
113        for group in groups:
114            # get URL for group index
115            group = str(group)
116            group_url = "https://www.douban.com/group/%s/discussion?start=" % group
117
118            offset = 0
119            while True:
120                # get list of topics in group, for the given offset
121                fetch_url = group_url + str(offset)
122                request = self.get_douban_url(fetch_url, headers=headers)
123
124                # this would usually mean the group doesn't exist, or we hit some rate limit
125                if request.status_code != 200:
126                    self.dataset.update_status(
127                        "Got response code %i for group %s. Continuing with next group..." % (request.status_code, group))
128                    break
129
130                self.dataset.update_status("Scraping group %s..." % group)
131
132                # parse the HTML and get links to individual topics, as well as group name
133                overview_page = BeautifulSoup(request.text, 'html.parser')
134                group_name = overview_page.select_one(".group-item .title a").text
135
136                for topic in overview_page.select("table.olt tr:not(.th)"):
137                    if self.interrupted:
138                        raise ProcessorInterruptedException("Interrupted while scraping Douban topics")
139
140                    if topics_processed >= max_topics:
141                        break
142
143                    # get topic URL, and whether it is an 'elite' topic
144                    topic_url = topic.find("a").get("href")
145                    topic_is_elite = "yes" if bool(topic.select_one(".elite_topic_lable")) else "no"
146                    topic_id = topic_url.split("/topic/").pop().split("/")[0]
147
148                    # date can be in either of two formats, with or without time
149                    try:
150                        topic_updated = int(
151                            datetime.datetime.strptime(topic.select_one(".time").text, "%m-%d %H:%M").timestamp())
152                    except ValueError:
153                        topic_updated = int(
154                            datetime.datetime.strptime(topic.select_one(".time").text, "%Y-%m-%d").timestamp())
155
156                    # if a date range is given, ignore topics outside of it
157                    if start and topic_updated < start:
158                        continue
159
160                    if end and topic_updated > end:
161                        break
162
163                    self.dataset.update_status("%i posts scraped. Scraping topics %i-%i from group %s" % (
164                    posts_processed, offset, min(max_topics, offset + 50), group_name))
165
166                    # request topic page - fortunately all comments are on a single page
167                    topic_request = self.get_douban_url(topic_url, headers=headers)
168                    time.sleep(5)  # don't hit rate limits
169                    topic_page = BeautifulSoup(topic_request.text, 'html.parser')
170                    topic = topic_page.select_one("#topic-content")
171
172                    topics_processed += 1
173
174                    # include original post as the first item
175                    try:
176                        first_post = {
177                            "id": topic_id,
178                            "group_id": group,
179                            "thread_id": topic_id,
180                            "group_name": group_name,
181                            "subject": topic_page.select_one("h1").text.strip(),
182                            "body": topic_page.select_one(".topic-richtext").decode_contents(formatter="html").strip(),
183                            "author": topic.select_one(".user-face img").get("alt"),
184                            "author_id": topic.select_one(".user-face a").get("href").split("/people/").pop().split("/")[0],
185                            "author_avatar": topic.select_one(".user-face img").get("src").replace("/u", "/ul"),
186                            "timestamp": int(datetime.datetime.strptime(topic.select_one(".create-time").text,
187                                                                        "%Y-%m-%d %H:%M:%S").timestamp()),
188                            "likes": 0,
189                            "is_highlighted": "no",
190                            "is_reply": "no",
191                            "is_topic_elite": topic_is_elite,
192                            "image_urls": ",".join([img.get("src") for img in topic.select(".topic-richtext img")])
193                        }
194                    except (AttributeError, ValueError):
195                        self.dataset.log("Unexpected data format when parsing topic %s/%s, skipping" % (group_name, topic_id))
196                        continue
197
198                    if strip:
199                        first_post["body"] = strip_tags(first_post["body"])
200
201                    posts_processed += 1
202                    yield first_post
203
204                    # now loop through all comments on the page
205                    for comment in topic_page.select("ul#comments > li"):
206                        comment_data = {
207                            "id": comment.get("data-cid"),
208                            "group_id": group,
209                            "thread_id": topic_id,
210                            "group_name": group_name,
211                            "subject": "",
212                            "body": comment.select_one(".reply-content").decode_contents(formatter="html").strip(),
213                            "author": comment.select_one(".user-face img").get("alt"),
214                            "author_id":
215                                comment.select_one(".user-face a").get("href").split("/people/").pop().split("/")[0],
216                            "author_avatar": comment.select_one(".user-face img").get("src").replace("/u", "/ul"),
217                            "timestamp": int(datetime.datetime.strptime(comment.select_one(".pubtime").text.strip()[:19],
218                                                                        "%Y-%m-%d %H:%M:%S").timestamp()),
219                            "likes": convert_to_int(
220                                re.sub(r"[^0-9]", "", comment.select_one(".comment-vote.lnk-fav").text), 0) if comment.select_one(".comment-vote.lnk-fav") else "Unknown",
221                            "is_highlighted": "yes" if comment.get("data-cid") in [hl.get("data-cid") for hl in
222                                                                                   comment.select(
223                                                                                       "ul#popular-comments li")] else "no",
224                            "is_reply": "yes" if comment.select_one(".reply-quote-content") else "no",
225                            "is_topic_elite": topic_is_elite,
226                            "image_urls": ",".join([img.get("src") for img in comment.select(".reply-content img")])
227                        }
228
229                        if strip:
230                            comment_data["body"] = strip_tags(comment_data["body"])
231
232                        posts_processed += 1
233                        yield comment_data
234
235                if offset < max_topics - 50:
236                    offset += 50
237                else:
238                    break

Get Douban posts

In the case of Douban, there is no need for multiple pathways, so we can route it all to the one post query method. Will scrape posts from the most recent topics for a given list of groups. Douban prevents scraping old content, so this is mostly useful to get a sense of what a given group is talking about at the moment.

Parameters
  • query: Filtered query parameters
Returns
def get_douban_url(self, url, **kwargs):
240    def get_douban_url(self, url, **kwargs):
241        """
242        Get Douban page with requests
243
244        Abstracted away like this so we can easily implement measures to
245        circumvent rate limiting later.
246
247        :param str url:  URL to request
248        :return:  Response object
249        """
250        if url[0:2] == "//":
251            url = "https:" + url
252        elif url[0] == "/":
253            url = "https://douban.com" + url
254
255        return requests.get(url, **kwargs)

Get Douban page with requests

Abstracted away like this so we can easily implement measures to circumvent rate limiting later.

Parameters
  • str url: URL to request
Returns

Response object

def validate_query(query, request, config):
257    def validate_query(query, request, config):
258        """
259        Validate input for a dataset query on the Douban data source.
260
261        :param dict query:  Query parameters, from client-side.
262        :param request:  Flask request
263        :param ConfigManager|None config:  Configuration reader (context-aware)
264        :return dict:  Safe query parameters
265        """
266        filtered_query = {}
267
268        # the dates need to make sense as a range to search within
269        after, before = query.get("daterange")
270        if before and after and before < after:
271            raise QueryParametersException("Date range must start before it ends")
272
273        filtered_query["min_date"], filtered_query["max_date"] = (after, before)
274
275        # normalize groups to just their IDs, even if a URL was provided, and
276        # limit to 25
277        groups = [group.split("/group/").pop().split("/")[0].strip() for group in
278                  query["groups"].replace("\n", ",").split(",")]
279        groups = [group for group in groups if group][:25]
280        if not any(groups):
281            raise QueryParametersException("No valid groups were provided.")
282
283        filtered_query["groups"] = ",".join(groups)
284
285        # max amount of topics is 200 because after that Douban starts throwing 429s
286        filtered_query["amount"] = max(min(convert_to_int(query["amount"], 10), 200), 1)
287
288        # strip HTML from posts?
289        filtered_query["strip"] = bool(query.get("strip", False))
290
291        return filtered_query

Validate input for a dataset query on the Douban data source.

Parameters
  • dict query: Query parameters, from client-side.
  • request: Flask request
  • ConfigManager|None config: Configuration reader (context-aware)
Returns

Safe query parameters