
datasources.douban.search_douban

Search Douban groups

"""
Search Douban groups
"""
import requests
import datetime
import time
import re

from bs4 import BeautifulSoup

from backend.lib.search import Search
from common.lib.helpers import convert_to_int, strip_tags, UserInput
from common.lib.exceptions import QueryParametersException, ProcessorInterruptedException


class SearchDouban(Search):
    """
    Search Douban groups

    Defines methods that are used to query Douban data from the site directly
    """
    type = "douban-search"  # job ID
    category = "Search"  # category
    title = "Douban Search"  # title displayed in UI
    description = "Scrapes group posts from Douban for a given set of groups"  # description displayed in UI
    extension = "csv"  # extension of result file, used internally and in UI
    is_local = False    # Whether this datasource is locally scraped
    is_static = False   # Whether this datasource is still updated

    # not available as a processor for existing datasets
    accepts = [None]

    max_workers = 1

    options = {
        "intro": {
            "type": UserInput.OPTION_INFO,
            "help": "You can enter the groups to scrape as group IDs (e.g. `687802`) or URLs (e.g. "
                    "`https://www.douban.com/group/687802/`). Separate multiple groups with commas or new lines. If "
                    "you enter more than 25 groups, only the first 25 will be scraped."
        },
        "groups": {
            "type": UserInput.OPTION_TEXT_LARGE,
            "help": "Groups",
            "tooltip": "Enter group IDs or URLs, separated by commas or new lines"
        },
        "divider": {
            "type": UserInput.OPTION_DIVIDER
        },
        "amount": {
            "type": UserInput.OPTION_TEXT,
            "help": "Threads per group",
            "min": 1,
            "max": 200,
            "default": 10
        },
        "strip": {
            "type": UserInput.OPTION_TOGGLE,
            "help": "Strip HTML?",
            "default": True
        },
        "divider-2": {
            "type": UserInput.OPTION_DIVIDER
        },
        "daterange-info": {
            "type": UserInput.OPTION_INFO,
            "help": "Note that Douban severely limits the retrieval of older content. Therefore this data source "
                    "can only scrape the most recent topics in a given group. You can optionally limit the scraped "
                    "topics to a given date range, but note that typically only the 500 or so most recent topics in a "
                    "group will be available for scraping."
        },
        "daterange": {
            "type": UserInput.OPTION_DATERANGE,
            "help": "Last post between"
        }
    }

    def get_items(self, query):
        """
        Get Douban posts

        In the case of Douban there is no need for multiple pathways, so
        everything is routed through this single post query method. It scrapes
        posts from the most recent topics for a given list of groups. Douban
        prevents scraping old content, so this is mostly useful to get a sense
        of what a given group is talking about at the moment.

        :param query:  Filtered query parameters
        :return:  Generator that yields posts and comments as dictionaries
        """
        groups = query["groups"].split(",")
        max_topics = min(convert_to_int(query["amount"], 100), 500)
        start = query["min_date"]
        end = query["max_date"]
        strip = bool(query["strip"])
        topics_processed = 0
        posts_processed = 0

        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0"}

        for group in groups:
            # get URL for group index
            group = str(group)
            group_url = "https://www.douban.com/group/%s/discussion?start=" % group

            offset = 0
            while True:
                # get list of topics in group, for the given offset
                fetch_url = group_url + str(offset)
                request = self.get_douban_url(fetch_url, headers=headers)

                # this would usually mean the group doesn't exist, or we hit some rate limit
                if request.status_code != 200:
                    self.dataset.update_status(
                        "Got response code %i for group %s. Continuing with next group..." % (request.status_code, group))
                    break

                self.dataset.update_status("Scraping group %s..." % group)

                # parse the HTML and get links to individual topics, as well as group name
                overview_page = BeautifulSoup(request.text, 'html.parser')
                group_name = overview_page.select_one(".group-item .title a").text

                for topic in overview_page.select("table.olt tr:not(.th)"):
                    if self.interrupted:
                        raise ProcessorInterruptedException("Interrupted while scraping Douban topics")

                    if topics_processed >= max_topics:
                        break

                    # get topic URL, and whether it is an 'elite' topic
                    topic_url = topic.find("a").get("href")
                    topic_is_elite = "yes" if bool(topic.select_one(".elite_topic_lable")) else "no"
                    topic_id = topic_url.split("/topic/").pop().split("/")[0]

                    # date can be in either of two formats, with or without time
                    try:
                        topic_updated = int(
                            datetime.datetime.strptime(topic.select_one(".time").text, "%m-%d %H:%M").timestamp())
                    except ValueError:
                        topic_updated = int(
                            datetime.datetime.strptime(topic.select_one(".time").text, "%Y-%m-%d").timestamp())

                    # if a date range is given, ignore topics outside of it
                    if start and topic_updated < start:
                        continue

                    if end and topic_updated > end:
                        break

                    self.dataset.update_status("%i posts scraped. Scraping topics %i-%i from group %s" % (
                        posts_processed, offset, min(max_topics, offset + 50), group_name))

                    # request topic page - fortunately all comments are on a single page
                    topic_request = self.get_douban_url(topic_url, headers=headers)
                    time.sleep(5)  # don't hit rate limits
                    topic_page = BeautifulSoup(topic_request.text, 'html.parser')
                    topic = topic_page.select_one("#topic-content")

                    topics_processed += 1

                    # include original post as the first item
                    try:
                        first_post = {
                            "id": topic_id,
                            "group_id": group,
                            "thread_id": topic_id,
                            "group_name": group_name,
                            "subject": topic_page.select_one("h1").text.strip(),
                            "body": topic_page.select_one(".topic-richtext").decode_contents(formatter="html").strip(),
                            "author": topic.select_one(".user-face img").get("alt"),
                            "author_id": topic.select_one(".user-face a").get("href").split("/people/").pop().split("/")[0],
                            "author_avatar": topic.select_one(".user-face img").get("src").replace("/u", "/ul"),
                            "timestamp": int(datetime.datetime.strptime(topic.select_one(".create-time").text,
                                                                        "%Y-%m-%d %H:%M:%S").timestamp()),
                            "likes": 0,
                            "is_highlighted": "no",
                            "is_reply": "no",
                            "is_topic_elite": topic_is_elite,
                            "image_urls": ",".join([img.get("src") for img in topic.select(".topic-richtext img")])
                        }
                    except (AttributeError, ValueError):
                        self.dataset.log("Unexpected data format when parsing topic %s/%s, skipping" % (group_name, topic_id))
                        continue

                    if strip:
                        first_post["body"] = strip_tags(first_post["body"])

                    posts_processed += 1
                    yield first_post

                    # now loop through all comments on the page
                    for comment in topic_page.select("ul#comments > li"):
                        comment_data = {
                            "id": comment.get("data-cid"),
                            "group_id": group,
                            "thread_id": topic_id,
                            "group_name": group_name,
                            "subject": "",
                            "body": comment.select_one(".reply-content").decode_contents(formatter="html").strip(),
                            "author": comment.select_one(".user-face img").get("alt"),
                            "author_id": comment.select_one(".user-face a").get("href").split("/people/").pop().split("/")[0],
                            "author_avatar": comment.select_one(".user-face img").get("src").replace("/u", "/ul"),
                            "timestamp": int(datetime.datetime.strptime(comment.select_one(".pubtime").text.strip()[:19],
                                                                        "%Y-%m-%d %H:%M:%S").timestamp()),
                            "likes": convert_to_int(
                                re.sub(r"[^0-9]", "", comment.select_one(".comment-vote.lnk-fav").text), 0)
                                if comment.select_one(".comment-vote.lnk-fav") else "Unknown",
                            "is_highlighted": "yes" if comment.get("data-cid") in [
                                hl.get("data-cid") for hl in comment.select("ul#popular-comments li")
                            ] else "no",
                            "is_reply": "yes" if comment.select_one(".reply-quote-content") else "no",
                            "is_topic_elite": topic_is_elite,
                            "image_urls": ",".join([img.get("src") for img in comment.select(".reply-content img")])
                        }

                        if strip:
                            comment_data["body"] = strip_tags(comment_data["body"])

                        posts_processed += 1
                        yield comment_data

                if offset < max_topics - 50:
                    offset += 50
                else:
                    break

    def get_douban_url(self, url, **kwargs):
        """
        Get a Douban page with requests

        Abstracted away like this so we can easily implement measures to
        circumvent rate limiting later.

        :param str url:  URL to request
        :return:  Response object
        """
        if url[0:2] == "//":
            url = "https:" + url
        elif url[0] == "/":
            url = "https://douban.com" + url

        return requests.get(url, **kwargs)

    def validate_query(query, request, user):
        """
        Validate input for a dataset query on the Douban data source.

        :param dict query:  Query parameters, from client-side.
        :param request:  Flask request
        :param User user:  User object of user who has submitted the query
        :return dict:  Safe query parameters
        """
        filtered_query = {}

        # the dates need to make sense as a range to search within
        after, before = query.get("daterange")
        if before and after and before < after:
            raise QueryParametersException("Date range must start before it ends")

        filtered_query["min_date"], filtered_query["max_date"] = (after, before)

        # normalize groups to just their IDs, even if a URL was provided, and
        # limit to 25
        groups = [group.split("/group/").pop().split("/")[0].strip() for group in
                  query["groups"].replace("\n", ",").split(",")]
        groups = [group for group in groups if group][:25]
        if not any(groups):
            raise QueryParametersException("No valid groups were provided.")

        filtered_query["groups"] = ",".join(groups)

        # max amount of topics is 200 because after that Douban starts throwing 429s
        filtered_query["amount"] = max(min(convert_to_int(query["amount"], 10), 200), 1)

        # strip HTML from posts?
        filtered_query["strip"] = bool(query.get("strip", False))

        return filtered_query
class SearchDouban(backend.lib.search.Search):

Search Douban groups

Defines methods that are used to query Douban data from the site directly

type = 'douban-search'
category = 'Search'
title = 'Douban Search'
description = 'Scrapes group posts from Douban for a given set of groups'
extension = 'csv'
is_local = False
is_static = False
accepts = [None]
max_workers = 1
options = {
    'intro': {
        'type': 'info',
        'help': 'You can enter the groups to scrape as group IDs (e.g. `687802`) or URLs (e.g. `https://www.douban.com/group/687802/`). Separate multiple groups with commas or new lines. If you enter more than 25 groups, only the first 25 will be scraped.'
    },
    'groups': {
        'type': 'textarea',
        'help': 'Groups',
        'tooltip': 'Enter group IDs or URLs, separated by commas or new lines'
    },
    'divider': {'type': 'divider'},
    'amount': {
        'type': 'string',
        'help': 'Threads per group',
        'min': 1,
        'max': 200,
        'default': 10
    },
    'strip': {
        'type': 'toggle',
        'help': 'Strip HTML?',
        'default': True
    },
    'divider-2': {'type': 'divider'},
    'daterange-info': {
        'type': 'info',
        'help': 'Note that Douban severely limits the retrieval of older content. Therefore this data source can only scrape the most recent topics in a given group. You can optionally limit the scraped topics to a given date range, but note that typically only the 500 or so most recent topics in a group will be available for scraping.'
    },
    'daterange': {
        'type': 'daterange',
        'help': 'Last post between'
    }
}
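
For orientation: the string values in this rendered options dict are the resolved UserInput constants used in the source listing above. Comparing the two listings gives the following mapping, shown as a plain dict for reference (it covers only the constants this module uses):

# Mapping recovered by comparing the source listing with the rendered dict above;
# illustrative only, not an exhaustive list of UserInput option types.
USERINPUT_RENDERED = {
    "UserInput.OPTION_INFO": "info",
    "UserInput.OPTION_TEXT_LARGE": "textarea",
    "UserInput.OPTION_DIVIDER": "divider",
    "UserInput.OPTION_TEXT": "string",
    "UserInput.OPTION_TOGGLE": "toggle",
    "UserInput.OPTION_DATERANGE": "daterange",
}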
def get_items(self, query):

Get Douban posts

In the case of Douban there is no need for multiple pathways, so everything is routed through this single post query method. It scrapes posts from the most recent topics for a given list of groups. Douban prevents scraping old content, so this is mostly useful to get a sense of what a given group is talking about at the moment.

Parameters
  • query: Filtered query parameters
Returns

Generator that yields posts and comments as dictionaries
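
A minimal sketch of how the generator is consumed. In 4CAT the Search worker drives this method; `search` below stands in for a fully initialised SearchDouban instance (an assumption for illustration), and the query keys are the ones the method actually reads:

# hypothetical usage sketch, assuming `search` is an initialised SearchDouban
query = {
    "groups": "687802",  # comma-separated group IDs
    "amount": 10,        # topics per group
    "strip": True,       # strip HTML from post bodies
    "min_date": 0,       # 0 disables the lower date bound
    "max_date": 0,       # 0 disables the upper date bound
}
for item in search.get_items(query):
    # each item is a flat dict; topic openers have is_reply == "no",
    # comments on that topic share its thread_id
    print(item["thread_id"], item["is_reply"], item["author"])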
def get_douban_url(self, url, **kwargs):

Get a Douban page with requests

Abstracted away like this so we can easily implement measures to circumvent rate limiting later.

Parameters
  • str url: URL to request
Returns

Response object
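
The only logic here is URL normalisation before delegating to requests.get. A standalone sketch of the same rules (normalize_douban_url is a hypothetical name, not part of the module):

def normalize_douban_url(url):
    # protocol-relative URLs get an explicit scheme
    if url[0:2] == "//":
        return "https:" + url
    # root-relative paths are resolved against douban.com
    if url[0] == "/":
        return "https://douban.com" + url
    return url

assert normalize_douban_url("//www.douban.com/group/687802/") == "https://www.douban.com/group/687802/"
assert normalize_douban_url("/group/687802/") == "https://douban.com/group/687802/"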

def validate_query(query, request, user):

Validate input for a dataset query on the Douban data source.

Parameters
  • dict query: Query parameters, from client-side.
  • request: Flask request
  • User user: User object of user who has submitted the query
Returns

Safe query parameters
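
A sketch of the validation on typical client-side input; the expected output follows directly from the code above. The request and user arguments are unused by the body shown, so None stands in for them here:

raw_query = {
    "groups": "https://www.douban.com/group/687802/, 12345",
    "amount": "1000",  # clamped to the 200-topic ceiling
    "daterange": (1609459200, 1612137600),
    "strip": True,
}
safe = SearchDouban.validate_query(raw_query, None, None)
# safe == {
#     "min_date": 1609459200,
#     "max_date": 1612137600,
#     "groups": "687802,12345",  # URL normalised to its group ID
#     "amount": 200,
#     "strip": True,
# }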