datasources.bitchute.search_bitchute
Search BitChute
Scrape BitChute videos via the BitChute web API
1""" 2Search Bitchute 3 4Scrape Bitchute videos via the Bitchute web API 5""" 6import dateparser 7import requests 8import json 9import time 10import re 11 12from itertools import chain 13from bs4 import BeautifulSoup 14 15from common.lib.helpers import UserInput, strip_tags 16from backend.lib.search import Search 17from common.lib.exceptions import QueryParametersException, ProcessorInterruptedException 18 19 20class SearchBitChute(Search): 21 """ 22 BitChute scraper 23 """ 24 type = "bitchute-search" # job ID 25 category = "Search" # category 26 title = "Search BitChute" # title displayed in UI 27 description = "Retrieve BitChute videos" # description displayed in UI 28 extension = "csv" # extension of result file, used internally and in UI 29 is_local = False # Whether this datasource is locally scraped 30 is_static = False # Whether this datasource is still updated 31 32 # not available as a processor for existing datasets 33 accepts = [None] 34 35 # let's not get rate limited 36 max_workers = 1 37 38 # scraping happens in one or the other method, so keep track of this internally 39 max_items = 0 40 41 options = { 42 "intro": { 43 "type": UserInput.OPTION_INFO, 44 "help": "Videos are scraped in the order they are returned by [BitChute](https://bitchute.com)'s search " 45 "function.\n\nYou can scrape results for up to **fifteen** items at a time. Separate the items " 46 "with commas or blank lines. When searching for usernames, there is no need to include @ in front." 47 }, 48 "search_type": { 49 "type": UserInput.OPTION_CHOICE, 50 "help": "Search by", 51 "options": { 52 "search": "Search query", 53 "user": "Username", 54 "url": "Video URL or ID" 55 }, 56 "default": "search" 57 }, 58 "query": { 59 "type": UserInput.OPTION_TEXT_LARGE, 60 "help": "Query" 61 }, 62 "max_posts": { 63 "type": UserInput.OPTION_TEXT, 64 "help": "Videos per item", 65 "min": 0, 66 "max": 2500, 67 "default": 10 68 }, 69 "divider": { 70 "type": UserInput.OPTION_DIVIDER 71 }, 72 "enrichment-info": { 73 "type": UserInput.OPTION_INFO, 74 "help": "You can optionally scrape more details - exact publication date, likes, dislikes, category, " 75 "comment count and channel subscriber count - for each video. Note that this takes a couple of " 76 "seconds per video (which can add up!). Consider doing a basic query first and then repeating it " 77 "with more details only if necessary." 78 }, 79 "search_scope": { 80 "type": UserInput.OPTION_CHOICE, 81 "help": "Search scope", 82 "options": { 83 "basic": "Basic", 84 "detail": "Detailed", 85 "comments": "Detailed, also scrape video comments" 86 }, 87 "default": "basic" 88 } 89 90 } 91 92 def get_items(self, query): 93 """ 94 Run custom search 95 96 Fetches data from BitChute for either users or search queries 97 """ 98 # ready our parameters 99 parameters = self.dataset.get_parameters() 100 self.max_items = parameters.get("items", 100) 101 queries = [query.strip() for query in parameters.get("query", "").split(",")] 102 num_query = 0 103 detail = parameters.get("scope", "basic") 104 query_type = parameters.get("item_type", "search") 105 106 # bitchute uses a CSRF cookie that needs to be included on each request. 
The only way to obtain it is by 107 # visiting the site, so do just that and extract the CSRF token from the page: 108 session = requests.Session() 109 session.headers["User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0" 110 request = session.get("https://www.bitchute.com/search") 111 csrftoken = BeautifulSoup(request.text, 'html.parser').findAll("input", {"name": "csrfmiddlewaretoken"})[0].get( 112 "value") 113 time.sleep(0.25) 114 115 self.dataset.update_status("Querying BitChute") 116 results = [] 117 for query in queries: 118 num_query += 1 119 query = query.strip() 120 121 if query_type == "search": 122 results.append(self.get_videos_query(session, query, csrftoken, detail)) 123 elif query_type == "url": 124 if "/video/" in query: 125 query = query.split("/video/")[1].split("/")[0] 126 # else assume bare ID 127 128 self.dataset.update_status("Getting details for video '%s' (%i/%i)" % (query, num_query, len(queries))) 129 results.append(self.get_videos_id(session, query, csrftoken, detail)) 130 else: 131 results.append(self.get_videos_user(session, query, csrftoken, detail)) 132 133 self.dataset.update_progress(num_query / len(queries)) 134 135 return chain(*results) 136 137 def get_videos_id(self, session, video_id, csrftoken, detail): 138 dummy_video = { 139 "query_type": "video id", 140 "query": video_id, 141 "id": video_id, 142 "thread_id": video_id, 143 "subject": "", 144 "body": "", 145 "author": "", 146 "author_id": "", 147 "timestamp": None, 148 "url": "https://www.bitchute.com/video/" + video_id + "/", 149 "views": None, 150 "length": None, 151 "thumbnail_image": None, 152 153 } 154 155 # we can't use the BitChute search, so do one request per URL, and 156 # get details for 'free' 157 if detail == "basic": 158 detail = "detail" 159 160 video, comments = self.append_details(dummy_video, detail) 161 if not video: 162 # unrecoverable error while scraping details 163 return 164 165 yield video 166 for comment in comments: 167 # these need to be yielded *after* the video because else the result file will have the comments 168 # before the video, which is weird 169 yield comment 170 171 def get_videos_user(self, session, user, csrftoken, detail): 172 """ 173 Scrape videos for given BitChute user 174 175 :param session: HTTP Session to use 176 :param str user: Username to scrape videos for 177 :param str csrftoken: CSRF token to use for requests 178 :param str detail: Detail level to scrape, basic/detail/comments 179 180 :return: Video data dictionaries, as a generator 181 """ 182 offset = 0 183 num_items = 0 184 base_url = "https://www.bitchute.com/channel/%s/" % user 185 url = base_url + "extend/" 186 187 container = session.get(base_url) 188 container_soup = BeautifulSoup(container.text, 'html.parser') 189 headers = {'Referer': base_url, 'Origin': "https://www.bitchute.com/"} 190 191 while True: 192 self.dataset.update_status("Retrieved %i items for query '%s'" % (num_items, user)) 193 194 if self.interrupted: 195 raise ProcessorInterruptedException("Interrupted while scraping BitChute") 196 197 post_data = {"csrfmiddlewaretoken": csrftoken, "name": "", "offset": str(offset)} 198 199 try: 200 self.dataset.log("Fetching data for BitChute video %s" % url) 201 request = session.post(url, data=post_data, headers=headers) 202 if request.status_code != 200: 203 raise ConnectionError() 204 response = request.json() 205 except (json.JSONDecodeError, requests.RequestException, ConnectionError) as e: 206 self.dataset.update_status("Error while 
interacting with BitChute (%s) - try again later." % e, is_final=True) 207 return 208 209 soup = BeautifulSoup(response["html"], 'html.parser') 210 videos = soup.select(".channel-videos-container") 211 comments = [] 212 213 if len(videos) == 0 or num_items >= self.max_items: 214 break 215 216 for video_element in videos: 217 if num_items >= self.max_items: 218 break 219 else: 220 num_items += 1 221 222 offset += 1 223 224 link = video_element.select_one(".channel-videos-title a") 225 video = { 226 "query_type": "user id", 227 "query": user, 228 "id": link["href"].split("/")[-2], 229 "thread_id": link["href"].split("/")[-2], 230 "subject": link.text, 231 "body": strip_tags(video_element.select_one(".channel-videos-text").text), 232 "author": container_soup.select_one(".details .name a").text, 233 "author_id": container_soup.select_one(".details .name a")["href"].split("/")[2], 234 "timestamp": int( 235 dateparser.parse( 236 video_element.select_one(".channel-videos-details.text-right.hidden-xs").text).timestamp()), 237 "url": "https://www.bitchute.com" + link["href"], 238 "views": video_element.select_one(".video-views").text.strip(), 239 "length": video_element.select_one(".video-duration").text.strip(), 240 "thumbnail_image": video_element.select_one(".channel-videos-image img")["src"], 241 } 242 243 if detail != "basic": 244 video, comments = self.append_details(video, detail) 245 if not video: 246 # unrecoverable error while scraping details 247 return 248 249 yield video 250 for comment in comments: 251 # these need to be yielded *after* the video because else the result file will have the comments 252 # before the video, which is weird 253 yield comment 254 255 256 def get_videos_query(self, session, query, csrftoken, detail): 257 """ 258 Scrape videos for given BitChute search query 259 260 :param session: HTTP Session to use 261 :param str user: Search query to scrape videos for 262 :param str csrftoken: CSRF token to use for requests 263 :param str detail: Detail level to scrape, basic/detail/comments 264 265 :return: Video data dictionaries, as a generator 266 """ 267 page = 0 268 num_items = 0 269 while True: 270 self.dataset.update_status("Retrieved %i items for query '%s'" % (num_items, query)) 271 272 if self.interrupted: 273 raise ProcessorInterruptedException("Interrupted while scraping BitChute") 274 275 # prepare the request - the CSRF param *must* be the first or the request will fail 276 post_data = {"csrfmiddlewaretoken": csrftoken, "query": query, "kind": "video", "duration": "", 277 "sort": "", "page": str(page)} 278 headers = {'Referer': "https://www.bitchute.com/search", 'Origin': "https://www.bitchute.com/search"} 279 response = self.request_from_bitchute(session, "POST", "https://www.bitchute.com/api/search/list/", headers, post_data) 280 281 if not response["success"] or response["count"] == 0 or num_items >= self.max_items: 282 break 283 284 comments = [] 285 for video_data in response["results"]: 286 if num_items >= self.max_items: 287 break 288 else: 289 num_items += 1 290 291 # note: deleted videos will have a published date of 'None'. To 292 # avoid crashing the backend the easiest way is to set it to something 293 # that is obviously not a valid date in this context. 
294 if video_data["published"] is None: 295 video_data["published"] = "1970-01-01" 296 # this is only included as '5 months ago' and so forth, not exact date 297 # so use dateparser to at least approximate the date 298 try: 299 dt = dateparser.parse(video_data["published"]) 300 except Exception as e: 301 self.log.warning('dateparser.parse error reading video_data["published"]: %s' % str(video_data['published'])) 302 raise e 303 304 video = { 305 "query_type": "search", 306 "query": query, 307 "id": video_data["id"], 308 "thread_id": video_data["id"], 309 "subject": video_data["name"], 310 "body": video_data["description"], 311 "author": video_data["channel_name"], 312 "author_id": video_data["channel_path"].split("/")[2], 313 "timestamp": int(dt.timestamp()), 314 "url": "https://www.bitchute.com" + video_data["path"], 315 "views": video_data["views"], 316 "length": video_data["duration"], 317 "thumbnail_image": video_data["images"]["thumbnail"] 318 } 319 320 if detail != "basic": 321 video, comments = self.append_details(video, detail) 322 if not video: 323 # unrecoverable error while scraping details 324 return 325 326 yield video 327 for comment in comments: 328 # these need to be yielded *after* the video because else the result file will have the comments 329 # before the video, which is weird 330 yield comment 331 332 page += 1 333 334 def append_details(self, video, detail): 335 """ 336 Append extra metadata to video data 337 338 Fetches the BitChute video detail page to scrape extra data for the given video. 339 340 :param dict video: Video details as scraped so far 341 :param str detail: Detail level. If 'comments', also scrape video comments. 342 343 :return dict: Tuple, first item: updated video data, second: list of comments 344 """ 345 comments = [] 346 347 video = { 348 **video, 349 "likes": "", 350 "dislikes": "", 351 "channel_subscribers": "", 352 "comments": "", 353 "hashtags": "", 354 "parent_id": "", 355 "video_url": "" 356 } 357 358 try: 359 # to get more details per video, we need to request the actual video detail page 360 # start a new session, to not interfere with the CSRF token from the search session 361 video_session = requests.session() 362 video_page = video_session.get(video["url"]) 363 364 if "<h1 class=\"page-title\">Video Restricted</h1>" in video_page.text or \ 365 "<h1 class=\"page-title\">Video Blocked</h1>" in video_page.text or \ 366 "<h1 class=\"page-title\">Channel Blocked</h1>" in video_page.text or \ 367 "<h1 class=\"page-title\">Channel Restricted</h1>" in video_page.text: 368 if "This video is unavailable as the contents have been deemed potentially illegal" in video_page.text: 369 video["category"] = "moderated-illegal" 370 return (video, []) 371 372 elif "Viewing of this video is restricted, as it has been marked as Not Safe For Life" in video_page.text: 373 video["category"] = "moderated-nsfl" 374 return (video, []) 375 376 elif "Incitement to Hatred" in video_page.text: 377 video["category"] = "moderated-incitement-hatred" 378 return (video, []) 379 380 elif "Threats or Incitement to Violence" in video_page.text: 381 video["category"] = "moderated-incitement-violence" 382 return (video, []) 383 384 elif "Platform Misuse" in video_page.text: 385 video["category"] = "moderated-misuse" 386 return (video, []) 387 388 elif "Terrorism & Violent Extremism" in video_page.text: 389 video["category"] = "moderated-terrorism-extremism" 390 return (video, []) 391 392 elif "Copyright</h4>" in video_page.text: 393 video["category"] = "moderated-copyright" 
394 return (video, []) 395 396 elif "Contains Holocaust Denial" in video_page.text: 397 video["category"] = "moderated-holocaust-denial" 398 return (video, []) 399 400 elif "Harmful Activities" in video_page.text: 401 video["category"] = "moderated-harmful-activity" 402 return (video, []) 403 404 else: 405 video["category"] = "moderated-other" 406 self.log.warning("Unknown moderated reason for BitChute video %s" % video["id"]) 407 return (video, []) 408 409 elif "<iframe class=\"rumble\"" in video_page.text: 410 # some videos are actually embeds from rumble? 411 # these are iframes, so at the moment we cannot simply extract 412 # their info from the page, so we skip them. In the future we 413 # could add an extra request to get the relevant info, but so 414 # far the only examples I've seen are actually 'video not found' 415 video = { 416 **video, 417 "category": "error-embed-from-rumble" 418 } 419 return (video, []) 420 421 elif video_page.status_code != 200: 422 video = { 423 **video, 424 "category": "error-%i" % video_page.status_code 425 } 426 return (video, []) 427 428 soup = BeautifulSoup(video_page.text, 'html.parser') 429 video_csfrtoken = soup.findAll("input", {"name": "csrfmiddlewaretoken"})[0].get("value") 430 431 video["video_url"] = soup.select_one("video#player source").get("src") 432 video["thumbnail_image"] = soup.select_one("video#player").get("poster") 433 video["subject"] = soup.select_one("h1#video-title").text 434 video["author"] = soup.select_one("div.channel-banner p.name a").text 435 video["author_id"] = soup.select_one("div.channel-banner p.name a").get("href").split("/")[2] 436 video["body"] = soup.select_one("div#video-description").encode_contents().decode("utf-8").strip() 437 438 # we need *two more requests* to get the comment count and like/dislike counts 439 # this seems to be because bitchute uses a third-party comment widget 440 video_session.headers = {'Referer': video["url"], 'Origin': video["url"]} 441 counts = self.request_from_bitchute(video_session, "POST", "https://www.bitchute.com/video/%s/counts/" % video["id"], data={"csrfmiddlewaretoken": video_csfrtoken}) 442 443 if detail == "comments": 444 # if comments are also to be scraped, this is another request to make, which returns 445 # a convenient JSON response with all the comments to the video 446 # we need yet another token for this, which we can extract from a bit of inline 447 # javascript on the page 448 comment_script = None 449 for line in video_page.text.split("\n"): 450 if "initComments(" in line: 451 comment_script = line.split("initComments(")[1] 452 break 453 454 if not comment_script: 455 # no script to extract comments from, cannot load 456 comment_count = -1 457 else: 458 # make the request 459 comment_count = 0 460 url = comment_script.split("'")[1] 461 comment_csrf = comment_script.split("'")[3] 462 comments_data = self.request_from_bitchute(video_session, "POST", url + "/api/get_comments/", data={"cf_auth": comment_csrf, "commentCount": 0}) 463 464 for comment in comments_data: 465 comment_count += 1 466 467 if comment.get("profile_picture_url", None): 468 thumbnail_image = url + comment.get("profile_picture_url") 469 else: 470 thumbnail_image = "" 471 472 comments.append({ 473 "query_type": video["query_type"], 474 "query": video["query"], 475 "id": comment["id"], 476 "thread_id": video["id"], 477 "subject": "", 478 "body": comment["content"], 479 "author": comment["fullname"], 480 "author_id": comment["creator"], 481 "timestamp": 
int(dateparser.parse(comment["created"]).timestamp()), 482 "url": "", 483 "views": "", 484 "length": "", 485 "hashtags": "", 486 "thumbnail_image": thumbnail_image, 487 "likes": comment["upvote_count"], 488 "category": "comment", 489 "dislikes": "", 490 "channel_subscribers": "", 491 "comments": "", 492 "parent_id": comment.get("parent", "") if "parent" in comment else video["id"], 493 }) 494 495 else: 496 # if we don't need the full comments, we still need another request to get the *amount* 497 # of comments, 498 comment_count = self.request_from_bitchute(video_session, "POST", 499 "https://commentfreely.bitchute.com/api/get_comment_count/", 500 data={"csrfmiddlewaretoken": video_csfrtoken, 501 "cf_thread": "bc_" + video["id"]})["commentCount"] 502 503 except RuntimeError as e: 504 # we wrap this in one big try-catch because doing it for each request separarely is tedious 505 # hm... maybe this should be in a helper function 506 self.dataset.update_status("Error while interacting with BitChute (%s) - try again later." % e, 507 is_final=True) 508 return (None, None) 509 510 # again, no structured info available for the publication date, but at least we can extract the 511 # exact day it was uploaded 512 try: 513 published = dateparser.parse( 514 soup.find(class_="video-publish-date").text.split("published at")[1].strip()[:-1]) 515 except AttributeError as e: 516 # publication date not on page? 517 published = None 518 519 # merge data 520 video = { 521 **video, 522 "category": re.findall(r'<td><a href="/category/([^/]+)/"', video_page.text)[0], 523 "likes": counts["like_count"], 524 "dislikes": counts["dislike_count"], 525 "channel_subscribers": counts["subscriber_count"], 526 "comments": comment_count, 527 "parent_id": "", 528 "hashtags": ",".join([tag.text for tag in soup.select("#video-hashtags li a")]), 529 "views": counts["view_count"] 530 } 531 532 if published: 533 video["timestamp"] = int(published.timestamp()) 534 535 # may need to be increased? 
bitchute doesn't seem particularly strict 536 time.sleep(0.25) 537 return (video, comments) 538 539 def request_from_bitchute(self, session, method, url, headers=None, data=None): 540 """ 541 Request something via the BitChute API (or non-API) 542 543 To avoid having to write the same error-checking everywhere, this takes 544 care of retrying on failure, et cetera 545 546 :param session: Requests session 547 :param str method: GET or POST 548 :param str url: URL to fetch 549 :param dict header: Headers to pass with the request 550 :param dict data: Data/params to send with the request 551 552 :return: Requests response 553 """ 554 retries = 0 555 response = None 556 while retries < 3: 557 try: 558 if method.lower() == "post": 559 request = session.post(url, headers=headers, data=data) 560 elif method.lower() == "get": 561 request = session.get(url, headers=headers, params=data) 562 else: 563 raise NotImplemented() 564 565 if request.status_code >= 300: 566 raise ValueError("Response %i from BitChute for URL %s, need to retry" % (request.status_code, url)) 567 568 response = request.json() 569 return response 570 571 except (ConnectionResetError, requests.RequestException, ValueError) as e: 572 retries += 1 573 time.sleep(retries * 2) 574 575 except json.JSONDecodeError as e: 576 self.log.warning("Error decoding JSON: %s\n\n%s" % (e, request.text)) 577 578 if not response: 579 self.log.warning("Failed BitChute request to %s %i times, aborting" % (url, retries)) 580 raise RuntimeError() 581 582 return response 583 584 585 def validate_query(query, request, user): 586 """ 587 Validate BitChute query input 588 589 :param dict query: Query parameters, from client-side. 590 :param request: Flask request 591 :param User user: User object of user who has submitted the query 592 :return dict: Safe query parameters 593 """ 594 # no query 4 u 595 if not query.get("query", "").strip(): 596 raise QueryParametersException("You must provide a search query.") 597 598 # reformat queries to be a comma-separated list with no wrapping 599 # whitespace 600 items = query.get("query").replace("\n", ",") 601 if len(items.split(",")) > 15 and query.get("search_type") != "url": 602 raise QueryParametersException("You cannot query more than 15 items at a time.") 603 604 # simple! 605 return { 606 "items": query.get("max_posts"), 607 "query": items, 608 "scope": query.get("search_scope"), 609 "item_type": query.get("search_type") 610 }
class SearchBitChute(Search)

BitChute scraper
options =
{
    'intro': {
        'type': 'info',
        'help': "Videos are scraped in the order they are returned by [BitChute](https://bitchute.com)'s search function.\n\nYou can scrape results for up to **fifteen** items at a time. Separate the items with commas or blank lines. When searching for usernames, there is no need to include @ in front."
    },
    'search_type': {
        'type': 'choice',
        'help': 'Search by',
        'options': {'search': 'Search query', 'user': 'Username', 'url': 'Video URL or ID'},
        'default': 'search'
    },
    'query': {'type': 'textarea', 'help': 'Query'},
    'max_posts': {'type': 'string', 'help': 'Videos per item', 'min': 0, 'max': 2500, 'default': 10},
    'divider': {'type': 'divider'},
    'enrichment-info': {
        'type': 'info',
        'help': 'You can optionally scrape more details - exact publication date, likes, dislikes, category, comment count and channel subscriber count - for each video. Note that this takes a couple of seconds per video (which can add up!). Consider doing a basic query first and then repeating it with more details only if necessary.'
    },
    'search_scope': {
        'type': 'choice',
        'help': 'Search scope',
        'options': {'basic': 'Basic', 'detail': 'Detailed', 'comments': 'Detailed, also scrape video comments'},
        'default': 'basic'
    }
}
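These are the rendered option definitions; in the source, the 'type' values are UserInput constants (OPTION_INFO, OPTION_CHOICE, and so on). A hypothetical form submission matching these options could look as follows (the values are illustrative, not from the source):

    # Illustrative input for this datasource (keys match the option names above)
    example_input = {
        "search_type": "user",       # one of: search, user, url
        "query": "examplechannel",   # up to fifteen items, comma-separated
        "max_posts": 10,             # videos per item, 0-2500
        "search_scope": "detail",    # basic / detail / comments
    }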
def
get_items(self, query):
Run custom search
Fetches data from BitChute for either users or search queries
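The parameters read here are the sanitised ones stored with the dataset, in the shape produced by validate_query (see below). A minimal sketch of that dict, with illustrative values:

    # Shape of the dataset parameters consumed by get_items (values are illustrative)
    parameters = {
        "items": 10,             # becomes self.max_items
        "query": "cats,dogs",    # comma-separated items
        "scope": "basic",        # basic / detail / comments
        "item_type": "search",   # search / user / url
    }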
def
get_videos_id(self, session, video_id, csrftoken, detail):
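Scrape a single BitChute video by ID

A stub video record is built for the given ID and passed to append_details; since fetching the video detail page is needed anyway, a 'basic' detail level is upgraded to 'detail' so the extra metadata comes for free. get_items strips full video URLs down to the bare ID before calling this method.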
def
get_videos_user(self, session, user, csrftoken, detail):
Scrape videos for given BitChute user
Parameters
- session: HTTP Session to use
- str user: Username to scrape videos for
- str csrftoken: CSRF token to use for requests
- str detail: Detail level to scrape, basic/detail/comments
Returns
Video data dictionaries, as a generator
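Channel results are paged through the channel's 'extend/' endpoint. A minimal standalone sketch of one such request, mirroring what the method does (the channel name is made up; the CSRF token is scraped from a BitChute page first, as in get_items):

    import requests
    from bs4 import BeautifulSoup

    session = requests.Session()
    base_url = "https://www.bitchute.com/channel/examplechannel/"  # hypothetical channel

    # obtain the CSRF token by visiting the site first
    page = session.get("https://www.bitchute.com/search")
    csrftoken = BeautifulSoup(page.text, "html.parser").find(
        "input", {"name": "csrfmiddlewaretoken"})["value"]

    response = session.post(
        base_url + "extend/",
        data={"csrfmiddlewaretoken": csrftoken, "name": "", "offset": "0"},
        headers={"Referer": base_url, "Origin": "https://www.bitchute.com/"},
    )
    # the JSON response's "html" key holds the next batch of video listing markup
    listing_html = response.json()["html"]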
def
get_videos_query(self, session, query, csrftoken, detail):
Scrape videos for given BitChute search query
Parameters
- session: HTTP Session to use
- str query: Search query to scrape videos for
- str csrftoken: CSRF token to use for requests
- str detail: Detail level to scrape, basic/detail/comments
Returns
Video data dictionaries, as a generator
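The search goes through BitChute's search API; as the comment in the source notes, the CSRF parameter must be the first field in the form data or the request fails (Python dicts preserve insertion order, so building the dict in this order suffices). The payload and headers, as constructed in the method (the query string is illustrative):

    # Form data for https://www.bitchute.com/api/search/list/
    post_data = {
        "csrfmiddlewaretoken": csrftoken,  # must come first; obtained as in get_items
        "query": "example query",          # illustrative
        "kind": "video",
        "duration": "",
        "sort": "",
        "page": "0",                       # incremented after each batch
    }
    headers = {
        "Referer": "https://www.bitchute.com/search",
        "Origin": "https://www.bitchute.com/search",
    }
    # the JSON response carries "success", "count" and a "results" list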
def
append_details(self, video, detail):
    def append_details(self, video, detail):
        """
        Append extra metadata to video data

        Fetches the BitChute video detail page to scrape extra data for the given video.

        :param dict video:  Video details as scraped so far
        :param str detail:  Detail level. If 'comments', also scrape video comments.

        :return tuple:  Updated video data, and a list of scraped comments
        """
        comments = []

        video = {
            **video,
            "likes": "",
            "dislikes": "",
            "channel_subscribers": "",
            "comments": "",
            "hashtags": "",
            "parent_id": "",
            "video_url": ""
        }

        try:
            # to get more details per video, we need to request the actual video detail page
            # start a new session, to not interfere with the CSRF token from the search session
            video_session = requests.Session()
            video_page = video_session.get(video["url"])

            if "<h1 class=\"page-title\">Video Restricted</h1>" in video_page.text or \
                    "<h1 class=\"page-title\">Video Blocked</h1>" in video_page.text or \
                    "<h1 class=\"page-title\">Channel Blocked</h1>" in video_page.text or \
                    "<h1 class=\"page-title\">Channel Restricted</h1>" in video_page.text:
                if "This video is unavailable as the contents have been deemed potentially illegal" in video_page.text:
                    video["category"] = "moderated-illegal"
                    return (video, [])

                elif "Viewing of this video is restricted, as it has been marked as Not Safe For Life" in video_page.text:
                    video["category"] = "moderated-nsfl"
                    return (video, [])

                elif "Incitement to Hatred" in video_page.text:
                    video["category"] = "moderated-incitement-hatred"
                    return (video, [])

                elif "Threats or Incitement to Violence" in video_page.text:
                    video["category"] = "moderated-incitement-violence"
                    return (video, [])

                elif "Platform Misuse" in video_page.text:
                    video["category"] = "moderated-misuse"
                    return (video, [])

                elif "Terrorism & Violent Extremism" in video_page.text:
                    video["category"] = "moderated-terrorism-extremism"
                    return (video, [])

                elif "Copyright</h4>" in video_page.text:
                    video["category"] = "moderated-copyright"
                    return (video, [])

                elif "Contains Holocaust Denial" in video_page.text:
                    video["category"] = "moderated-holocaust-denial"
                    return (video, [])

                elif "Harmful Activities" in video_page.text:
                    video["category"] = "moderated-harmful-activity"
                    return (video, [])

                else:
                    video["category"] = "moderated-other"
                    self.log.warning("Unknown moderation reason for BitChute video %s" % video["id"])
                    return (video, [])

            elif "<iframe class=\"rumble\"" in video_page.text:
                # some videos are actually embeds from Rumble; these are iframes, so at the
                # moment we cannot simply extract their info from the page, and we skip them.
                # In the future we could add an extra request to get the relevant info, but so
                # far the only examples seen are actually 'video not found' pages
                video = {
                    **video,
                    "category": "error-embed-from-rumble"
                }
                return (video, [])

            elif video_page.status_code != 200:
                video = {
                    **video,
                    "category": "error-%i" % video_page.status_code
                }
                return (video, [])

            soup = BeautifulSoup(video_page.text, 'html.parser')
            video_csrftoken = soup.findAll("input", {"name": "csrfmiddlewaretoken"})[0].get("value")

            video["video_url"] = soup.select_one("video#player source").get("src")
            video["thumbnail_image"] = soup.select_one("video#player").get("poster")
            video["subject"] = soup.select_one("h1#video-title").text
            video["author"] = soup.select_one("div.channel-banner p.name a").text
            video["author_id"] = soup.select_one("div.channel-banner p.name a").get("href").split("/")[2]
            video["body"] = soup.select_one("div#video-description").encode_contents().decode("utf-8").strip()

            # we need *two more requests* to get the comment count and like/dislike counts;
            # this seems to be because BitChute uses a third-party comment widget
            video_session.headers = {'Referer': video["url"], 'Origin': video["url"]}
            counts = self.request_from_bitchute(video_session, "POST",
                                                "https://www.bitchute.com/video/%s/counts/" % video["id"],
                                                data={"csrfmiddlewaretoken": video_csrftoken})

            if detail == "comments":
                # if comments are also to be scraped, this is another request to make, which
                # returns a convenient JSON response with all the comments on the video.
                # we need yet another token for this, which we can extract from a bit of
                # inline javascript on the page
                comment_script = None
                for line in video_page.text.split("\n"):
                    if "initComments(" in line:
                        comment_script = line.split("initComments(")[1]
                        break

                if not comment_script:
                    # no script to extract comments from, cannot load
                    comment_count = -1
                else:
                    # make the request
                    comment_count = 0
                    url = comment_script.split("'")[1]
                    comment_csrf = comment_script.split("'")[3]
                    comments_data = self.request_from_bitchute(video_session, "POST", url + "/api/get_comments/",
                                                               data={"cf_auth": comment_csrf, "commentCount": 0})

                    for comment in comments_data:
                        comment_count += 1

                        if comment.get("profile_picture_url", None):
                            thumbnail_image = url + comment.get("profile_picture_url")
                        else:
                            thumbnail_image = ""

                        comments.append({
                            "query_type": video["query_type"],
                            "query": video["query"],
                            "id": comment["id"],
                            "thread_id": video["id"],
                            "subject": "",
                            "body": comment["content"],
                            "author": comment["fullname"],
                            "author_id": comment["creator"],
                            "timestamp": int(dateparser.parse(comment["created"]).timestamp()),
                            "url": "",
                            "views": "",
                            "length": "",
                            "hashtags": "",
                            "thumbnail_image": thumbnail_image,
                            "likes": comment["upvote_count"],
                            "category": "comment",
                            "dislikes": "",
                            "channel_subscribers": "",
                            "comments": "",
                            "parent_id": comment["parent"] if "parent" in comment else video["id"],
                        })

            else:
                # if we don't need the full comments, we still need another request to get
                # the *amount* of comments
                comment_count = self.request_from_bitchute(video_session, "POST",
                                                           "https://commentfreely.bitchute.com/api/get_comment_count/",
                                                           data={"csrfmiddlewaretoken": video_csrftoken,
                                                                 "cf_thread": "bc_" + video["id"]})["commentCount"]

        except RuntimeError as e:
            # we wrap this in one big try-catch because doing it for each request separately
            # is tedious; request_from_bitchute raises RuntimeError once it is out of retries
            self.dataset.update_status("Error while interacting with BitChute (%s) - try again later." % e,
                                       is_final=True)
            return (None, None)

        # again, no structured info available for the publication date, but at least we can
        # extract the exact day it was uploaded
        try:
            published = dateparser.parse(
                soup.find(class_="video-publish-date").text.split("published at")[1].strip()[:-1])
        except (AttributeError, IndexError):
            # publication date not on page?
            published = None

        # merge data; guard against pages without a category link
        category = re.findall(r'<td><a href="/category/([^/]+)/"', video_page.text)
        video = {
            **video,
            "category": category[0] if category else "",
            "likes": counts["like_count"],
            "dislikes": counts["dislike_count"],
            "channel_subscribers": counts["subscriber_count"],
            "comments": comment_count,
            "parent_id": "",
            "hashtags": ",".join([tag.text for tag in soup.select("#video-hashtags li a")]),
            "views": counts["view_count"]
        }

        if published:
            video["timestamp"] = int(published.timestamp())

        # may need to be increased? BitChute doesn't seem particularly strict
        time.sleep(0.25)
        return (video, comments)
Append extra metadata to video data
Fetches the BitChute video detail page to scrape extra data for the given video.
Parameters
- dict video: Video details as scraped so far
- str detail: Detail level. If 'comments', also scrape video comments.
Returns
Tuple: the updated video data, and a list of scraped comments
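To illustrate the calling convention, a hypothetical usage sketch follows. The video stub mirrors the dummy dictionary built in get_videos_id; `searcher` stands in for an initialized SearchBitChute worker (which 4CAT normally constructs itself) and the video ID is made up:

# hypothetical sketch: `searcher` is an initialized SearchBitChute worker and
# "exampleid" a made-up video ID; 4CAT normally drives this via get_items()
video_stub = {
    "query_type": "video id",
    "query": "exampleid",
    "id": "exampleid",
    "thread_id": "exampleid",
    "subject": "", "body": "", "author": "", "author_id": "",
    "timestamp": None,
    "url": "https://www.bitchute.com/video/exampleid/",
    "views": None, "length": None, "thumbnail_image": None,
}

video, comments = searcher.append_details(video_stub, detail="comments")
if video is None:
    print("unrecoverable error while scraping details")  # method returned (None, None)
else:
    print(video["category"], "with", len(comments), "comments")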
def request_from_bitchute(self, session, method, url, headers=None, data=None):
    def request_from_bitchute(self, session, method, url, headers=None, data=None):
        """
        Request something via the BitChute API (or non-API)

        To avoid having to write the same error-checking everywhere, this takes
        care of retrying on failure, et cetera.

        :param session:  Requests session
        :param str method:  GET or POST
        :param str url:  URL to fetch
        :param dict headers:  Headers to pass with the request
        :param dict data:  Data/params to send with the request

        :return:  Parsed JSON response
        """
        retries = 0
        response = None
        while retries < 3:
            try:
                if method.lower() == "post":
                    request = session.post(url, headers=headers, data=data)
                elif method.lower() == "get":
                    request = session.get(url, headers=headers, params=data)
                else:
                    raise NotImplementedError()

                if request.status_code >= 300:
                    raise ValueError("Response %i from BitChute for URL %s, need to retry" % (request.status_code, url))

                response = request.json()
                return response

            except json.JSONDecodeError as e:
                # this clause must come before the generic one below, because
                # json.JSONDecodeError is a subclass of ValueError and would
                # otherwise never be reached
                self.log.warning("Error decoding JSON: %s\n\n%s" % (e, request.text))
                retries += 1
                time.sleep(retries * 2)

            except (ConnectionResetError, requests.RequestException, ValueError):
                retries += 1
                time.sleep(retries * 2)

        if not response:
            self.log.warning("Failed BitChute request to %s %i times, aborting" % (url, retries))
            raise RuntimeError()

        return response
Request something via the BitChute API (or non-API)
To avoid having to write the same error-checking everywhere, this takes care of retrying on failure, et cetera
Parameters
- session: Requests session
- str method: GET or POST
- str url: URL to fetch
- dict headers: Headers to pass with the request
- dict data: Data/params to send with the request
Returns
Parsed JSON response
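The retry logic documented here is a generic pattern rather than anything BitChute-specific. For reference, a self-contained sketch of the same idea (bounded retries with linear backoff), written independently of this class; the function name and defaults are illustrative, not part of 4CAT:

import json
import time

import requests


def fetch_json_with_retries(session, url, method="get", data=None, headers=None, max_retries=3):
    """Sketch of the retry pattern used by request_from_bitchute."""
    for attempt in range(1, max_retries + 1):
        try:
            if method.lower() == "post":
                response = session.post(url, headers=headers, data=data)
            else:
                response = session.get(url, headers=headers, params=data)

            # treat redirects and error statuses as retryable failures, like the method above
            if response.status_code >= 300:
                raise ValueError("Response %i from %s" % (response.status_code, url))

            return response.json()
        except (json.JSONDecodeError, ConnectionResetError, requests.RequestException, ValueError):
            # back off a little longer after each failed attempt
            time.sleep(attempt * 2)

    raise RuntimeError("Failed request to %s after %i attempts" % (url, max_retries))

One could call this as, say, fetch_json_with_retries(requests.Session(), "https://www.bitchute.com/video/exampleid/counts/", method="post") with a made-up video ID; the linear backoff (2s, 4s, 6s) matches the sleep schedule in the method above.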
def validate_query(query, request, user):
    def validate_query(query, request, user):
        """
        Validate BitChute query input

        :param dict query:  Query parameters, from client-side.
        :param request:  Flask request
        :param User user:  User object of user who has submitted the query
        :return dict:  Safe query parameters
        """
        # no query 4 u
        if not query.get("query", "").strip():
            raise QueryParametersException("You must provide a search query.")

        # reformat queries to be a comma-separated list; individual items are
        # stripped of wrapping whitespace later, in get_items
        items = query.get("query").replace("\n", ",")
        if len(items.split(",")) > 15 and query.get("search_type") != "url":
            raise QueryParametersException("You cannot query more than 15 items at a time.")

        # simple!
        return {
            "items": query.get("max_posts"),
            "query": items,
            "scope": query.get("search_scope"),
            "item_type": query.get("search_type")
        }
Validate BitChute query input
Parameters
- dict query: Query parameters, from client-side.
- request: Flask request
- User user: User object of user who has submitted the query
Returns
Safe query parameters
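For illustration, a hedged example of what this validation accepts and returns, assuming validate_query is exposed as a static method (its self-less signature suggests so) and using a made-up client-side query dict; the request and user arguments are unused by the validation logic itself, so None stands in here:

from common.lib.exceptions import QueryParametersException

raw_query = {
    "query": "cats\ndogs",        # newlines are folded into commas
    "max_posts": 10,
    "search_scope": "basic",
    "search_type": "search",
}

try:
    safe = SearchBitChute.validate_query(raw_query, request=None, user=None)
    # safe == {"items": 10, "query": "cats,dogs", "scope": "basic", "item_type": "search"}
except QueryParametersException as error:
    print("Invalid query:", error)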
Inherited Members
- backend.lib.worker.BasicWorker
- BasicWorker
- INTERRUPT_NONE
- INTERRUPT_RETRY
- INTERRUPT_CANCEL
- queue
- log
- manager
- interrupted
- modules
- init_time
- name
- run
- clean_up
- request_interrupt
- is_4cat_class
- backend.lib.search.Search
- prefix
- return_cols
- import_error_count
- import_warning_count
- process
- search
- import_from_file
- items_to_csv
- items_to_ndjson
- items_to_archive
- backend.lib.processor.BasicProcessor
- db
- job
- dataset
- owner
- source_dataset
- source_file
- config
- is_running_in_preset
- filepath
- work
- after_process
- remove_files
- abort
- add_field_to_parent
- iterate_archive_contents
- unpack_archive_contents
- extract_archived_file_by_name
- write_csv_items_and_finish
- write_archive_and_finish
- create_standalone
- map_item_method_available
- get_mapped_item
- is_filter
- get_options
- get_status
- is_top_dataset
- is_from_collector
- get_extension
- is_rankable
- exclude_followup_processors
- is_4cat_processor