datasources.bitchute.search_bitchute
Search BitChute
Scrape BitChute videos via the BitChute web API
1""" 2Search Bitchute 3 4Scrape Bitchute videos via the Bitchute web API 5""" 6import dateparser 7import requests 8import json 9import time 10import re 11 12from itertools import chain 13from bs4 import BeautifulSoup 14 15from common.lib.helpers import UserInput, strip_tags 16from backend.lib.search import Search 17from common.lib.exceptions import QueryParametersException, ProcessorInterruptedException 18 19 20class SearchBitChute(Search): 21 """ 22 BitChute scraper 23 """ 24 type = "bitchute-search" # job ID 25 category = "Search" # category 26 title = "Search BitChute" # title displayed in UI 27 description = "Retrieve BitChute videos" # description displayed in UI 28 extension = "csv" # extension of result file, used internally and in UI 29 is_local = False # Whether this datasource is locally scraped 30 is_static = False # Whether this datasource is still updated 31 32 # not available as a processor for existing datasets 33 accepts = [None] 34 35 # let's not get rate limited 36 max_workers = 1 37 38 # scraping happens in one or the other method, so keep track of this internally 39 max_items = 0 40 41 options = { 42 "intro": { 43 "type": UserInput.OPTION_INFO, 44 "help": "Videos are scraped in the order they are returned by [BitChute](https://bitchute.com)'s search " 45 "function.\n\nYou can scrape results for up to **fifteen** items at a time. Separate the items " 46 "with commas or blank lines. When searching for usernames, there is no need to include @ in front." 47 }, 48 "search_type": { 49 "type": UserInput.OPTION_CHOICE, 50 "help": "Search by", 51 "options": { 52 "search": "Search query", 53 "user": "Username", 54 "url": "Video URL or ID" 55 }, 56 "default": "search" 57 }, 58 "query": { 59 "type": UserInput.OPTION_TEXT_LARGE, 60 "help": "Query" 61 }, 62 "max_posts": { 63 "type": UserInput.OPTION_TEXT, 64 "help": "Videos per item", 65 "min": 0, 66 "max": 2500, 67 "default": 10 68 }, 69 "divider": { 70 "type": UserInput.OPTION_DIVIDER 71 }, 72 "enrichment-info": { 73 "type": UserInput.OPTION_INFO, 74 "help": "You can optionally scrape more details - exact publication date, likes, dislikes, category, " 75 "comment count and channel subscriber count - for each video. Note that this takes a couple of " 76 "seconds per video (which can add up!). Consider doing a basic query first and then repeating it " 77 "with more details only if necessary." 78 }, 79 "search_scope": { 80 "type": UserInput.OPTION_CHOICE, 81 "help": "Search scope", 82 "options": { 83 "basic": "Basic", 84 "detail": "Detailed", 85 "comments": "Detailed, also scrape video comments" 86 }, 87 "default": "basic" 88 } 89 90 } 91 92 def get_items(self, query): 93 """ 94 Run custom search 95 96 Fetches data from BitChute for either users or search queries 97 """ 98 # ready our parameters 99 parameters = self.dataset.get_parameters() 100 self.max_items = parameters.get("items", 100) 101 queries = [query.strip() for query in parameters.get("query", "").split(",")] 102 num_query = 0 103 detail = parameters.get("scope", "basic") 104 query_type = parameters.get("item_type", "search") 105 106 # bitchute uses a CSRF cookie that needs to be included on each request. 
The only way to obtain it is by 107 # visiting the site, so do just that and extract the CSRF token from the page: 108 session = requests.Session() 109 session.headers["User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0" 110 request = session.get("https://www.bitchute.com/search") 111 csrftoken = BeautifulSoup(request.text, 'html.parser').findAll("input", {"name": "csrfmiddlewaretoken"})[0].get( 112 "value") 113 time.sleep(0.25) 114 115 self.dataset.update_status("Querying BitChute") 116 results = [] 117 for query in queries: 118 num_query += 1 119 query = query.strip() 120 121 if query_type == "search": 122 results.append(self.get_videos_query(session, query, csrftoken, detail)) 123 elif query_type == "url": 124 if "/video/" in query: 125 query = query.split("/video/")[1].split("/")[0] 126 # else assume bare ID 127 128 self.dataset.update_status("Getting details for video '%s' (%i/%i)" % (query, num_query, len(queries))) 129 results.append(self.get_videos_id(session, query, csrftoken, detail)) 130 else: 131 results.append(self.get_videos_user(session, query, csrftoken, detail)) 132 133 self.dataset.update_progress(num_query / len(queries)) 134 135 return chain(*results) 136 137 def get_videos_id(self, session, video_id, csrftoken, detail): 138 dummy_video = { 139 "query_type": "video id", 140 "query": video_id, 141 "id": video_id, 142 "thread_id": video_id, 143 "subject": "", 144 "body": "", 145 "author": "", 146 "author_id": "", 147 "timestamp": None, 148 "url": "https://www.bitchute.com/video/" + video_id + "/", 149 "views": None, 150 "length": None, 151 "thumbnail_image": None, 152 153 } 154 155 # we can't use the BitChute search, so do one request per URL, and 156 # get details for 'free' 157 if detail == "basic": 158 detail = "detail" 159 160 video, comments = self.append_details(dummy_video, detail) 161 if not video: 162 # unrecoverable error while scraping details 163 return 164 165 yield video 166 for comment in comments: 167 # these need to be yielded *after* the video because else the result file will have the comments 168 # before the video, which is weird 169 yield comment 170 171 def get_videos_user(self, session, user, csrftoken, detail): 172 """ 173 Scrape videos for given BitChute user 174 175 :param session: HTTP Session to use 176 :param str user: Username to scrape videos for 177 :param str csrftoken: CSRF token to use for requests 178 :param str detail: Detail level to scrape, basic/detail/comments 179 180 :return: Video data dictionaries, as a generator 181 """ 182 offset = 0 183 num_items = 0 184 base_url = "https://www.bitchute.com/channel/%s/" % user 185 url = base_url + "extend/" 186 187 container = session.get(base_url) 188 container_soup = BeautifulSoup(container.text, 'html.parser') 189 headers = {'Referer': base_url, 'Origin': "https://www.bitchute.com/"} 190 191 while True: 192 self.dataset.update_status("Retrieved %i items for query '%s'" % (num_items, user)) 193 194 if self.interrupted: 195 raise ProcessorInterruptedException("Interrupted while scraping BitChute") 196 197 post_data = {"csrfmiddlewaretoken": csrftoken, "name": "", "offset": str(offset)} 198 199 try: 200 self.dataset.log("Fetching data for BitChute video %s" % url) 201 request = session.post(url, data=post_data, headers=headers) 202 if request.status_code != 200: 203 raise ConnectionError() 204 response = request.json() 205 except (json.JSONDecodeError, requests.RequestException, ConnectionError) as e: 206 self.dataset.update_status("Error while 
interacting with BitChute (%s) - try again later." % e, is_final=True) 207 return 208 209 soup = BeautifulSoup(response["html"], 'html.parser') 210 videos = soup.select(".channel-videos-container") 211 comments = [] 212 213 if len(videos) == 0 or num_items >= self.max_items: 214 break 215 216 for video_element in videos: 217 if num_items >= self.max_items: 218 break 219 else: 220 num_items += 1 221 222 offset += 1 223 224 link = video_element.select_one(".channel-videos-title a") 225 video = { 226 "query_type": "user id", 227 "query": user, 228 "id": link["href"].split("/")[-2], 229 "thread_id": link["href"].split("/")[-2], 230 "subject": link.text, 231 "body": strip_tags(video_element.select_one(".channel-videos-text").text), 232 "author": container_soup.select_one(".details .name a").text, 233 "author_id": container_soup.select_one(".details .name a")["href"].split("/")[2], 234 "timestamp": int( 235 dateparser.parse( 236 video_element.select_one(".channel-videos-details.text-right.hidden-xs").text).timestamp()), 237 "url": "https://www.bitchute.com" + link["href"], 238 "views": video_element.select_one(".video-views").text.strip(), 239 "length": video_element.select_one(".video-duration").text.strip(), 240 "thumbnail_image": video_element.select_one(".channel-videos-image img")["src"], 241 } 242 243 if detail != "basic": 244 video, comments = self.append_details(video, detail) 245 if not video: 246 # unrecoverable error while scraping details 247 return 248 249 yield video 250 for comment in comments: 251 # these need to be yielded *after* the video because else the result file will have the comments 252 # before the video, which is weird 253 yield comment 254 255 256 def get_videos_query(self, session, query, csrftoken, detail): 257 """ 258 Scrape videos for given BitChute search query 259 260 :param session: HTTP Session to use 261 :param str user: Search query to scrape videos for 262 :param str csrftoken: CSRF token to use for requests 263 :param str detail: Detail level to scrape, basic/detail/comments 264 265 :return: Video data dictionaries, as a generator 266 """ 267 page = 0 268 num_items = 0 269 while True: 270 self.dataset.update_status("Retrieved %i items for query '%s'" % (num_items, query)) 271 272 if self.interrupted: 273 raise ProcessorInterruptedException("Interrupted while scraping BitChute") 274 275 # prepare the request - the CSRF param *must* be the first or the request will fail 276 post_data = {"csrfmiddlewaretoken": csrftoken, "query": query, "kind": "video", "duration": "", 277 "sort": "", "page": str(page)} 278 headers = {'Referer': "https://www.bitchute.com/search", 'Origin': "https://www.bitchute.com/search"} 279 response = self.request_from_bitchute(session, "POST", "https://www.bitchute.com/api/search/list/", headers, post_data) 280 281 if not response["success"] or response["count"] == 0 or num_items >= self.max_items: 282 break 283 284 comments = [] 285 for video_data in response["results"]: 286 if num_items >= self.max_items: 287 break 288 else: 289 num_items += 1 290 291 # note: deleted videos will have a published date of 'None'. To 292 # avoid crashing the backend the easiest way is to set it to something 293 # that is obviously not a valid date in this context. 
294 if video_data["published"] is None: 295 video_data["published"] = "1970-01-01" 296 # this is only included as '5 months ago' and so forth, not exact date 297 # so use dateparser to at least approximate the date 298 try: 299 dt = dateparser.parse(video_data["published"]) 300 except Exception as e: 301 self.log.warning('dateparser.parse error reading video_data["published"]: %s' % str(video_data['published'])) 302 raise e 303 304 video = { 305 "query_type": "search", 306 "query": query, 307 "id": video_data["id"], 308 "thread_id": video_data["id"], 309 "subject": video_data["name"], 310 "body": video_data["description"], 311 "author": video_data["channel_name"], 312 "author_id": video_data["channel_path"].split("/")[2], 313 "timestamp": int(dt.timestamp()), 314 "url": "https://www.bitchute.com" + video_data["path"], 315 "views": video_data["views"], 316 "length": video_data["duration"], 317 "thumbnail_image": video_data["images"]["thumbnail"] 318 } 319 320 if detail != "basic": 321 video, comments = self.append_details(video, detail) 322 if not video: 323 # unrecoverable error while scraping details 324 return 325 326 yield video 327 for comment in comments: 328 # these need to be yielded *after* the video because else the result file will have the comments 329 # before the video, which is weird 330 yield comment 331 332 page += 1 333 334 def append_details(self, video, detail): 335 """ 336 Append extra metadata to video data 337 338 Fetches the BitChute video detail page to scrape extra data for the given video. 339 340 :param dict video: Video details as scraped so far 341 :param str detail: Detail level. If 'comments', also scrape video comments. 342 343 :return dict: Tuple, first item: updated video data, second: list of comments 344 """ 345 comments = [] 346 347 video = { 348 **video, 349 "likes": "", 350 "dislikes": "", 351 "channel_subscribers": "", 352 "comments": "", 353 "hashtags": "", 354 "parent_id": "", 355 "video_url": "" 356 } 357 358 try: 359 # to get more details per video, we need to request the actual video detail page 360 # start a new session, to not interfere with the CSRF token from the search session 361 video_session = requests.session() 362 video_page = video_session.get(video["url"]) 363 364 if "<h1 class=\"page-title\">Video Restricted</h1>" in video_page.text or \ 365 "<h1 class=\"page-title\">Video Blocked</h1>" in video_page.text or \ 366 "<h1 class=\"page-title\">Channel Blocked</h1>" in video_page.text or \ 367 "<h1 class=\"page-title\">Channel Restricted</h1>" in video_page.text: 368 if "This video is unavailable as the contents have been deemed potentially illegal" in video_page.text: 369 video["category"] = "moderated-illegal" 370 return (video, []) 371 372 elif "Viewing of this video is restricted, as it has been marked as Not Safe For Life" in video_page.text: 373 video["category"] = "moderated-nsfl" 374 return (video, []) 375 376 elif "Incitement to Hatred" in video_page.text: 377 video["category"] = "moderated-incitement-hatred" 378 return (video, []) 379 380 elif "Threats or Incitement to Violence" in video_page.text: 381 video["category"] = "moderated-incitement-violence" 382 return (video, []) 383 384 elif "Platform Misuse" in video_page.text: 385 video["category"] = "moderated-misuse" 386 return (video, []) 387 388 elif "Terrorism & Violent Extremism" in video_page.text: 389 video["category"] = "moderated-terrorism-extremism" 390 return (video, []) 391 392 elif "Copyright</h4>" in video_page.text: 393 video["category"] = "moderated-copyright" 
394 return (video, []) 395 396 elif "Contains Holocaust Denial" in video_page.text: 397 video["category"] = "moderated-holocaust-denial" 398 return (video, []) 399 400 elif "Harmful Activities" in video_page.text: 401 video["category"] = "moderated-harmful-activity" 402 return (video, []) 403 404 else: 405 video["category"] = "moderated-other" 406 self.log.warning("Unknown moderated reason for BitChute video %s" % video["id"]) 407 return (video, []) 408 409 elif "<iframe class=\"rumble\"" in video_page.text: 410 # some videos are actually embeds from rumble? 411 # these are iframes, so at the moment we cannot simply extract 412 # their info from the page, so we skip them. In the future we 413 # could add an extra request to get the relevant info, but so 414 # far the only examples I've seen are actually 'video not found' 415 video = { 416 **video, 417 "category": "error-embed-from-rumble" 418 } 419 return (video, []) 420 421 elif video_page.status_code != 200: 422 video = { 423 **video, 424 "category": "error-%i" % video_page.status_code 425 } 426 return (video, []) 427 428 soup = BeautifulSoup(video_page.text, 'html.parser') 429 video_csfrtoken = soup.findAll("input", {"name": "csrfmiddlewaretoken"})[0].get("value") 430 431 video["video_url"] = soup.select_one("video#player source").get("src") 432 video["thumbnail_image"] = soup.select_one("video#player").get("poster") 433 video["subject"] = soup.select_one("h1#video-title").text 434 video["author"] = soup.select_one("div.channel-banner p.name a").text 435 video["author_id"] = soup.select_one("div.channel-banner p.name a").get("href").split("/")[2] 436 video["body"] = soup.select_one("div#video-description").encode_contents().decode("utf-8").strip() 437 438 # we need *two more requests* to get the comment count and like/dislike counts 439 # this seems to be because bitchute uses a third-party comment widget 440 video_session.headers = {'Referer': video["url"], 'Origin': video["url"]} 441 counts = self.request_from_bitchute(video_session, "POST", "https://www.bitchute.com/video/%s/counts/" % video["id"], data={"csrfmiddlewaretoken": video_csfrtoken}) 442 443 if detail == "comments": 444 # if comments are also to be scraped, this is another request to make, which returns 445 # a convenient JSON response with all the comments to the video 446 # we need yet another token for this, which we can extract from a bit of inline 447 # javascript on the page 448 comment_script = None 449 for line in video_page.text.split("\n"): 450 if "initComments(" in line: 451 comment_script = line.split("initComments(")[1] 452 break 453 454 if not comment_script: 455 # no script to extract comments from, cannot load 456 comment_count = -1 457 else: 458 # make the request 459 comment_count = 0 460 url = comment_script.split("'")[1] 461 comment_csrf = comment_script.split("'")[3] 462 comments_data = self.request_from_bitchute(video_session, "POST", url + "/api/get_comments/", data={"cf_auth": comment_csrf, "commentCount": 0}) 463 464 for comment in comments_data: 465 comment_count += 1 466 467 if comment.get("profile_picture_url", None): 468 thumbnail_image = url + comment.get("profile_picture_url") 469 else: 470 thumbnail_image = "" 471 472 comments.append({ 473 "query_type": video["query_type"], 474 "query": video["query"], 475 "id": comment["id"], 476 "thread_id": video["id"], 477 "subject": "", 478 "body": comment["content"], 479 "author": comment["fullname"], 480 "author_id": comment["creator"], 481 "timestamp": 
int(dateparser.parse(comment["created"]).timestamp()), 482 "url": "", 483 "views": "", 484 "length": "", 485 "hashtags": "", 486 "thumbnail_image": thumbnail_image, 487 "likes": comment["upvote_count"], 488 "category": "comment", 489 "dislikes": "", 490 "channel_subscribers": "", 491 "comments": "", 492 "parent_id": comment.get("parent", "") if "parent" in comment else video["id"], 493 }) 494 495 else: 496 # if we don't need the full comments, we still need another request to get the *amount* 497 # of comments, 498 comment_count = self.request_from_bitchute(video_session, "POST", 499 "https://commentfreely.bitchute.com/api/get_comment_count/", 500 data={"csrfmiddlewaretoken": video_csfrtoken, 501 "cf_thread": "bc_" + video["id"]})["commentCount"] 502 503 except RuntimeError as e: 504 # we wrap this in one big try-catch because doing it for each request separarely is tedious 505 # hm... maybe this should be in a helper function 506 self.dataset.update_status("Error while interacting with BitChute (%s) - try again later." % e, 507 is_final=True) 508 return (None, None) 509 510 # again, no structured info available for the publication date, but at least we can extract the 511 # exact day it was uploaded 512 try: 513 published = dateparser.parse( 514 soup.find(class_="video-publish-date").text.split("published at")[1].strip()[:-1]) 515 except AttributeError as e: 516 # publication date not on page? 517 published = None 518 519 # merge data 520 video = { 521 **video, 522 "category": re.findall(r'<td><a href="/category/([^/]+)/"', video_page.text)[0], 523 "likes": counts["like_count"], 524 "dislikes": counts["dislike_count"], 525 "channel_subscribers": counts["subscriber_count"], 526 "comments": comment_count, 527 "parent_id": "", 528 "hashtags": ",".join([tag.text for tag in soup.select("#video-hashtags li a")]), 529 "views": counts["view_count"] 530 } 531 532 if published: 533 video["timestamp"] = int(published.timestamp()) 534 535 # may need to be increased? 
bitchute doesn't seem particularly strict 536 time.sleep(0.25) 537 return (video, comments) 538 539 def request_from_bitchute(self, session, method, url, headers=None, data=None): 540 """ 541 Request something via the BitChute API (or non-API) 542 543 To avoid having to write the same error-checking everywhere, this takes 544 care of retrying on failure, et cetera 545 546 :param session: Requests session 547 :param str method: GET or POST 548 :param str url: URL to fetch 549 :param dict header: Headers to pass with the request 550 :param dict data: Data/params to send with the request 551 552 :return: Requests response 553 """ 554 retries = 0 555 response = None 556 while retries < 3: 557 try: 558 if method.lower() == "post": 559 request = session.post(url, headers=headers, data=data) 560 elif method.lower() == "get": 561 request = session.get(url, headers=headers, params=data) 562 else: 563 raise NotImplemented() 564 565 if request.status_code >= 300: 566 raise ValueError("Response %i from BitChute for URL %s, need to retry" % (request.status_code, url)) 567 568 response = request.json() 569 return response 570 571 except (ConnectionResetError, requests.RequestException, ValueError) as e: 572 retries += 1 573 time.sleep(retries * 2) 574 575 except json.JSONDecodeError as e: 576 self.log.warning("Error decoding JSON: %s\n\n%s" % (e, request.text)) 577 578 if not response: 579 self.log.warning("Failed BitChute request to %s %i times, aborting" % (url, retries)) 580 raise RuntimeError() 581 582 return response 583 584 585 def validate_query(query, request, user): 586 """ 587 Validate BitChute query input 588 589 :param dict query: Query parameters, from client-side. 590 :param request: Flask request 591 :param User user: User object of user who has submitted the query 592 :return dict: Safe query parameters 593 """ 594 # no query 4 u 595 if not query.get("query", "").strip(): 596 raise QueryParametersException("You must provide a search query.") 597 598 # reformat queries to be a comma-separated list with no wrapping 599 # whitespace 600 items = query.get("query").replace("\n", ",") 601 if len(items.split(",")) > 15 and query.get("search_type") != "url": 602 raise QueryParametersException("You cannot query more than 15 items at a time.") 603 604 # simple! 605 return { 606 "items": query.get("max_posts"), 607 "query": items, 608 "scope": query.get("search_scope"), 609 "item_type": query.get("search_type") 610 }
class SearchBitChute(Search)

BitChute scraper
options =
{
    'intro': {
        'type': 'info',
        'help': "Videos are scraped in the order they are returned by [BitChute](https://bitchute.com)'s search function.\n\nYou can scrape results for up to **fifteen** items at a time. Separate the items with commas or blank lines. When searching for usernames, there is no need to include @ in front."
    },
    'search_type': {
        'type': 'choice',
        'help': 'Search by',
        'options': {'search': 'Search query', 'user': 'Username', 'url': 'Video URL or ID'},
        'default': 'search'
    },
    'query': {'type': 'textarea', 'help': 'Query'},
    'max_posts': {'type': 'string', 'help': 'Videos per item', 'min': 0, 'max': 2500, 'default': 10},
    'divider': {'type': 'divider'},
    'enrichment-info': {
        'type': 'info',
        'help': 'You can optionally scrape more details - exact publication date, likes, dislikes, category, comment count and channel subscriber count - for each video. Note that this takes a couple of seconds per video (which can add up!). Consider doing a basic query first and then repeating it with more details only if necessary.'
    },
    'search_scope': {
        'type': 'choice',
        'help': 'Search scope',
        'options': {'basic': 'Basic', 'detail': 'Detailed', 'comments': 'Detailed, also scrape video comments'},
        'default': 'basic'
    }
}
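These are the rendered option definitions; in the source, the 'type' values are UserInput constants (OPTION_INFO, OPTION_CHOICE, and so on). A hypothetical form submission matching these options could look as follows (the values are illustrative, not from the source):

    # Illustrative input for this datasource (keys match the option names above)
    example_input = {
        "search_type": "user",       # one of: search, user, url
        "query": "examplechannel",   # up to fifteen items, comma-separated
        "max_posts": 10,             # videos per item, 0-2500
        "search_scope": "detail",    # basic / detail / comments
    }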
def
get_items(self, query):
Run custom search
Fetches data from BitChute for either users or search queries
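The parameters read here are the sanitised ones stored with the dataset, in the shape produced by validate_query (see below). A minimal sketch of that dict, with illustrative values:

    # Shape of the dataset parameters consumed by get_items (values are illustrative)
    parameters = {
        "items": 10,             # becomes self.max_items
        "query": "cats,dogs",    # comma-separated items
        "scope": "basic",        # basic / detail / comments
        "item_type": "search",   # search / user / url
    }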
def
get_videos_id(self, session, video_id, csrftoken, detail):
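Scrape a single BitChute video by ID

A stub video record is built for the given ID and passed to append_details; since fetching the video detail page is needed anyway, a 'basic' detail level is upgraded to 'detail' so the extra metadata comes for free. get_items strips full video URLs down to the bare ID before calling this method.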
def
get_videos_user(self, session, user, csrftoken, detail):
Scrape videos for given BitChute user
Parameters
- session: HTTP Session to use
- str user: Username to scrape videos for
- str csrftoken: CSRF token to use for requests
- str detail: Detail level to scrape, basic/detail/comments
Returns
Video data dictionaries, as a generator
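Channel results are paged through the channel's 'extend/' endpoint. A minimal standalone sketch of one such request, mirroring what the method does (the channel name is made up; the CSRF token is scraped from a BitChute page first, as in get_items):

    import requests
    from bs4 import BeautifulSoup

    session = requests.Session()
    base_url = "https://www.bitchute.com/channel/examplechannel/"  # hypothetical channel

    # obtain the CSRF token by visiting the site first
    page = session.get("https://www.bitchute.com/search")
    csrftoken = BeautifulSoup(page.text, "html.parser").find(
        "input", {"name": "csrfmiddlewaretoken"})["value"]

    response = session.post(
        base_url + "extend/",
        data={"csrfmiddlewaretoken": csrftoken, "name": "", "offset": "0"},
        headers={"Referer": base_url, "Origin": "https://www.bitchute.com/"},
    )
    # the JSON response's "html" key holds the next batch of video listing markup
    listing_html = response.json()["html"]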
def
get_videos_query(self, session, query, csrftoken, detail):
Scrape videos for given BitChute search query
Parameters
- session: HTTP Session to use
- str query: Search query to scrape videos for
- str csrftoken: CSRF token to use for requests
- str detail: Detail level to scrape, basic/detail/comments
Returns
Video data dictionaries, as a generator
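The search goes through BitChute's search API; as the comment in the source notes, the CSRF parameter must be the first field in the form data or the request fails (Python dicts preserve insertion order, so building the dict in this order suffices). The payload and headers, as constructed in the method (the query string is illustrative):

    # Form data for https://www.bitchute.com/api/search/list/
    post_data = {
        "csrfmiddlewaretoken": csrftoken,  # must come first; obtained as in get_items
        "query": "example query",          # illustrative
        "kind": "video",
        "duration": "",
        "sort": "",
        "page": "0",                       # incremented after each batch
    }
    headers = {
        "Referer": "https://www.bitchute.com/search",
        "Origin": "https://www.bitchute.com/search",
    }
    # the JSON response carries "success", "count" and a "results" list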
def
append_details(self, video, detail):
    def append_details(self, video, detail):
        """
        Append extra metadata to video data

        Fetches the BitChute video detail page to scrape extra data for the given video.

        :param dict video:  Video details as scraped so far
        :param str detail:  Detail level. If 'comments', also scrape video comments.

        :return tuple:  Updated video data, and a list of scraped comments
        """
        comments = []

        video = {
            **video,
            "likes": "",
            "dislikes": "",
            "channel_subscribers": "",
            "comments": "",
            "hashtags": "",
            "parent_id": "",
            "video_url": ""
        }

        try:
            # to get more details per video, we need to request the actual video detail page
            # start a new session, to not interfere with the CSRF token from the search session
            video_session = requests.Session()
            video_page = video_session.get(video["url"])

            if "<h1 class=\"page-title\">Video Restricted</h1>" in video_page.text or \
                    "<h1 class=\"page-title\">Video Blocked</h1>" in video_page.text or \
                    "<h1 class=\"page-title\">Channel Blocked</h1>" in video_page.text or \
                    "<h1 class=\"page-title\">Channel Restricted</h1>" in video_page.text:
                if "This video is unavailable as the contents have been deemed potentially illegal" in video_page.text:
                    video["category"] = "moderated-illegal"
                    return (video, [])

                elif "Viewing of this video is restricted, as it has been marked as Not Safe For Life" in video_page.text:
                    video["category"] = "moderated-nsfl"
                    return (video, [])

                elif "Incitement to Hatred" in video_page.text:
                    video["category"] = "moderated-incitement-hatred"
                    return (video, [])

                elif "Threats or Incitement to Violence" in video_page.text:
                    video["category"] = "moderated-incitement-violence"
                    return (video, [])

                elif "Platform Misuse" in video_page.text:
                    video["category"] = "moderated-misuse"
                    return (video, [])

                elif "Terrorism & Violent Extremism" in video_page.text:
                    video["category"] = "moderated-terrorism-extremism"
                    return (video, [])

                elif "Copyright</h4>" in video_page.text:
                    video["category"] = "moderated-copyright"
                    return (video, [])

                elif "Contains Holocaust Denial" in video_page.text:
                    video["category"] = "moderated-holocaust-denial"
                    return (video, [])

                elif "Harmful Activities" in video_page.text:
                    video["category"] = "moderated-harmful-activity"
                    return (video, [])

                else:
                    video["category"] = "moderated-other"
                    self.log.warning("Unknown moderation reason for BitChute video %s" % video["id"])
                    return (video, [])

            elif "<iframe class=\"rumble\"" in video_page.text:
                # some videos are actually embeds from Rumble; these are iframes, so at the
                # moment we cannot simply extract their info from the page, and we skip them.
                # In the future we could add an extra request to get the relevant info, but so
                # far the only examples seen are actually 'video not found' pages
                video = {
                    **video,
                    "category": "error-embed-from-rumble"
                }
                return (video, [])

            elif video_page.status_code != 200:
                video = {
                    **video,
                    "category": "error-%i" % video_page.status_code
                }
                return (video, [])

            soup = BeautifulSoup(video_page.text, 'html.parser')
            video_csrftoken = soup.findAll("input", {"name": "csrfmiddlewaretoken"})[0].get("value")

            video["video_url"] = soup.select_one("video#player source").get("src")
            video["thumbnail_image"] = soup.select_one("video#player").get("poster")
            video["subject"] = soup.select_one("h1#video-title").text
            video["author"] = soup.select_one("div.channel-banner p.name a").text
            video["author_id"] = soup.select_one("div.channel-banner p.name a").get("href").split("/")[2]
            video["body"] = soup.select_one("div#video-description").encode_contents().decode("utf-8").strip()

            # we need *two more requests* to get the comment count and like/dislike counts;
            # this seems to be because BitChute uses a third-party comment widget
            video_session.headers = {'Referer': video["url"], 'Origin': video["url"]}
            counts = self.request_from_bitchute(video_session, "POST",
                                                "https://www.bitchute.com/video/%s/counts/" % video["id"],
                                                data={"csrfmiddlewaretoken": video_csrftoken})

            if detail == "comments":
                # if comments are also to be scraped, this is another request to make, which
                # returns a convenient JSON response with all the comments on the video.
                # we need yet another token for this, which we can extract from a bit of
                # inline javascript on the page
                comment_script = None
                for line in video_page.text.split("\n"):
                    if "initComments(" in line:
                        comment_script = line.split("initComments(")[1]
                        break

                if not comment_script:
                    # no script to extract comments from, cannot load
                    comment_count = -1
                else:
                    # make the request
                    comment_count = 0
                    url = comment_script.split("'")[1]
                    comment_csrf = comment_script.split("'")[3]
                    comments_data = self.request_from_bitchute(video_session, "POST", url + "/api/get_comments/",
                                                               data={"cf_auth": comment_csrf, "commentCount": 0})

                    for comment in comments_data:
                        comment_count += 1

                        if comment.get("profile_picture_url", None):
                            thumbnail_image = url + comment.get("profile_picture_url")
                        else:
                            thumbnail_image = ""

                        comments.append({
                            "query_type": video["query_type"],
                            "query": video["query"],
                            "id": comment["id"],
                            "thread_id": video["id"],
                            "subject": "",
                            "body": comment["content"],
                            "author": comment["fullname"],
                            "author_id": comment["creator"],
                            "timestamp": int(dateparser.parse(comment["created"]).timestamp()),
                            "url": "",
                            "views": "",
                            "length": "",
                            "hashtags": "",
                            "thumbnail_image": thumbnail_image,
                            "likes": comment["upvote_count"],
                            "category": "comment",
                            "dislikes": "",
                            "channel_subscribers": "",
                            "comments": "",
                            "parent_id": comment["parent"] if "parent" in comment else video["id"],
                        })

            else:
                # if we don't need the full comments, we still need another request to get
                # the *amount* of comments
                comment_count = self.request_from_bitchute(video_session, "POST",
                                                           "https://commentfreely.bitchute.com/api/get_comment_count/",
                                                           data={"csrfmiddlewaretoken": video_csrftoken,
                                                                 "cf_thread": "bc_" + video["id"]})["commentCount"]

        except RuntimeError as e:
            # we wrap this in one big try-catch because doing it for each request separately
            # is tedious; request_from_bitchute raises RuntimeError once it is out of retries
            self.dataset.update_status("Error while interacting with BitChute (%s) - try again later." % e,
                                       is_final=True)
            return (None, None)

        # again, no structured info available for the publication date, but at least we can
        # extract the exact day it was uploaded
        try:
            published = dateparser.parse(
                soup.find(class_="video-publish-date").text.split("published at")[1].strip()[:-1])
        except (AttributeError, IndexError):
            # publication date not on page?
            published = None

        # merge data; guard against pages without a category link
        category = re.findall(r'<td><a href="/category/([^/]+)/"', video_page.text)
        video = {
            **video,
            "category": category[0] if category else "",
            "likes": counts["like_count"],
            "dislikes": counts["dislike_count"],
            "channel_subscribers": counts["subscriber_count"],
            "comments": comment_count,
            "parent_id": "",
            "hashtags": ",".join([tag.text for tag in soup.select("#video-hashtags li a")]),
            "views": counts["view_count"]
        }

        if published:
            video["timestamp"] = int(published.timestamp())

        # may need to be increased? BitChute doesn't seem particularly strict
        time.sleep(0.25)
        return (video, comments)
Append extra metadata to video data
Fetches the BitChute video detail page to scrape extra data for the given video.
Parameters
- dict video: Video details as scraped so far
- str detail: Detail level. If 'comments', also scrape video comments.
Returns
Tuple: the updated video data, and a list of scraped comments
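To illustrate the calling convention, a hypothetical usage sketch follows. The video stub mirrors the dummy dictionary built in get_videos_id; `searcher` stands in for an initialized SearchBitChute worker (which 4CAT normally constructs itself) and the video ID is made up:

# hypothetical sketch: `searcher` is an initialized SearchBitChute worker and
# "exampleid" a made-up video ID; 4CAT normally drives this via get_items()
video_stub = {
    "query_type": "video id",
    "query": "exampleid",
    "id": "exampleid",
    "thread_id": "exampleid",
    "subject": "", "body": "", "author": "", "author_id": "",
    "timestamp": None,
    "url": "https://www.bitchute.com/video/exampleid/",
    "views": None, "length": None, "thumbnail_image": None,
}

video, comments = searcher.append_details(video_stub, detail="comments")
if video is None:
    print("unrecoverable error while scraping details")  # method returned (None, None)
else:
    print(video["category"], "with", len(comments), "comments")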
def request_from_bitchute(self, session, method, url, headers=None, data=None):
    def request_from_bitchute(self, session, method, url, headers=None, data=None):
        """
        Request something via the BitChute API (or non-API)

        To avoid having to write the same error-checking everywhere, this takes
        care of retrying on failure, et cetera.

        :param session:  Requests session
        :param str method:  GET or POST
        :param str url:  URL to fetch
        :param dict headers:  Headers to pass with the request
        :param dict data:  Data/params to send with the request

        :return:  Parsed JSON response
        """
        retries = 0
        response = None
        while retries < 3:
            try:
                if method.lower() == "post":
                    request = session.post(url, headers=headers, data=data)
                elif method.lower() == "get":
                    request = session.get(url, headers=headers, params=data)
                else:
                    raise NotImplementedError()

                if request.status_code >= 300:
                    raise ValueError("Response %i from BitChute for URL %s, need to retry" % (request.status_code, url))

                response = request.json()
                return response

            except json.JSONDecodeError as e:
                # this clause must come before the generic one below, because
                # json.JSONDecodeError is a subclass of ValueError and would
                # otherwise never be reached
                self.log.warning("Error decoding JSON: %s\n\n%s" % (e, request.text))
                retries += 1
                time.sleep(retries * 2)

            except (ConnectionResetError, requests.RequestException, ValueError):
                retries += 1
                time.sleep(retries * 2)

        if not response:
            self.log.warning("Failed BitChute request to %s %i times, aborting" % (url, retries))
            raise RuntimeError()

        return response
Request something via the BitChute API (or non-API)
To avoid having to write the same error-checking everywhere, this takes care of retrying on failure, et cetera
Parameters
- session: Requests session
- str method: GET or POST
- str url: URL to fetch
- dict headers: Headers to pass with the request
- dict data: Data/params to send with the request
Returns
Parsed JSON response
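The retry logic documented here is a generic pattern rather than anything BitChute-specific. For reference, a self-contained sketch of the same idea (bounded retries with linear backoff), written independently of this class; the function name and defaults are illustrative, not part of 4CAT:

import json
import time

import requests


def fetch_json_with_retries(session, url, method="get", data=None, headers=None, max_retries=3):
    """Sketch of the retry pattern used by request_from_bitchute."""
    for attempt in range(1, max_retries + 1):
        try:
            if method.lower() == "post":
                response = session.post(url, headers=headers, data=data)
            else:
                response = session.get(url, headers=headers, params=data)

            # treat redirects and error statuses as retryable failures, like the method above
            if response.status_code >= 300:
                raise ValueError("Response %i from %s" % (response.status_code, url))

            return response.json()
        except (json.JSONDecodeError, ConnectionResetError, requests.RequestException, ValueError):
            # back off a little longer after each failed attempt
            time.sleep(attempt * 2)

    raise RuntimeError("Failed request to %s after %i attempts" % (url, max_retries))

One could call this as, say, fetch_json_with_retries(requests.Session(), "https://www.bitchute.com/video/exampleid/counts/", method="post") with a made-up video ID; the linear backoff (2s, 4s, 6s) matches the sleep schedule in the method above.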
def validate_query(query, request, user):
    def validate_query(query, request, user):
        """
        Validate BitChute query input

        :param dict query:  Query parameters, from client-side.
        :param request:  Flask request
        :param User user:  User object of user who has submitted the query
        :return dict:  Safe query parameters
        """
        # no query 4 u
        if not query.get("query", "").strip():
            raise QueryParametersException("You must provide a search query.")

        # reformat queries to be a comma-separated list; individual items are
        # stripped of wrapping whitespace later, in get_items
        items = query.get("query").replace("\n", ",")
        if len(items.split(",")) > 15 and query.get("search_type") != "url":
            raise QueryParametersException("You cannot query more than 15 items at a time.")

        # simple!
        return {
            "items": query.get("max_posts"),
            "query": items,
            "scope": query.get("search_scope"),
            "item_type": query.get("search_type")
        }
Validate BitChute query input
Parameters
- dict query: Query parameters, from client-side.
- request: Flask request
- User user: User object of user who has submitted the query
Returns
Safe query parameters
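For illustration, a hedged example of what this validation accepts and returns, assuming validate_query is exposed as a static method (its self-less signature suggests so) and using a made-up client-side query dict; the request and user arguments are unused by the validation logic itself, so None stands in here:

from common.lib.exceptions import QueryParametersException

raw_query = {
    "query": "cats\ndogs",        # newlines are folded into commas
    "max_posts": 10,
    "search_scope": "basic",
    "search_type": "search",
}

try:
    safe = SearchBitChute.validate_query(raw_query, request=None, user=None)
    # safe == {"items": 10, "query": "cats,dogs", "scope": "basic", "item_type": "search"}
except QueryParametersException as error:
    print("Invalid query:", error)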
Inherited Members
- backend.lib.worker.BasicWorker
- BasicWorker
- INTERRUPT_NONE
- INTERRUPT_RETRY
- INTERRUPT_CANCEL
- queue
- log
- manager
- interrupted
- modules
- init_time
- name
- run
- clean_up
- request_interrupt
- is_4cat_class
- backend.lib.search.Search
- prefix
- return_cols
- import_error_count
- import_warning_count
- process
- search
- import_from_file
- items_to_csv
- items_to_ndjson
- items_to_archive
- backend.lib.processor.BasicProcessor
- db
- job
- dataset
- owner
- source_dataset
- source_file
- config
- is_running_in_preset
- filepath
- work
- after_process
- remove_files
- abort
- add_field_to_parent
- iterate_archive_contents
- unpack_archive_contents
- extract_archived_file_by_name
- write_csv_items_and_finish
- write_archive_and_finish
- create_standalone
- map_item_method_available
- get_mapped_item
- is_filter
- get_options
- get_status
- is_top_dataset
- is_from_collector
- get_extension
- is_rankable
- exclude_followup_processors
- is_4cat_processor