datasources.ninegag.search_9gag
Import scraped 9gag data
It's prohibitively difficult to scrape data from 9gag within 4CAT itself due to its aggressive rate limiting. Instead, import data collected elsewhere.
"""
Import scraped 9gag data

It's prohibitively difficult to scrape data from 9gag within 4CAT itself due
to its aggressive rate limiting. Instead, import data collected elsewhere.
"""
from datetime import datetime

from backend.lib.search import Search
from common.lib.item_mapping import MappedItem


class SearchNineGag(Search):
    """
    Import scraped 9gag data
    """
    type = "ninegag-search"  # job ID
    category = "Search"  # category
    title = "Import scraped 9gag data"  # title displayed in UI
    description = "Import 9gag data collected with an external tool such as Zeeschuimer."  # description displayed in UI
    extension = "ndjson"  # extension of result file, used internally and in UI
    is_from_zeeschuimer = True

    # not available as a processor for existing datasets
    accepts = [None]
    references = [
        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)"
    ]

    def get_items(self, query):
        """
        Run custom search

        Not available for 9gag
        """
        raise NotImplementedError("9gag datasets can only be created by importing data from elsewhere")

    @staticmethod
    def map_item(post):
        post_timestamp = datetime.fromtimestamp(post["creationTs"])

        image = sorted([v for v in post["images"].values() if "hasAudio" not in v],
                       key=lambda image: image["width"] * image["height"], reverse=True)[0]
        video = sorted([v for v in post["images"].values() if "hasAudio" in v],
                       key=lambda image: image["width"] * image["height"], reverse=True)

        video_url = ""
        if video:
            # annoyingly, not all formats are always available
            video = video[0]
            if "av1Url" in video:
                video_url = video["av1Url"]
            elif "h265Url" in video:
                video_url = video["h265Url"]
            elif "vp9Url" in video:
                video_url = video["vp9Url"]
            elif "vp8Url" in video:
                video_url = video["vp8Url"]

        if not post["creator"]:
            # anonymous posts exist
            # they display as from the user '9GAGGER' on the website
            post["creator"] = {
                "username": "9GAGGER",
                "fullName": "",
                "emojiStatus": "",
                "isVerifiedAccount": ""
            }

        return MappedItem({
            "id": post["id"],
            "url": post["url"],
            "subject": post["title"],
            "body": post["description"],
            "timestamp": post_timestamp.strftime("%Y-%m-%d %H:%M:%S"),
            "author": post["creator"]["username"],
            "author_name": post["creator"]["fullName"],
            "author_status": post["creator"]["emojiStatus"],
            "author_verified": "yes" if post["creator"]["isVerifiedAccount"] else "no",
            "type": post["type"],
            "image_url": image["url"],
            "video_url": video_url,
            "is_nsfw": "no" if post["nsfw"] == 0 else "yes",
            "is_promoted": "no" if post["promoted"] == 0 else "yes",
            "is_vote_masked": "no" if post["isVoteMasked"] == 0 else "yes",
            "is_anonymous": "no" if not post["isAnonymous"] else "yes",
            "source_domain": post["sourceDomain"],
            "source_url": post["sourceUrl"],
            "upvotes": post["upVoteCount"],
            "downvotes": post["downVoteCount"],
            "score": post["upVoteCount"] - post["downVoteCount"],
            "comments": post["commentsCount"],
            "tags": ",".join([tag["key"] for tag in post["tags"]]),
            "tags_annotated": ",".join(post["annotationTags"]),
            "unix_timestamp": int(post_timestamp.timestamp()),
        })
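The datasource itself never talks to 9gag: a dataset is an ndjson file of raw post objects captured in the browser with Zeeschuimer, and `map_item` flattens each object into the tabular fields 4CAT exports to CSV. The sketch below shows that flattening outside of a running 4CAT backend; it assumes a 4CAT checkout on the Python path, a hypothetical capture file named `9gag_posts.ndjson` with one raw post object per line, and that `MappedItem.get_item_data()` returns the plain dictionary (as in current 4CAT versions).

```python
# Minimal sketch: flatten a Zeeschuimer 9gag capture to CSV without running 4CAT itself.
import csv
import json

from datasources.ninegag.search_9gag import SearchNineGag

with open("9gag_posts.ndjson") as infile, open("9gag_posts.csv", "w", newline="") as outfile:
    writer = None
    for line in infile:
        post = json.loads(line)  # one raw 9gag post object per line (assumed capture format)
        item = SearchNineGag.map_item(post).get_item_data()  # MappedItem -> plain dict (assumed API)
        if writer is None:
            writer = csv.DictWriter(outfile, fieldnames=list(item.keys()))
            writer.writeheader()
        writer.writerow(item)
```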
class SearchNineGag(Search):
Import scraped 9gag data
references = [
    "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
    "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)"
]
def get_items(self, query):
Run custom search
Not available for 9gag
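Because live scraping is disabled, any programmatic attempt to run the search fails immediately; 9gag datasets exist only through file import. A small illustration, assuming the same 4CAT checkout as in the sketch above (the call goes through the class and only works because the method raises before touching `self` or the query):

```python
from datasources.ninegag.search_9gag import SearchNineGag

try:
    SearchNineGag.get_items(None, {"query": "memes"})  # hypothetical query; never evaluated
except NotImplementedError as reason:
    print(reason)  # 9gag datasets can only be created by importing data from elsewhere
```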
@staticmethod
def map_item(post):
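The `images` payload of a 9gag post mixes still and video renditions. As the source above shows, `map_item` treats renditions carrying a `hasAudio` flag as video, keeps the largest rendition of each kind by pixel count, and prefers AV1 over H.265 over VP9 over VP8 when several encodings are offered. A self-contained sketch of that selection, using a hypothetical payload (field values invented for illustration):

```python
# Hypothetical 9gag "images" payload; real captures carry more fields and renditions.
images = {
    "image700": {"url": "https://example.invalid/photo/abc_700b.jpg", "width": 700, "height": 875},
    "image460": {"url": "https://example.invalid/photo/abc_460s.jpg", "width": 460, "height": 575},
    "image460sv": {
        "hasAudio": 1, "width": 460, "height": 576,
        "h265Url": "https://example.invalid/photo/abc_460sv_h265.mp4",
        "vp9Url": "https://example.invalid/photo/abc_460sv_vp9.webm",
    },
}

# largest still image: renditions without a hasAudio flag, sorted by pixel count
stills = sorted([v for v in images.values() if "hasAudio" not in v],
                key=lambda rendition: rendition["width"] * rendition["height"], reverse=True)
image_url = stills[0]["url"]

# preferred video encoding: the first format in this order that the rendition offers
videos = [v for v in images.values() if "hasAudio" in v]
video_url = ""
if videos:
    video = max(videos, key=lambda rendition: rendition["width"] * rendition["height"])
    video_url = next((video[key] for key in ("av1Url", "h265Url", "vp9Url", "vp8Url") if key in video), "")

print(image_url)  # .../abc_700b.jpg
print(video_url)  # .../abc_460sv_h265.mp4 (no av1Url in this hypothetical payload)
```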
Inherited Members
- backend.lib.worker.BasicWorker
  - BasicWorker
  - INTERRUPT_NONE
  - INTERRUPT_RETRY
  - INTERRUPT_CANCEL
  - queue
  - log
  - manager
  - interrupted
  - modules
  - init_time
  - name
  - run
  - clean_up
  - request_interrupt
  - is_4cat_class
- backend.lib.search.Search
  - max_workers
  - prefix
  - return_cols
  - import_error_count
  - import_warning_count
  - process
  - search
  - import_from_file
  - items_to_csv
  - items_to_ndjson
  - items_to_archive
- backend.lib.processor.BasicProcessor
  - db
  - job
  - dataset
  - owner
  - source_dataset
  - source_file
  - config
  - is_running_in_preset
  - filepath
  - work
  - after_process
  - remove_files
  - abort
  - add_field_to_parent
  - iterate_archive_contents
  - unpack_archive_contents
  - extract_archived_file_by_name
  - write_csv_items_and_finish
  - write_archive_and_finish
  - create_standalone
  - map_item_method_available
  - get_mapped_item
  - is_filter
  - get_options
  - get_status
  - is_top_dataset
  - is_from_collector
  - get_extension
  - is_rankable
  - exclude_followup_processors
  - is_4cat_processor