datasources.ninegag.search_9gag
Import scraped 9gag data
It's prohibitively difficult to scrape data from 9gag within 4CAT itself due to its aggressive rate limiting. Instead, import data collected elsewhere.
1""" 2Import scraped 9gag data 3 4It's prohibitively difficult to scrape data from 9gag within 4CAT itself due 5to its aggressive rate limiting. Instead, import data collected elsewhere. 6""" 7from datetime import datetime 8 9from backend.lib.search import Search 10from common.lib.item_mapping import MappedItem 11from common.lib.helpers import normalize_url_encoding 12 13 14class SearchNineGag(Search): 15 """ 16 Import scraped 9gag data 17 """ 18 type = "ninegag-search" # job ID 19 category = "Search" # category 20 title = "Import scraped 9gag data" # title displayed in UI 21 description = "Import 9gag data collected with an external tool such as Zeeschuimer." # description displayed in UI 22 extension = "ndjson" # extension of result file, used internally and in UI 23 is_from_zeeschuimer = True 24 25 # not available as a processor for existing datasets 26 accepts = [None] 27 references = [ 28 "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)", 29 "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)" 30 ] 31 32 def get_items(self, query): 33 """ 34 Run custom search 35 36 Not available for 9gag 37 """ 38 raise NotImplementedError("9gag datasets can only be created by importing data from elsewhere") 39 40 @staticmethod 41 def map_item(post): 42 post_timestamp = datetime.fromtimestamp(post["creationTs"]) 43 44 image = sorted([v for v in post["images"].values() if "hasAudio" not in v], key=lambda image: image["width"] * image["height"], reverse=True)[0] 45 video = sorted([v for v in post["images"].values() if "hasAudio" in v], key=lambda image: image["width"] * image["height"], reverse=True) 46 47 video_url = "" 48 if video: 49 # annoyingly, not all formats are always available 50 video = video[0] 51 if "av1Url" in video: 52 video_url = video["av1Url"] 53 elif "h265Url" in video: 54 video_url = video["h265Url"] 55 elif "vp9Url" in video: 56 video_url = video["vp9Url"] 57 elif 
"vp8Url" in video: 58 video_url = video["vp8Url"] 59 60 if not post["creator"]: 61 # anonymous posts exist 62 # they display as from the user '9GAGGER' on the website 63 post["creator"] = { 64 "username": "9GAGGER", 65 "fullName": "", 66 "emojiStatus": "", 67 "isVerifiedAccount": "" 68 } 69 70 return MappedItem({ 71 "collected_from_url": normalize_url_encoding(post.get("__import_meta", {}).get("source_platform_url", "")), # Zeeschuimer metadata 72 "id": post["id"], 73 "url": post["url"], 74 "subject": post["title"], 75 "body": post["description"], 76 "timestamp": post_timestamp.strftime("%Y-%m-%d %H:%M:%S"), 77 "author": post["creator"]["username"], 78 "author_name": post["creator"]["fullName"], 79 "author_status": post["creator"]["emojiStatus"], 80 "author_verified": "yes" if post["creator"]["isVerifiedAccount"] else "no", 81 "type": post["type"], 82 "image_url": image["url"], 83 "video_url": video_url, 84 "is_nsfw": "no" if post["nsfw"] == 0 else "yes", 85 "is_promoted": "no" if post["promoted"] == 0 else "yes", 86 "is_vote_masked": "no" if post["isVoteMasked"] == 0 else "yes", 87 "is_anonymous": "no" if not post["isAnonymous"] else "yes", 88 "source_domain": post["sourceDomain"], 89 "source_url": post["sourceUrl"], 90 "upvotes": post["upVoteCount"], 91 "downvotes": post["downVoteCount"], 92 "score": post["upVoteCount"] - post["downVoteCount"], 93 "comments": post["commentsCount"], 94 "tags": ",".join([tag["key"] for tag in post["tags"]]), 95 "tags_annotated": ",".join(post["annotationTags"]), 96 "unix_timestamp": int(post_timestamp.timestamp()), 97 })
class SearchNineGag(Search):
    """
    Import scraped 9gag data
    """
    type = "ninegag-search"  # job ID
    category = "Search"  # category
    title = "Import scraped 9gag data"  # shown in the UI
    description = "Import 9gag data collected with an external tool such as Zeeschuimer."  # shown in the UI
    extension = "ndjson"  # result file extension, used internally and in UI
    is_from_zeeschuimer = True

    # cannot run as a processor on existing datasets
    accepts = [None]
    references = [
        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)"
    ]

    def get_items(self, query):
        """
        Run custom search

        Not available for 9gag

        :raises NotImplementedError:  Always; data can only be imported
        """
        raise NotImplementedError("9gag datasets can only be created by importing data from elsewhere")

    @staticmethod
    def map_item(post):
        """
        Turn a raw 9gag post, as captured by Zeeschuimer, into a flat item

        :param dict post:  Raw 9gag post object
        :return MappedItem:  Flattened, display-ready item
        """
        posted_at = datetime.fromtimestamp(post["creationTs"])

        # video renditions are the "images" entries carrying a "hasAudio"
        # key; everything else is a still. Keep the largest of each.
        stills = [v for v in post["images"].values() if "hasAudio" not in v]
        clips = [v for v in post["images"].values() if "hasAudio" in v]
        best_image = max(stills, key=lambda v: v["width"] * v["height"])

        video_url = ""
        if clips:
            best_clip = max(clips, key=lambda v: v["width"] * v["height"])
            # not every codec exists on every post; prefer the more modern ones
            video_url = next((best_clip[codec] for codec in ("av1Url", "h265Url", "vp9Url", "vp8Url") if codec in best_clip), "")

        if not post["creator"]:
            # anonymous posts exist; the website shows them as posted by
            # the user '9GAGGER'
            post["creator"] = {
                "username": "9GAGGER",
                "fullName": "",
                "emojiStatus": "",
                "isVerifiedAccount": ""
            }

        creator = post["creator"]
        return MappedItem({
            "collected_from_url": normalize_url_encoding(post.get("__import_meta", {}).get("source_platform_url", "")),  # Zeeschuimer metadata
            "id": post["id"],
            "url": post["url"],
            "subject": post["title"],
            "body": post["description"],
            "timestamp": posted_at.strftime("%Y-%m-%d %H:%M:%S"),
            "author": creator["username"],
            "author_name": creator["fullName"],
            "author_status": creator["emojiStatus"],
            "author_verified": "yes" if creator["isVerifiedAccount"] else "no",
            "type": post["type"],
            "image_url": best_image["url"],
            "video_url": video_url,
            "is_nsfw": "yes" if post["nsfw"] != 0 else "no",
            "is_promoted": "yes" if post["promoted"] != 0 else "no",
            "is_vote_masked": "yes" if post["isVoteMasked"] != 0 else "no",
            "is_anonymous": "yes" if post["isAnonymous"] else "no",
            "source_domain": post["sourceDomain"],
            "source_url": post["sourceUrl"],
            "upvotes": post["upVoteCount"],
            "downvotes": post["downVoteCount"],
            "score": post["upVoteCount"] - post["downVoteCount"],
            "comments": post["commentsCount"],
            "tags": ",".join(tag["key"] for tag in post["tags"]),
            "tags_annotated": ",".join(post["annotationTags"]),
            "unix_timestamp": int(posted_at.timestamp()),
        })
Import scraped 9gag data
references =
['[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)', '[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)']
def
get_items(self, query):
def get_items(self, query):
    """
    Run custom search

    Not available for 9gag

    :param query:  Query parameters (unused)
    :raises NotImplementedError:  Always — datasets are import-only
    """
    # 9gag cannot be searched from within 4CAT; data arrives via Zeeschuimer
    raise NotImplementedError("9gag datasets can only be created by importing data from elsewhere")
Run custom search
Not available for 9gag
@staticmethod
def
map_item(post):
@staticmethod
def map_item(post):
    """
    Map a raw 9gag post (as captured by Zeeschuimer) to a flat 4CAT item

    :param dict post:  Raw 9gag post object
    :return MappedItem:  Flattened item with display-ready fields
    """
    # NOTE(review): naive local-time conversion; creationTs is presumably a
    # Unix timestamp — confirm intended timezone handling
    post_timestamp = datetime.fromtimestamp(post["creationTs"])

    # "images" mixes still images and video renditions; video variants carry
    # a "hasAudio" key. Pick the highest-resolution variant of each kind.
    stills = sorted([v for v in post["images"].values() if "hasAudio" not in v],
                    key=lambda variant: variant["width"] * variant["height"], reverse=True)
    videos = sorted([v for v in post["images"].values() if "hasAudio" in v],
                    key=lambda variant: variant["width"] * variant["height"], reverse=True)
    image = stills[0]

    video_url = ""
    if videos:
        # annoyingly, not all formats are always available; take the first
        # codec present, preferring the more modern ones
        video = videos[0]
        for format_key in ("av1Url", "h265Url", "vp9Url", "vp8Url"):
            if format_key in video:
                video_url = video[format_key]
                break

    # anonymous posts exist; they display as from the user '9GAGGER' on the
    # website. Use a local fallback instead of mutating the input dict.
    creator = post["creator"] or {
        "username": "9GAGGER",
        "fullName": "",
        "emojiStatus": "",
        "isVerifiedAccount": ""
    }

    return MappedItem({
        "collected_from_url": normalize_url_encoding(post.get("__import_meta", {}).get("source_platform_url", "")),  # Zeeschuimer metadata
        "id": post["id"],
        "url": post["url"],
        "subject": post["title"],
        "body": post["description"],
        "timestamp": post_timestamp.strftime("%Y-%m-%d %H:%M:%S"),
        "author": creator["username"],
        "author_name": creator["fullName"],
        "author_status": creator["emojiStatus"],
        "author_verified": "yes" if creator["isVerifiedAccount"] else "no",
        "type": post["type"],
        "image_url": image["url"],
        "video_url": video_url,
        "is_nsfw": "no" if post["nsfw"] == 0 else "yes",
        "is_promoted": "no" if post["promoted"] == 0 else "yes",
        "is_vote_masked": "no" if post["isVoteMasked"] == 0 else "yes",
        "is_anonymous": "no" if not post["isAnonymous"] else "yes",
        "source_domain": post["sourceDomain"],
        "source_url": post["sourceUrl"],
        "upvotes": post["upVoteCount"],
        "downvotes": post["downVoteCount"],
        "score": post["upVoteCount"] - post["downVoteCount"],
        "comments": post["commentsCount"],
        "tags": ",".join([tag["key"] for tag in post["tags"]]),
        "tags_annotated": ",".join(post["annotationTags"]),
        "unix_timestamp": int(post_timestamp.timestamp()),
    })
Inherited Members
- backend.lib.worker.BasicWorker
- BasicWorker
- INTERRUPT_NONE
- INTERRUPT_RETRY
- INTERRUPT_CANCEL
- queue
- log
- manager
- interrupted
- modules
- init_time
- name
- run
- clean_up
- request_interrupt
- run_interruptable_process
- get_queue_id
- is_4cat_class
- backend.lib.search.Search
- max_workers
- prefix
- return_cols
- import_error_count
- import_warning_count
- process
- search
- import_from_file
- items_to_csv
- items_to_ndjson
- items_to_archive
- backend.lib.processor.BasicProcessor
- db
- job
- dataset
- owner
- source_dataset
- source_file
- config
- is_running_in_preset
- filepath
- for_cleanup
- work
- after_process
- clean_up_on_error
- abort
- iterate_proxied_requests
- push_proxied_request
- flush_proxied_requests
- unpack_archive_contents
- extract_archived_file_by_name
- write_csv_items_and_finish
- write_archive_and_finish
- create_standalone
- save_annotations
- map_item_method_available
- get_mapped_item
- is_filter
- get_options
- get_status
- is_top_dataset
- is_from_collector
- get_extension
- is_rankable
- exclude_followup_processors
- is_4cat_processor