Edit on GitHub

datasources.imgur.search_imgur

Import scraped Imgur data

It's prohibitively difficult to scrape data from Imgur within 4CAT itself due to its aggressive rate limiting. Instead, import data collected elsewhere.

 1"""
 2Import scraped Imgur data
 3
 4It's prohibitively difficult to scrape data from Imgur within 4CAT itself due
 5to its aggressive rate limiting. Instead, import data collected elsewhere.
 6"""
 7from datetime import datetime
 8
 9from backend.lib.search import Search
10from common.lib.item_mapping import MappedItem
11
class SearchNineGag(Search):
    """
    Import scraped Imgur data

    Items cannot be collected by 4CAT itself (see `get_items`); datasets of
    this type are created from data captured elsewhere, and `map_item`
    flattens each captured post into a single-level dictionary.

    NOTE(review): the class name refers to 9gag although everything in this
    class concerns Imgur - presumably a copy-paste leftover. It is left
    unchanged here because other code may refer to the class by name.
    """
    type = "imgur-search"  # job ID
    category = "Search"  # category
    title = "Import scraped Imgur data"  # title displayed in UI
    description = "Import Imgur data collected with an external tool such as Zeeschuimer."  # description displayed in UI
    extension = "ndjson"  # extension of result file, used internally and in UI
    is_from_zeeschuimer = True

    # not available as a processor for existing datasets
    accepts = [None]
    references = [
        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)"
    ]

    def get_items(self, query):
        """
        Run custom search

        Not available for Imgur

        :param query:  Search parameters; ignored
        :raises NotImplementedError:  Always
        """
        raise NotImplementedError("Imgur datasets can only be created by importing data from elsewhere")

    @staticmethod
    def map_item(item):
        """
        Map a captured Imgur post to a flat item

        :param dict item:  Imgur post object; must contain every key read
        below, including a nested `cover` object with `type` and `url`
        :return MappedItem:  The flattened post
        :raises KeyError:  If an expected key is missing from `item`
        :raises ValueError:  If `created_at` is not formatted as
        `YYYY-MM-DDTHH:MM:SSZ`
        """
        # the trailing "Z" marks the value as UTC, but strptime() matches it
        # as a literal and returns a naive datetime
        post_timestamp = datetime.strptime(item["created_at"], "%Y-%m-%dT%H:%M:%SZ")

        # naive_datetime.timestamp() would interpret the value as *local*
        # time and shift the result by the host's UTC offset; subtracting the
        # epoch keeps the calculation in UTC regardless of host time zone
        unix_timestamp = int((post_timestamp - datetime(1970, 1, 1)).total_seconds())

        return MappedItem({
            "id": item["id"],
            "subject": item["title"],
            "body": item["description"],
            "timestamp": post_timestamp.strftime("%Y-%m-%d %H:%M:%S"),
            "author": item["account_id"],
            "type": item["cover"]["type"],
            "media_url": item["cover"]["url"],
            "post_url": item["url"],
            "album_media": item["image_count"],
            # boolean-like flags are exported as "yes"/"no" strings
            "is_ad": "no" if not item["is_ad"] else "yes",
            "is_album": "no" if not item["is_album"] else "yes",
            "is_mature": "no" if not item["is_mature"] else "yes",
            "is_viral": "no" if not item["in_most_viral"] else "yes",
            "views": item["view_count"],
            "upvotes": item["upvote_count"],
            "downvotes": item["downvote_count"],
            "score": item["point_count"],
            "comments": item["comment_count"],
            "favourites": item["favorite_count"],
            "virality_score": item["virality"],
            "unix_timestamp": unix_timestamp,
        })
class SearchNineGag(backend.lib.search.Search):
class SearchNineGag(Search):
    """
    Import scraped Imgur data

    Items cannot be collected by 4CAT itself (see `get_items`); datasets of
    this type are created from data captured elsewhere, and `map_item`
    flattens each captured post into a single-level dictionary.

    NOTE(review): the class name refers to 9gag although everything in this
    class concerns Imgur - presumably a copy-paste leftover. It is left
    unchanged here because other code may refer to the class by name.
    """
    type = "imgur-search"  # job ID
    category = "Search"  # category
    title = "Import scraped Imgur data"  # title displayed in UI
    description = "Import Imgur data collected with an external tool such as Zeeschuimer."  # description displayed in UI
    extension = "ndjson"  # extension of result file, used internally and in UI
    is_from_zeeschuimer = True

    # not available as a processor for existing datasets
    accepts = [None]
    references = [
        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)"
    ]

    def get_items(self, query):
        """
        Run custom search

        Not available for Imgur

        :param query:  Search parameters; ignored
        :raises NotImplementedError:  Always
        """
        raise NotImplementedError("Imgur datasets can only be created by importing data from elsewhere")

    @staticmethod
    def map_item(item):
        """
        Map a captured Imgur post to a flat item

        :param dict item:  Imgur post object; must contain every key read
        below, including a nested `cover` object with `type` and `url`
        :return MappedItem:  The flattened post
        :raises KeyError:  If an expected key is missing from `item`
        :raises ValueError:  If `created_at` is not formatted as
        `YYYY-MM-DDTHH:MM:SSZ`
        """
        # the trailing "Z" marks the value as UTC, but strptime() matches it
        # as a literal and returns a naive datetime
        post_timestamp = datetime.strptime(item["created_at"], "%Y-%m-%dT%H:%M:%SZ")

        # naive_datetime.timestamp() would interpret the value as *local*
        # time and shift the result by the host's UTC offset; subtracting the
        # epoch keeps the calculation in UTC regardless of host time zone
        unix_timestamp = int((post_timestamp - datetime(1970, 1, 1)).total_seconds())

        return MappedItem({
            "id": item["id"],
            "subject": item["title"],
            "body": item["description"],
            "timestamp": post_timestamp.strftime("%Y-%m-%d %H:%M:%S"),
            "author": item["account_id"],
            "type": item["cover"]["type"],
            "media_url": item["cover"]["url"],
            "post_url": item["url"],
            "album_media": item["image_count"],
            # boolean-like flags are exported as "yes"/"no" strings
            "is_ad": "no" if not item["is_ad"] else "yes",
            "is_album": "no" if not item["is_album"] else "yes",
            "is_mature": "no" if not item["is_mature"] else "yes",
            "is_viral": "no" if not item["in_most_viral"] else "yes",
            "views": item["view_count"],
            "upvotes": item["upvote_count"],
            "downvotes": item["downvote_count"],
            "score": item["point_count"],
            "comments": item["comment_count"],
            "favourites": item["favorite_count"],
            "virality_score": item["virality"],
            "unix_timestamp": unix_timestamp,
        })

Import scraped Imgur data

type = 'imgur-search'
category = 'Search'
title = 'Import scraped Imgur data'
description = 'Import Imgur data collected with an external tool such as Zeeschuimer.'
extension = 'ndjson'
is_from_zeeschuimer = True
accepts = [None]
references = ['[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)', '[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)']
def get_items(self, query):
31    def get_items(self, query):
32        """
33        Run custom search
34
35        Not available for Imgur
36        """
37        raise NotImplementedError("Imgur datasets can only be created by importing data from elsewhere")

Run custom search

Not available for Imgur

@staticmethod
def map_item(item):
39    @staticmethod
40    def map_item(item):
41        post_timestamp = datetime.strptime(item["created_at"], "%Y-%m-%dT%H:%M:%SZ")
42
43        return MappedItem({
44            "id": item["id"],
45            "subject": item["title"],
46            "body": item["description"],
47            "timestamp": post_timestamp.strftime("%Y-%m-%d %H:%M:%S"),
48            "author": item["account_id"],
49            "type": item["cover"]["type"],
50            "media_url": item["cover"]["url"],
51            "post_url": item["url"],
52            "album_media": item["image_count"],
53            "is_ad": "no" if not item["is_ad"] else "yes",
54            "is_album": "no" if not item["is_album"] else "yes",
55            "is_mature": "no" if not item["is_mature"] else "yes",
56            "is_viral": "no" if not item["in_most_viral"] else "yes",
57            "views": item["view_count"],
58            "upvotes": item["upvote_count"],
59            "downvotes": item["downvote_count"],
60            "score": item["point_count"],
61            "comments": item["comment_count"],
62            "favourites": item["favorite_count"],
63            "virality_score": item["virality"],
64            "unix_timestamp": int(post_timestamp.timestamp()),
65        })