datasources.imgur.search_imgur
Import scraped Imgur data
It's prohibitively difficult to scrape data from Imgur within 4CAT itself due to Imgur's aggressive rate limiting. Instead, import data collected elsewhere.
"""
Import scraped Imgur data

It's prohibitively difficult to scrape data from Imgur within 4CAT itself due
to its aggressive rate limiting. Instead, import data collected elsewhere.
"""
from datetime import datetime, timezone

from backend.lib.search import Search
from common.lib.item_mapping import MappedItem


# NOTE(review): the class is named SearchNineGag although this is the Imgur
# datasource — presumably a copy-paste from the 9GAG module. The name is kept
# as-is because renaming would change the module's public interface.
class SearchNineGag(Search):
    """
    Import scraped Imgur data
    """
    type = "imgur-search"  # job ID
    category = "Search"  # category
    title = "Import scraped Imgur data"  # title displayed in UI
    description = "Import Imgur data collected with an external tool such as Zeeschuimer."  # description displayed in UI
    extension = "ndjson"  # extension of result file, used internally and in UI
    is_from_zeeschuimer = True

    # not available as a processor for existing datasets
    accepts = [None]
    # NOTE(review): the second reference links the TikTok worksheet — likely a
    # copy-paste leftover; confirm whether an Imgur-specific worksheet exists.
    references = [
        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)"
    ]

    def get_items(self, query):
        """
        Run custom search

        Not available for Imgur

        :param query:  Query parameters (unused)
        :raises NotImplementedError:  Always; datasets are import-only
        """
        raise NotImplementedError("Imgur datasets can only be created by importing data from elsewhere")

    @staticmethod
    def map_item(item):
        """
        Map a raw Imgur post to a flat, standardised 4CAT item

        :param dict item:  A single raw post as captured by Zeeschuimer
        :return MappedItem:  The mapped item
        """
        # "created_at" carries a trailing "Z", i.e. it is a UTC timestamp;
        # strptime alone returns a naive datetime, which .timestamp() would
        # interpret in the server's local timezone — attach UTC explicitly so
        # unix_timestamp is correct everywhere
        post_timestamp = datetime.strptime(item["created_at"], "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)

        return MappedItem({
            "id": item["id"],
            "subject": item["title"],
            "body": item["description"],
            "timestamp": post_timestamp.strftime("%Y-%m-%d %H:%M:%S"),
            "author": item["account_id"],
            "type": item["cover"]["type"],
            "media_url": item["cover"]["url"],
            "post_url": item["url"],
            "album_media": item["image_count"],
            "is_ad": "yes" if item["is_ad"] else "no",
            "is_album": "yes" if item["is_album"] else "no",
            "is_mature": "yes" if item["is_mature"] else "no",
            "is_viral": "yes" if item["in_most_viral"] else "no",
            "views": item["view_count"],
            "upvotes": item["upvote_count"],
            "downvotes": item["downvote_count"],
            "score": item["point_count"],
            "comments": item["comment_count"],
            "favourites": item["favorite_count"],
            "virality_score": item["virality"],
            "unix_timestamp": int(post_timestamp.timestamp()),
        })
class SearchNineGag(Search):
    """
    Import scraped Imgur data

    NOTE(review): despite the name (apparently copy-pasted from the 9GAG
    datasource) this class handles Imgur data; the name is kept to avoid an
    interface change.
    """
    type = "imgur-search"  # job ID
    category = "Search"  # category
    title = "Import scraped Imgur data"  # title displayed in UI
    description = "Import Imgur data collected with an external tool such as Zeeschuimer."  # description displayed in UI
    extension = "ndjson"  # extension of result file, used internally and in UI
    is_from_zeeschuimer = True

    # not available as a processor for existing datasets
    accepts = [None]
    # NOTE(review): TikTok worksheet link looks like a copy-paste leftover —
    # confirm whether an Imgur-specific worksheet exists.
    references = [
        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)"
    ]

    def get_items(self, query):
        """
        Run custom search

        Not available for Imgur

        :param query:  Query parameters (unused)
        :raises NotImplementedError:  Always; datasets are import-only
        """
        raise NotImplementedError("Imgur datasets can only be created by importing data from elsewhere")

    @staticmethod
    def map_item(item):
        """
        Map a raw Imgur post to a 4CAT item

        :param dict item:  Raw post as captured by Zeeschuimer
        :return MappedItem:  Mapped item
        """
        from datetime import timezone

        # Imgur's "created_at" ends in "Z" (UTC); attach the UTC timezone so
        # that .timestamp() below does not depend on the server's local
        # timezone (a naive datetime would be interpreted as local time)
        post_timestamp = datetime.strptime(item["created_at"], "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)

        return MappedItem({
            "id": item["id"],
            "subject": item["title"],
            "body": item["description"],
            "timestamp": post_timestamp.strftime("%Y-%m-%d %H:%M:%S"),
            "author": item["account_id"],
            "type": item["cover"]["type"],
            "media_url": item["cover"]["url"],
            "post_url": item["url"],
            "album_media": item["image_count"],
            "is_ad": "no" if not item["is_ad"] else "yes",
            "is_album": "no" if not item["is_album"] else "yes",
            "is_mature": "no" if not item["is_mature"] else "yes",
            "is_viral": "no" if not item["in_most_viral"] else "yes",
            "views": item["view_count"],
            "upvotes": item["upvote_count"],
            "downvotes": item["downvote_count"],
            "score": item["point_count"],
            "comments": item["comment_count"],
            "favourites": item["favorite_count"],
            "virality_score": item["virality"],
            "unix_timestamp": int(post_timestamp.timestamp()),
        })
Import scraped Imgur data
references =
['[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)', '[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)']
def
get_items(self, query):
def get_items(self, query):
    """
    Queries cannot be run for this datasource

    Imgur datasets are import-only (collected externally, e.g. with
    Zeeschuimer), so there is nothing to search for here.

    :param query:  Search parameters (ignored)
    :raises NotImplementedError:  Always
    """
    raise NotImplementedError("Imgur datasets can only be created by importing data from elsewhere")
Run custom search
Not available for Imgur
@staticmethod
def
map_item(item):
@staticmethod
def map_item(item):
    """
    Map a raw Imgur post (as captured by Zeeschuimer) to a 4CAT item

    :param dict item:  One raw post object from the NDJSON capture
    :return MappedItem:  Item with standardised 4CAT fields
    """
    from datetime import timezone

    # "created_at" has a trailing "Z", i.e. the timestamp is UTC; strptime
    # returns a naive datetime, which .timestamp() would interpret in the
    # server's local timezone — attach UTC explicitly so unix_timestamp is
    # correct regardless of where 4CAT runs
    post_timestamp = datetime.strptime(item["created_at"], "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)

    return MappedItem({
        "id": item["id"],
        "subject": item["title"],
        "body": item["description"],
        "timestamp": post_timestamp.strftime("%Y-%m-%d %H:%M:%S"),
        "author": item["account_id"],
        "type": item["cover"]["type"],
        "media_url": item["cover"]["url"],
        "post_url": item["url"],
        "album_media": item["image_count"],
        "is_ad": "yes" if item["is_ad"] else "no",
        "is_album": "yes" if item["is_album"] else "no",
        "is_mature": "yes" if item["is_mature"] else "no",
        "is_viral": "yes" if item["in_most_viral"] else "no",
        "views": item["view_count"],
        "upvotes": item["upvote_count"],
        "downvotes": item["downvote_count"],
        "score": item["point_count"],
        "comments": item["comment_count"],
        "favourites": item["favorite_count"],
        "virality_score": item["virality"],
        "unix_timestamp": int(post_timestamp.timestamp()),
    })
Inherited Members
- backend.lib.worker.BasicWorker
- BasicWorker
- INTERRUPT_NONE
- INTERRUPT_RETRY
- INTERRUPT_CANCEL
- queue
- log
- manager
- interrupted
- modules
- init_time
- name
- run
- clean_up
- request_interrupt
- is_4cat_class
- backend.lib.search.Search
- max_workers
- prefix
- return_cols
- import_error_count
- import_warning_count
- process
- search
- import_from_file
- items_to_csv
- items_to_ndjson
- items_to_archive
- backend.lib.processor.BasicProcessor
- db
- job
- dataset
- owner
- source_dataset
- source_file
- config
- is_running_in_preset
- filepath
- work
- after_process
- remove_files
- abort
- add_field_to_parent
- iterate_archive_contents
- unpack_archive_contents
- extract_archived_file_by_name
- write_csv_items_and_finish
- write_archive_and_finish
- create_standalone
- map_item_method_available
- get_mapped_item
- is_filter
- get_options
- get_status
- is_top_dataset
- is_from_collector
- get_extension
- is_rankable
- exclude_followup_processors
- is_4cat_processor