datasources.gab.search_gab
Import scraped Gab data
1""" 2Import scraped Gab data 3""" 4import datetime 5 6from backend.lib.search import Search 7from common.lib.item_mapping import MappedItem, MissingMappedField 8 9 10class SearchGab(Search): 11 """ 12 Import scraped gab data 13 """ 14 type = "gab-search" # job ID 15 category = "Search" # category 16 title = "Import scraped Gab data" # title displayed in UI 17 description = "Import Gab data collected with an external tool such as Zeeschuimer." # description displayed in UI 18 extension = "ndjson" # extension of result file, used internally and in UI 19 is_from_zeeschuimer = True 20 fake = "" 21 22 # not available as a processor for existing datasets 23 accepts = [None] 24 25 def get_items(self, query): 26 """ 27 Run custom search 28 29 Not available for Gab 30 """ 31 raise NotImplementedError("Gab datasets can only be created by importing data from elsewhere") 32 33 @staticmethod 34 def map_item(post): 35 """ 36 Parse Gab post 37 38 :param node: Data as received from Gab 39 :return dict: Mapped item 40 """ 41 unknown_data = [] 42 post_id = post.get("i", post["id"]) 43 metadata = post.get("__import_meta", {}) 44 timestamp_collected = datetime.datetime.fromtimestamp(metadata.get("timestamp_collected")/1000).strftime("%Y-%m-%d %H:%M:%S") if metadata.get("timestamp_collected") else MissingMappedField("Unknown") 45 # reaction_type seems to just be nummeric keys; unsure which reactions they map to 46 reactions = post.get("rc", post.get("reactions_counts")) 47 if type(reactions) is not int: 48 reaction_count = sum([reaction_value for reaction_type, reaction_value in post.get("rc", post.get("reactions_counts")).items()]) 49 else: 50 reaction_count = reactions 51 52 # Other dictionaries are nested in the post dictionary 53 group = post.get("g", post.get("group", {})) 54 author = post.get("author_info", post.get("account", {})) 55 mentions = post.get("m", post.get("mentions", [])) 56 tags = post.get("tg", post.get("tags", [])) 57 # card or link 58 card = post.get("card", 
post.get("link", {})) 59 # media or image_info 60 media_items = post.get("image_info", post.get("media_attachments", [])) 61 image_urls = [media.get("u", media.get("url")) for media in media_items if media.get("t", media.get("type")) == "image"] 62 video_urls = [media.get("smp4", media.get("source_mp4")) for media in media_items if media.get("t", media.get("type")) == "video"] 63 if any([media_type not in ["image", "video"] for media_type in [media.get("t", media.get("type")) for media in media_items]]): 64 unknown_data.extend([f"Unknown media type: {media}" for media in media_items if media.get('t', media.get('type')) not in ['image', 'video']]) 65 if any([True for vid in video_urls if vid is None]) or any([True for img in image_urls if img is None]): 66 unknown_data.extend([f"Media missing URL: {img}" for img in image_urls if img is None]) 67 unknown_data.extend([f"Media missing URL: {vid}" for vid in video_urls if vid is None]) 68 # remove None values from the lists 69 image_urls = [img for img in image_urls if img is not None] 70 video_urls = [vid for vid in video_urls if vid is not None] 71 72 post_time = datetime.datetime.strptime(post.get("ca", post.get("created_at")), "%Y-%m-%dT%H:%M:%S.%fZ") 73 mapped_item = { 74 "collected_at": timestamp_collected, 75 "source_url": metadata.get("source_platform_url", MissingMappedField("Unknown")), # URL from which post was collected 76 "id": post_id, 77 "created_at": post_time.strftime("%Y-%m-%d %H:%M:%S"), 78 "body": post.get("c") if "c" in post else post["content"], 79 "url": post.get("ul") if "ul" in post else post["url"], 80 "reaction_count": reaction_count, 81 "favourites_count": post.get("fbc", post.get("favourites_count")), 82 "replies_count": post.get("rc", post.get("replies_count")), 83 "reblogs_count": post.get("rbc", post.get("reblogs_count")), 84 "mentions": ",".join([mention["username"] for mention in mentions]), 85 "tags": ",".join([tag["name"] for tag in tags]), 86 87 "group_id": group["id"] if group else 
None, 88 "group_title": group["title"] if group else None, 89 "group_description": group["description"] if group else None, 90 "group_member_count": group["member_count"] if group else None, 91 "group_is_private": group["is_private"] if group else None, 92 "group_url": group["url"] if group else None, 93 "group_created_at": group.get("created_at") if group else None, 94 95 "account_id": author.get("i") if "i" in author else author["id"], 96 "account_username": author.get("un") if "un" in author else author["username"], 97 "account_account": author.get("ac") if "ac"in author else author["acct"], 98 "account_display_name": author.get("dn") if "dn" in author else author["display_name"], 99 "account_note": author.get("nt") if "nt" in author else author["note"], 100 101 "link_id": card["id"] if card else None, 102 "link_url": card["url"] if card else None, 103 "link_title": card["title"] if card else None, 104 "link_description": card["description"] if card else None, 105 "link_type": card["type"] if card else None, 106 "link_image": card["image"] if card else None, 107 108 "image_urls": ",".join(image_urls), 109 "video_urls": ",".join(video_urls), 110 111 "thread_id": post.get("i") if "i" in post else post["conversation_id"], 112 "timestamp": post_time.strftime("%Y-%m-%d %H:%M:%S") 113 } 114 115 return MappedItem(mapped_item, message="".join(unknown_data)) if unknown_data else MappedItem(mapped_item)
class SearchGab(Search):
    """
    Import scraped gab data
    """
    type = "gab-search"  # job ID
    category = "Search"  # category
    title = "Import scraped Gab data"  # title displayed in UI
    description = "Import Gab data collected with an external tool such as Zeeschuimer."  # description displayed in UI
    extension = "ndjson"  # extension of result file, used internally and in UI
    is_from_zeeschuimer = True
    fake = ""

    # not available as a processor for existing datasets
    accepts = [None]

    def get_items(self, query):
        """
        Run custom search

        Not available for Gab
        """
        raise NotImplementedError("Gab datasets can only be created by importing data from elsewhere")

    @staticmethod
    def map_item(post):
        """
        Map a raw Gab post to a flat 4CAT item.

        Field lookups try the abbreviated key first ("i", "c", "ca", ...) and
        fall back to the long Mastodon-style key ("id", "content",
        "created_at", ...).

        :param post:  Data as received from Gab
        :return dict:  Mapped item
        """
        warnings = []
        metadata = post.get("__import_meta", {})
        item_id = post.get("i", post["id"])

        # collection timestamp is in milliseconds, when present at all
        collected = metadata.get("timestamp_collected")
        if collected:
            collected_at = datetime.datetime.fromtimestamp(collected / 1000).strftime("%Y-%m-%d %H:%M:%S")
        else:
            collected_at = MissingMappedField("Unknown")

        # reaction_type seems to just be numeric keys; unsure which reactions
        # they map to
        reactions = post.get("rc", post.get("reactions_counts"))
        if type(reactions) is int:
            total_reactions = reactions
        else:
            total_reactions = sum(reactions.values())

        # nested sub-objects of the post
        group = post.get("g", post.get("group", {}))
        account = post.get("author_info", post.get("account", {}))
        mentioned = post.get("m", post.get("mentions", []))
        hashtags = post.get("tg", post.get("tags", []))
        link_card = post.get("card", post.get("link", {}))

        # media or image_info; keep unknown types and missing URLs as warnings
        attachments = post.get("image_info", post.get("media_attachments", []))
        typed = [(entry, entry.get("t", entry.get("type"))) for entry in attachments]
        raw_images = [entry.get("u", entry.get("url")) for entry, kind in typed if kind == "image"]
        raw_videos = [entry.get("smp4", entry.get("source_mp4")) for entry, kind in typed if kind == "video"]
        warnings.extend(f"Unknown media type: {entry}" for entry, kind in typed if kind not in ["image", "video"])
        warnings.extend(f"Media missing URL: {url}" for url in raw_images if url is None)
        warnings.extend(f"Media missing URL: {url}" for url in raw_videos if url is None)
        images = [url for url in raw_images if url is not None]
        videos = [url for url in raw_videos if url is not None]

        posted_at = datetime.datetime.strptime(post.get("ca", post.get("created_at")), "%Y-%m-%dT%H:%M:%S.%fZ")
        posted_at_str = posted_at.strftime("%Y-%m-%d %H:%M:%S")

        item = {
            "collected_at": collected_at,
            "source_url": metadata.get("source_platform_url", MissingMappedField("Unknown")),  # URL from which post was collected
            "id": item_id,
            "created_at": posted_at_str,
            "body": post["c"] if "c" in post else post["content"],
            "url": post["ul"] if "ul" in post else post["url"],
            "reaction_count": total_reactions,
            "favourites_count": post.get("fbc", post.get("favourites_count")),
            "replies_count": post.get("rc", post.get("replies_count")),
            "reblogs_count": post.get("rbc", post.get("reblogs_count")),
            "mentions": ",".join(entry["username"] for entry in mentioned),
            "tags": ",".join(entry["name"] for entry in hashtags),
        }

        # group metadata, if the post was made in a group
        for field in ("id", "title", "description", "member_count", "is_private", "url"):
            item[f"group_{field}"] = group[field] if group else None
        item["group_created_at"] = group.get("created_at") if group else None

        # author metadata
        item["account_id"] = account["i"] if "i" in account else account["id"]
        item["account_username"] = account["un"] if "un" in account else account["username"]
        item["account_account"] = account["ac"] if "ac" in account else account["acct"]
        item["account_display_name"] = account["dn"] if "dn" in account else account["display_name"]
        item["account_note"] = account["nt"] if "nt" in account else account["note"]

        # link preview card, if any
        for field in ("id", "url", "title", "description", "type", "image"):
            item[f"link_{field}"] = link_card[field] if link_card else None

        item["image_urls"] = ",".join(images)
        item["video_urls"] = ",".join(videos)
        item["thread_id"] = post["i"] if "i" in post else post["conversation_id"]
        item["timestamp"] = posted_at_str

        if warnings:
            return MappedItem(item, message="".join(warnings))
        return MappedItem(item)
Import scraped gab data
def
get_items(self, query):
26 def get_items(self, query): 27 """ 28 Run custom search 29 30 Not available for Gab 31 """ 32 raise NotImplementedError("Gab datasets can only be created by importing data from elsewhere")
Run custom search
Not available for Gab
@staticmethod
def
map_item(post):
@staticmethod
def map_item(post):
    """
    Parse Gab post

    Posts arrive either with abbreviated keys ("i", "c", "ca", ...) or with
    full Mastodon-style keys ("id", "content", "created_at", ...); both
    shapes are handled by trying the short key first.

    :param post:  Data as received from Gab
    :return dict:  Mapped item
    """

    def pick(container, short, long):
        # prefer the abbreviated key when present, else use the long one
        return container[short] if short in container else container[long]

    problems = []
    meta = post.get("__import_meta", {})
    collected_ts = meta.get("timestamp_collected")
    # timestamp_collected is in milliseconds
    collected_at = (
        datetime.datetime.fromtimestamp(collected_ts / 1000).strftime("%Y-%m-%d %H:%M:%S")
        if collected_ts else MissingMappedField("Unknown")
    )

    # reaction_type seems to just be numeric keys; unsure which reactions
    # they map to
    reactions = post.get("rc", post.get("reactions_counts"))
    reaction_total = reactions if type(reactions) is int else sum(reactions.values())

    # nested sub-objects
    group = post.get("g", post.get("group", {}))
    account = post.get("author_info", post.get("account", {}))
    mentions = post.get("m", post.get("mentions", []))
    tags = post.get("tg", post.get("tags", []))
    card = post.get("card", post.get("link", {}))

    # media or image_info; anything unrecognised becomes an import warning
    media = post.get("image_info", post.get("media_attachments", []))
    kinds = [m.get("t", m.get("type")) for m in media]
    images = [m.get("u", m.get("url")) for m, k in zip(media, kinds) if k == "image"]
    videos = [m.get("smp4", m.get("source_mp4")) for m, k in zip(media, kinds) if k == "video"]
    problems += [f"Unknown media type: {m}" for m, k in zip(media, kinds) if k not in ["image", "video"]]
    problems += [f"Media missing URL: {u}" for u in images if u is None]
    problems += [f"Media missing URL: {u}" for u in videos if u is None]
    images = [u for u in images if u is not None]
    videos = [u for u in videos if u is not None]

    created = datetime.datetime.strptime(post.get("ca", post.get("created_at")), "%Y-%m-%dT%H:%M:%S.%fZ")
    created_str = created.strftime("%Y-%m-%d %H:%M:%S")

    item = {
        "collected_at": collected_at,
        "source_url": meta.get("source_platform_url", MissingMappedField("Unknown")),  # URL from which post was collected
        "id": post.get("i", post["id"]),
        "created_at": created_str,
        "body": pick(post, "c", "content"),
        "url": pick(post, "ul", "url"),
        "reaction_count": reaction_total,
        "favourites_count": post.get("fbc", post.get("favourites_count")),
        "replies_count": post.get("rc", post.get("replies_count")),
        "reblogs_count": post.get("rbc", post.get("reblogs_count")),
        "mentions": ",".join(m["username"] for m in mentions),
        "tags": ",".join(t["name"] for t in tags),

        "group_id": group["id"] if group else None,
        "group_title": group["title"] if group else None,
        "group_description": group["description"] if group else None,
        "group_member_count": group["member_count"] if group else None,
        "group_is_private": group["is_private"] if group else None,
        "group_url": group["url"] if group else None,
        "group_created_at": group.get("created_at") if group else None,

        "account_id": pick(account, "i", "id"),
        "account_username": pick(account, "un", "username"),
        "account_account": pick(account, "ac", "acct"),
        "account_display_name": pick(account, "dn", "display_name"),
        "account_note": pick(account, "nt", "note"),

        "link_id": card["id"] if card else None,
        "link_url": card["url"] if card else None,
        "link_title": card["title"] if card else None,
        "link_description": card["description"] if card else None,
        "link_type": card["type"] if card else None,
        "link_image": card["image"] if card else None,

        "image_urls": ",".join(images),
        "video_urls": ",".join(videos),

        "thread_id": pick(post, "i", "conversation_id"),
        "timestamp": created_str,
    }

    return MappedItem(item, message="".join(problems)) if problems else MappedItem(item)
Parse Gab post
Parameters
- post: Data as received from Gab
Returns
Mapped item
Inherited Members
- backend.lib.worker.BasicWorker
- BasicWorker
- INTERRUPT_NONE
- INTERRUPT_RETRY
- INTERRUPT_CANCEL
- queue
- log
- manager
- interrupted
- modules
- init_time
- name
- run
- clean_up
- request_interrupt
- is_4cat_class
- backend.lib.search.Search
- max_workers
- prefix
- return_cols
- import_error_count
- import_warning_count
- process
- search
- import_from_file
- items_to_csv
- items_to_ndjson
- items_to_archive
- backend.lib.processor.BasicProcessor
- db
- job
- dataset
- owner
- source_dataset
- source_file
- config
- is_running_in_preset
- filepath
- work
- after_process
- remove_files
- abort
- iterate_proxied_requests
- push_proxied_request
- flush_proxied_requests
- iterate_archive_contents
- unpack_archive_contents
- extract_archived_file_by_name
- write_csv_items_and_finish
- write_archive_and_finish
- create_standalone
- save_annotations
- map_item_method_available
- get_mapped_item
- is_filter
- get_options
- get_status
- is_top_dataset
- is_from_collector
- get_extension
- is_rankable
- exclude_followup_processors
- is_4cat_processor