datasources.gab.search_gab
Import scraped Gab data
"""
Import scraped Gab data
"""
import datetime

from backend.lib.search import Search
from common.lib.item_mapping import MappedItem, MissingMappedField
from common.config_manager import config


class SearchGab(Search):
    """
    Import scraped Gab data

    Gab datasets cannot be collected through `get_items()`; they are created
    by importing data collected with an external tool, after which
    `map_item()` flattens each imported post.
    """
    type = "gab-search"  # job ID
    category = "Search"  # category
    title = "Import scraped Gab data"  # title displayed in UI
    description = "Import Gab data collected with an external tool such as Zeeschuimer."  # description displayed in UI
    extension = "ndjson"  # extension of result file, used internally and in UI
    is_from_zeeschuimer = True
    fake = ""

    # not available as a processor for existing datasets
    accepts = [None]

    def get_items(self, query):
        """
        Run custom search

        Not available for Gab

        :param query:  Search query parameters (unused)
        :raises NotImplementedError:  Always
        """
        raise NotImplementedError("Gab datasets can only be created by importing data from elsewhere")

    @staticmethod
    def map_item(post):
        """
        Parse Gab post

        Most fields are looked up under an abbreviated key first (e.g. `i`,
        `c`, `ca`) and under a longer key otherwise (e.g. `id`, `content`,
        `created_at`).

        :param dict post:  Data as received from Gab
        :return MappedItem:  Mapped item
        :raises KeyError:  If a required field is found under neither of its
        keys
        """
        def _required(source, short_key, long_key):
            # Only touch the long key when the short one is absent; looking
            # it up eagerly would raise for posts that only use short keys
            return source[short_key] if short_key in source else source[long_key]

        def _optional(source, short_key, long_key, default=None):
            if short_key in source:
                return source[short_key]
            return source.get(long_key, default)

        date_format = "%Y-%m-%d %H:%M:%S"

        post_id = _required(post, "i", "id")
        metadata = post.get("__import_meta") or {}

        # the collection timestamp is divided by 1000 before conversion
        collected_ms = metadata.get("timestamp_collected")
        if collected_ms:
            timestamp_collected = datetime.datetime.fromtimestamp(collected_ms / 1000).strftime(date_format)
        else:
            timestamp_collected = MissingMappedField("Unknown")

        # reactions are either a mapping of reaction type -> count or a plain
        # number; reaction types seem to just be numeric keys and it is
        # unclear which reactions they map to
        # NOTE(review): "rc" is also read as the short key for replies_count
        # below - confirm which of the two it really stands for
        reactions = _optional(post, "rc", "reactions_counts")
        if isinstance(reactions, dict):
            reaction_count = sum(reactions.values())
        elif isinstance(reactions, int):
            reaction_count = reactions
        else:
            # no usable reaction data; flag the field instead of crashing
            reaction_count = MissingMappedField(0)

        # Other dictionaries are nested in the post dictionary; `or` also
        # covers keys that are present but empty (e.g. JSON null)
        group = _optional(post, "g", "group") or {}
        author = _optional(post, "author_info", "account", {})
        mentions = _optional(post, "m", "mentions") or []
        tags = _optional(post, "tg", "tags") or []
        # card or link
        card = _optional(post, "card", "link") or {}
        # media or image_info
        media_items = _optional(post, "image_info", "media_attachments") or []

        image_urls = []
        video_urls = []
        has_unknown_type = False
        has_missing_url = False
        for media in media_items:
            media_type = _optional(media, "t", "type")
            if media_type == "image":
                media_url = _optional(media, "u", "url")
                target = image_urls
            elif media_type == "video":
                media_url = _optional(media, "smp4", "source_mp4")
                target = video_urls
            else:
                has_unknown_type = True
                continue

            if media_url is None:
                has_missing_url = True
            else:
                target.append(media_url)

        if has_unknown_type:
            # TODO: Use MappedItem message; currently it is not called...
            config.with_db()
            config.db.log.warning(f"Unknown media type in post {post_id}: {media_items}")
        if has_missing_url:
            config.with_db()
            config.db.log.warning(f"Missing media URL in post {post_id}: {media_items}")

        post_time = datetime.datetime.strptime(_optional(post, "ca", "created_at"), "%Y-%m-%dT%H:%M:%S.%fZ")
        created_at = post_time.strftime(date_format)

        mapped_item = {
            "collected_at": timestamp_collected,
            "source_url": metadata.get("source_platform_url", MissingMappedField("Unknown")),  # URL from which post was collected
            "id": post_id,
            "created_at": created_at,
            "body": _required(post, "c", "content"),
            "url": _required(post, "ul", "url"),
            "reaction_count": reaction_count,
            "favourites_count": _optional(post, "fbc", "favourites_count"),
            "replies_count": _optional(post, "rc", "replies_count"),
            "reblogs_count": _optional(post, "rbc", "reblogs_count"),
            "mentions": ",".join([mention["username"] for mention in mentions]),
            "tags": ",".join([tag["name"] for tag in tags]),

            "group_id": group["id"] if group else None,
            "group_title": group["title"] if group else None,
            "group_description": group["description"] if group else None,
            "group_member_count": group["member_count"] if group else None,
            "group_is_private": group["is_private"] if group else None,
            "group_url": group["url"] if group else None,
            "group_created_at": group.get("created_at") if group else None,

            "account_id": _required(author, "i", "id"),
            "account_username": _required(author, "un", "username"),
            "account_account": _required(author, "ac", "acct"),
            "account_display_name": _required(author, "dn", "display_name"),
            "account_note": _required(author, "nt", "note"),

            "link_id": card["id"] if card else None,
            "link_url": card["url"] if card else None,
            "link_title": card["title"] if card else None,
            "link_description": card["description"] if card else None,
            "link_type": card["type"] if card else None,
            "link_image": card["image"] if card else None,

            "image_urls": ",".join(image_urls),
            "video_urls": ",".join(video_urls),

            # NOTE(review): the short key used here is the post's own ID key
            # ("i"); confirm this is the intended thread identifier
            "thread_id": _required(post, "i", "conversation_id"),
            "timestamp": created_at
        }

        return MappedItem(mapped_item)
class SearchGab(Search):
    """
    Import scraped Gab data

    Gab datasets cannot be collected through `get_items()`; they are created
    by importing data collected with an external tool, after which
    `map_item()` flattens each imported post.
    """
    type = "gab-search"  # job ID
    category = "Search"  # category
    title = "Import scraped Gab data"  # title displayed in UI
    description = "Import Gab data collected with an external tool such as Zeeschuimer."  # description displayed in UI
    extension = "ndjson"  # extension of result file, used internally and in UI
    is_from_zeeschuimer = True
    fake = ""

    # not available as a processor for existing datasets
    accepts = [None]

    def get_items(self, query):
        """
        Run custom search

        Not available for Gab

        :param query:  Search query parameters (unused)
        :raises NotImplementedError:  Always
        """
        raise NotImplementedError("Gab datasets can only be created by importing data from elsewhere")

    @staticmethod
    def map_item(post):
        """
        Parse Gab post

        Most fields are looked up under an abbreviated key first (e.g. `i`,
        `c`, `ca`) and under a longer key otherwise (e.g. `id`, `content`,
        `created_at`).

        :param dict post:  Data as received from Gab
        :return MappedItem:  Mapped item
        :raises KeyError:  If a required field is found under neither of its
        keys
        """
        def _required(source, short_key, long_key):
            # Only touch the long key when the short one is absent; looking
            # it up eagerly would raise for posts that only use short keys
            return source[short_key] if short_key in source else source[long_key]

        def _optional(source, short_key, long_key, default=None):
            if short_key in source:
                return source[short_key]
            return source.get(long_key, default)

        date_format = "%Y-%m-%d %H:%M:%S"

        post_id = _required(post, "i", "id")
        metadata = post.get("__import_meta") or {}

        # the collection timestamp is divided by 1000 before conversion
        collected_ms = metadata.get("timestamp_collected")
        if collected_ms:
            timestamp_collected = datetime.datetime.fromtimestamp(collected_ms / 1000).strftime(date_format)
        else:
            timestamp_collected = MissingMappedField("Unknown")

        # reactions are either a mapping of reaction type -> count or a plain
        # number; reaction types seem to just be numeric keys and it is
        # unclear which reactions they map to
        # NOTE(review): "rc" is also read as the short key for replies_count
        # below - confirm which of the two it really stands for
        reactions = _optional(post, "rc", "reactions_counts")
        if isinstance(reactions, dict):
            reaction_count = sum(reactions.values())
        elif isinstance(reactions, int):
            reaction_count = reactions
        else:
            # no usable reaction data; flag the field instead of crashing
            reaction_count = MissingMappedField(0)

        # Other dictionaries are nested in the post dictionary; `or` also
        # covers keys that are present but empty (e.g. JSON null)
        group = _optional(post, "g", "group") or {}
        author = _optional(post, "author_info", "account", {})
        mentions = _optional(post, "m", "mentions") or []
        tags = _optional(post, "tg", "tags") or []
        # card or link
        card = _optional(post, "card", "link") or {}
        # media or image_info
        media_items = _optional(post, "image_info", "media_attachments") or []

        image_urls = []
        video_urls = []
        has_unknown_type = False
        has_missing_url = False
        for media in media_items:
            media_type = _optional(media, "t", "type")
            if media_type == "image":
                media_url = _optional(media, "u", "url")
                target = image_urls
            elif media_type == "video":
                media_url = _optional(media, "smp4", "source_mp4")
                target = video_urls
            else:
                has_unknown_type = True
                continue

            if media_url is None:
                has_missing_url = True
            else:
                target.append(media_url)

        if has_unknown_type:
            # TODO: Use MappedItem message; currently it is not called...
            config.with_db()
            config.db.log.warning(f"Unknown media type in post {post_id}: {media_items}")
        if has_missing_url:
            config.with_db()
            config.db.log.warning(f"Missing media URL in post {post_id}: {media_items}")

        post_time = datetime.datetime.strptime(_optional(post, "ca", "created_at"), "%Y-%m-%dT%H:%M:%S.%fZ")
        created_at = post_time.strftime(date_format)

        mapped_item = {
            "collected_at": timestamp_collected,
            "source_url": metadata.get("source_platform_url", MissingMappedField("Unknown")),  # URL from which post was collected
            "id": post_id,
            "created_at": created_at,
            "body": _required(post, "c", "content"),
            "url": _required(post, "ul", "url"),
            "reaction_count": reaction_count,
            "favourites_count": _optional(post, "fbc", "favourites_count"),
            "replies_count": _optional(post, "rc", "replies_count"),
            "reblogs_count": _optional(post, "rbc", "reblogs_count"),
            "mentions": ",".join([mention["username"] for mention in mentions]),
            "tags": ",".join([tag["name"] for tag in tags]),

            "group_id": group["id"] if group else None,
            "group_title": group["title"] if group else None,
            "group_description": group["description"] if group else None,
            "group_member_count": group["member_count"] if group else None,
            "group_is_private": group["is_private"] if group else None,
            "group_url": group["url"] if group else None,
            "group_created_at": group.get("created_at") if group else None,

            "account_id": _required(author, "i", "id"),
            "account_username": _required(author, "un", "username"),
            "account_account": _required(author, "ac", "acct"),
            "account_display_name": _required(author, "dn", "display_name"),
            "account_note": _required(author, "nt", "note"),

            "link_id": card["id"] if card else None,
            "link_url": card["url"] if card else None,
            "link_title": card["title"] if card else None,
            "link_description": card["description"] if card else None,
            "link_type": card["type"] if card else None,
            "link_image": card["image"] if card else None,

            "image_urls": ",".join(image_urls),
            "video_urls": ",".join(video_urls),

            # NOTE(review): the short key used here is the post's own ID key
            # ("i"); confirm this is the intended thread identifier
            "thread_id": _required(post, "i", "conversation_id"),
            "timestamp": created_at
        }

        return MappedItem(mapped_item)
Import scraped Gab data
def
get_items(self, query):
def get_items(self, query):
    """
    Run custom search

    Not available for Gab: this method never returns items.

    :param query:  Search query parameters (unused)
    :raises NotImplementedError:  Always
    """
    reason = "Gab datasets can only be created by importing data from elsewhere"
    raise NotImplementedError(reason)
Run custom search
Not available for Gab
@staticmethod
def
map_item(post):
@staticmethod
def map_item(post):
    """
    Parse Gab post

    Most fields are looked up under an abbreviated key first (e.g. `i`,
    `c`, `ca`) and under a longer key otherwise (e.g. `id`, `content`,
    `created_at`).

    :param dict post:  Data as received from Gab
    :return MappedItem:  Mapped item
    :raises KeyError:  If a required field is found under neither of its
    keys
    """
    def _required(source, short_key, long_key):
        # Only touch the long key when the short one is absent; looking it
        # up eagerly would raise for posts that only use short keys
        return source[short_key] if short_key in source else source[long_key]

    def _optional(source, short_key, long_key, default=None):
        if short_key in source:
            return source[short_key]
        return source.get(long_key, default)

    date_format = "%Y-%m-%d %H:%M:%S"

    post_id = _required(post, "i", "id")
    metadata = post.get("__import_meta") or {}

    # the collection timestamp is divided by 1000 before conversion
    collected_ms = metadata.get("timestamp_collected")
    if collected_ms:
        timestamp_collected = datetime.datetime.fromtimestamp(collected_ms / 1000).strftime(date_format)
    else:
        timestamp_collected = MissingMappedField("Unknown")

    # reactions are either a mapping of reaction type -> count or a plain
    # number; reaction types seem to just be numeric keys and it is unclear
    # which reactions they map to
    # NOTE(review): "rc" is also read as the short key for replies_count
    # below - confirm which of the two it really stands for
    reactions = _optional(post, "rc", "reactions_counts")
    if isinstance(reactions, dict):
        reaction_count = sum(reactions.values())
    elif isinstance(reactions, int):
        reaction_count = reactions
    else:
        # no usable reaction data; flag the field instead of crashing
        reaction_count = MissingMappedField(0)

    # Other dictionaries are nested in the post dictionary; `or` also covers
    # keys that are present but empty (e.g. JSON null)
    group = _optional(post, "g", "group") or {}
    author = _optional(post, "author_info", "account", {})
    mentions = _optional(post, "m", "mentions") or []
    tags = _optional(post, "tg", "tags") or []
    # card or link
    card = _optional(post, "card", "link") or {}
    # media or image_info
    media_items = _optional(post, "image_info", "media_attachments") or []

    image_urls = []
    video_urls = []
    has_unknown_type = False
    has_missing_url = False
    for media in media_items:
        media_type = _optional(media, "t", "type")
        if media_type == "image":
            media_url = _optional(media, "u", "url")
            target = image_urls
        elif media_type == "video":
            media_url = _optional(media, "smp4", "source_mp4")
            target = video_urls
        else:
            has_unknown_type = True
            continue

        if media_url is None:
            has_missing_url = True
        else:
            target.append(media_url)

    if has_unknown_type:
        # TODO: Use MappedItem message; currently it is not called...
        config.with_db()
        config.db.log.warning(f"Unknown media type in post {post_id}: {media_items}")
    if has_missing_url:
        config.with_db()
        config.db.log.warning(f"Missing media URL in post {post_id}: {media_items}")

    post_time = datetime.datetime.strptime(_optional(post, "ca", "created_at"), "%Y-%m-%dT%H:%M:%S.%fZ")
    created_at = post_time.strftime(date_format)

    mapped_item = {
        "collected_at": timestamp_collected,
        "source_url": metadata.get("source_platform_url", MissingMappedField("Unknown")),  # URL from which post was collected
        "id": post_id,
        "created_at": created_at,
        "body": _required(post, "c", "content"),
        "url": _required(post, "ul", "url"),
        "reaction_count": reaction_count,
        "favourites_count": _optional(post, "fbc", "favourites_count"),
        "replies_count": _optional(post, "rc", "replies_count"),
        "reblogs_count": _optional(post, "rbc", "reblogs_count"),
        "mentions": ",".join([mention["username"] for mention in mentions]),
        "tags": ",".join([tag["name"] for tag in tags]),

        "group_id": group["id"] if group else None,
        "group_title": group["title"] if group else None,
        "group_description": group["description"] if group else None,
        "group_member_count": group["member_count"] if group else None,
        "group_is_private": group["is_private"] if group else None,
        "group_url": group["url"] if group else None,
        "group_created_at": group.get("created_at") if group else None,

        "account_id": _required(author, "i", "id"),
        "account_username": _required(author, "un", "username"),
        "account_account": _required(author, "ac", "acct"),
        "account_display_name": _required(author, "dn", "display_name"),
        "account_note": _required(author, "nt", "note"),

        "link_id": card["id"] if card else None,
        "link_url": card["url"] if card else None,
        "link_title": card["title"] if card else None,
        "link_description": card["description"] if card else None,
        "link_type": card["type"] if card else None,
        "link_image": card["image"] if card else None,

        "image_urls": ",".join(image_urls),
        "video_urls": ",".join(video_urls),

        # NOTE(review): the short key used here is the post's own ID key
        # ("i"); confirm this is the intended thread identifier
        "thread_id": _required(post, "i", "conversation_id"),
        "timestamp": created_at
    }

    return MappedItem(mapped_item)
Parse Gab post
Parameters
- post: Data as received from Gab
Returns
Mapped item
Inherited Members
- backend.lib.worker.BasicWorker
- BasicWorker
- INTERRUPT_NONE
- INTERRUPT_RETRY
- INTERRUPT_CANCEL
- queue
- log
- manager
- interrupted
- modules
- init_time
- name
- run
- clean_up
- request_interrupt
- is_4cat_class
- backend.lib.search.Search
- max_workers
- prefix
- return_cols
- import_error_count
- import_warning_count
- process
- search
- import_from_file
- items_to_csv
- items_to_ndjson
- items_to_archive
- backend.lib.processor.BasicProcessor
- db
- job
- dataset
- owner
- source_dataset
- source_file
- config
- is_running_in_preset
- filepath
- work
- after_process
- remove_files
- abort
- add_field_to_parent
- iterate_archive_contents
- unpack_archive_contents
- extract_archived_file_by_name
- write_csv_items_and_finish
- write_archive_and_finish
- create_standalone
- map_item_method_available
- get_mapped_item
- is_filter
- get_options
- get_status
- is_top_dataset
- is_from_collector
- get_extension
- is_rankable
- exclude_followup_processors
- is_4cat_processor