Edit on GitHub

datasources.gab.search_gab

Import scraped Gab data

View Source

  1"""
  2Import scraped Gab data
  3"""
  4import datetime
  5
  6from backend.lib.search import Search
  7from common.lib.item_mapping import MappedItem, MissingMappedField
  8
  9
 class SearchGab(Search):
     """
     Import scraped Gab data

     Datasource for Gab posts collected with an external tool such as
     Zeeschuimer. Searching is not implemented (see `get_items`); the class
     only provides `map_item`, which flattens one imported post.
     """
     type = "gab-search"  # job ID
     category = "Search"  # category
     title = "Import scraped Gab data"  # title displayed in UI
     description = "Import Gab data collected with an external tool such as Zeeschuimer."  # description displayed in UI
     extension = "ndjson"  # extension of result file, used internally and in UI
     is_from_zeeschuimer = True
     fake = ""

     # not available as a processor for existing datasets
     accepts = [None]

     def get_items(self, query):
         """
         Run custom search

         Not available for Gab

         :param query:  Search query; unused
         :raises NotImplementedError:  Always
         """
         raise NotImplementedError("Gab datasets can only be created by importing data from elsewhere")

     @staticmethod
     def map_item(post):
         """
         Parse Gab post

         Most fields are looked up under an abbreviated key first (e.g. `i`,
         `c`, `ca`) and under their full-length name otherwise (e.g. `id`,
         `content`, `created_at`).

         :param dict post:  Data as received from Gab
         :return MappedItem:  Mapped item; if media attachments could not be
           mapped, a message describing them is attached
         :raises KeyError:  If a required post or account field is present
           under neither of its keys
         """
         def required(source, short_key, long_key):
             # Index the long key only when the short one is absent; passing
             # source[long_key] as a .get() default would evaluate it (and
             # raise KeyError) even for items that only have the short key
             return source[short_key] if short_key in source else source[long_key]

         unknown_data = []
         post_id = required(post, "i", "id")
         metadata = post.get("__import_meta") or {}

         collected_at = metadata.get("timestamp_collected")
         if collected_at:
             # value is divided by 1000 before conversion - presumably milliseconds
             timestamp_collected = datetime.datetime.fromtimestamp(collected_at / 1000).strftime("%Y-%m-%d %H:%M:%S")
         else:
             timestamp_collected = MissingMappedField("Unknown")

         # reaction_type seems to just be numeric keys; unsure which reactions they map to
         reactions = post.get("rc", post.get("reactions_counts"))
         if isinstance(reactions, int):
             reaction_count = reactions
         elif reactions is None:
             # neither key present: flag as missing instead of crashing
             reaction_count = MissingMappedField(0)
         else:
             reaction_count = sum(reactions.values())

         # Other dictionaries are nested in the post dictionary; `or` also
         # covers values that are explicitly null in the source data
         group = post.get("g", post.get("group")) or {}
         author = post.get("author_info", post.get("account")) or {}
         mentions = post.get("m", post.get("mentions")) or []
         tags = post.get("tg", post.get("tags")) or []
         # card or link
         card = post.get("card", post.get("link")) or {}
         # media or image_info
         media_items = post.get("image_info", post.get("media_attachments")) or []

         image_urls = []
         video_urls = []
         for media in media_items:
             media_type = media.get("t", media.get("type"))
             if media_type == "image":
                 media_url, target = media.get("u", media.get("url")), image_urls
             elif media_type == "video":
                 media_url, target = media.get("smp4", media.get("source_mp4")), video_urls
             else:
                 unknown_data.append(f"Unknown media type: {media}")
                 continue

             if media_url is None:
                 # report the attachment itself; the missing URL carries no information
                 unknown_data.append(f"Media missing URL: {media}")
             else:
                 target.append(media_url)

         post_time = datetime.datetime.strptime(post.get("ca", post.get("created_at")), "%Y-%m-%dT%H:%M:%S.%fZ")
         created_at = post_time.strftime("%Y-%m-%d %H:%M:%S")

         mapped_item = {
             "collected_at": timestamp_collected,
             "source_url": metadata.get("source_platform_url", MissingMappedField("Unknown")),  # URL from which post was collected
             "id": post_id,
             "created_at": created_at,
             "body": required(post, "c", "content"),
             "url": required(post, "ul", "url"),
             "reaction_count": reaction_count,
             "favourites_count": post.get("fbc", post.get("favourites_count")),
             # NOTE(review): "rc" is also the short key read for the reaction
             # counts above - confirm which of the two it really abbreviates
             "replies_count": post.get("rc", post.get("replies_count")),
             "reblogs_count": post.get("rbc", post.get("reblogs_count")),
             "mentions": ",".join([mention["username"] for mention in mentions]),
             "tags": ",".join([tag["name"] for tag in tags]),

             "group_id": group["id"] if group else None,
             "group_title": group["title"] if group else None,
             "group_description": group["description"] if group else None,
             "group_member_count": group["member_count"] if group else None,
             "group_is_private": group["is_private"] if group else None,
             "group_url": group["url"] if group else None,
             "group_created_at": group.get("created_at") if group else None,

             "account_id": required(author, "i", "id"),
             "account_username": required(author, "un", "username"),
             "account_account": required(author, "ac", "acct"),
             "account_display_name": required(author, "dn", "display_name"),
             "account_note": required(author, "nt", "note"),

             "link_id": card["id"] if card else None,
             "link_url": card["url"] if card else None,
             "link_title": card["title"] if card else None,
             "link_description": card["description"] if card else None,
             "link_type": card["type"] if card else None,
             "link_image": card["image"] if card else None,

             "image_urls": ",".join(image_urls),
             "video_urls": ",".join(video_urls),

             # NOTE(review): "i" is the short key for the post ID, so short-key
             # items get their own ID as thread ID - confirm this is intended
             "thread_id": required(post, "i", "conversation_id"),
             "timestamp": created_at
         }

         if unknown_data:
             # separate the individual warnings so they remain readable
             return MappedItem(mapped_item, message="; ".join(unknown_data))
         return MappedItem(mapped_item)

class SearchGab(backend.lib.search.Search): View Source

 class SearchGab(Search):
     """
     Import scraped Gab data

     Datasource for Gab posts collected with an external tool such as
     Zeeschuimer. Searching is not implemented (see `get_items`); the class
     only provides `map_item`, which flattens one imported post.
     """
     type = "gab-search"  # job ID
     category = "Search"  # category
     title = "Import scraped Gab data"  # title displayed in UI
     description = "Import Gab data collected with an external tool such as Zeeschuimer."  # description displayed in UI
     extension = "ndjson"  # extension of result file, used internally and in UI
     is_from_zeeschuimer = True
     fake = ""

     # not available as a processor for existing datasets
     accepts = [None]

     def get_items(self, query):
         """
         Run custom search

         Not available for Gab

         :param query:  Search query; unused
         :raises NotImplementedError:  Always
         """
         raise NotImplementedError("Gab datasets can only be created by importing data from elsewhere")

     @staticmethod
     def map_item(post):
         """
         Parse Gab post

         Most fields are looked up under an abbreviated key first (e.g. `i`,
         `c`, `ca`) and under their full-length name otherwise (e.g. `id`,
         `content`, `created_at`).

         :param dict post:  Data as received from Gab
         :return MappedItem:  Mapped item; if media attachments could not be
           mapped, a message describing them is attached
         :raises KeyError:  If a required post or account field is present
           under neither of its keys
         """
         def required(source, short_key, long_key):
             # Index the long key only when the short one is absent; passing
             # source[long_key] as a .get() default would evaluate it (and
             # raise KeyError) even for items that only have the short key
             return source[short_key] if short_key in source else source[long_key]

         unknown_data = []
         post_id = required(post, "i", "id")
         metadata = post.get("__import_meta") or {}

         collected_at = metadata.get("timestamp_collected")
         if collected_at:
             # value is divided by 1000 before conversion - presumably milliseconds
             timestamp_collected = datetime.datetime.fromtimestamp(collected_at / 1000).strftime("%Y-%m-%d %H:%M:%S")
         else:
             timestamp_collected = MissingMappedField("Unknown")

         # reaction_type seems to just be numeric keys; unsure which reactions they map to
         reactions = post.get("rc", post.get("reactions_counts"))
         if isinstance(reactions, int):
             reaction_count = reactions
         elif reactions is None:
             # neither key present: flag as missing instead of crashing
             reaction_count = MissingMappedField(0)
         else:
             reaction_count = sum(reactions.values())

         # Other dictionaries are nested in the post dictionary; `or` also
         # covers values that are explicitly null in the source data
         group = post.get("g", post.get("group")) or {}
         author = post.get("author_info", post.get("account")) or {}
         mentions = post.get("m", post.get("mentions")) or []
         tags = post.get("tg", post.get("tags")) or []
         # card or link
         card = post.get("card", post.get("link")) or {}
         # media or image_info
         media_items = post.get("image_info", post.get("media_attachments")) or []

         image_urls = []
         video_urls = []
         for media in media_items:
             media_type = media.get("t", media.get("type"))
             if media_type == "image":
                 media_url, target = media.get("u", media.get("url")), image_urls
             elif media_type == "video":
                 media_url, target = media.get("smp4", media.get("source_mp4")), video_urls
             else:
                 unknown_data.append(f"Unknown media type: {media}")
                 continue

             if media_url is None:
                 # report the attachment itself; the missing URL carries no information
                 unknown_data.append(f"Media missing URL: {media}")
             else:
                 target.append(media_url)

         post_time = datetime.datetime.strptime(post.get("ca", post.get("created_at")), "%Y-%m-%dT%H:%M:%S.%fZ")
         created_at = post_time.strftime("%Y-%m-%d %H:%M:%S")

         mapped_item = {
             "collected_at": timestamp_collected,
             "source_url": metadata.get("source_platform_url", MissingMappedField("Unknown")),  # URL from which post was collected
             "id": post_id,
             "created_at": created_at,
             "body": required(post, "c", "content"),
             "url": required(post, "ul", "url"),
             "reaction_count": reaction_count,
             "favourites_count": post.get("fbc", post.get("favourites_count")),
             # NOTE(review): "rc" is also the short key read for the reaction
             # counts above - confirm which of the two it really abbreviates
             "replies_count": post.get("rc", post.get("replies_count")),
             "reblogs_count": post.get("rbc", post.get("reblogs_count")),
             "mentions": ",".join([mention["username"] for mention in mentions]),
             "tags": ",".join([tag["name"] for tag in tags]),

             "group_id": group["id"] if group else None,
             "group_title": group["title"] if group else None,
             "group_description": group["description"] if group else None,
             "group_member_count": group["member_count"] if group else None,
             "group_is_private": group["is_private"] if group else None,
             "group_url": group["url"] if group else None,
             "group_created_at": group.get("created_at") if group else None,

             "account_id": required(author, "i", "id"),
             "account_username": required(author, "un", "username"),
             "account_account": required(author, "ac", "acct"),
             "account_display_name": required(author, "dn", "display_name"),
             "account_note": required(author, "nt", "note"),

             "link_id": card["id"] if card else None,
             "link_url": card["url"] if card else None,
             "link_title": card["title"] if card else None,
             "link_description": card["description"] if card else None,
             "link_type": card["type"] if card else None,
             "link_image": card["image"] if card else None,

             "image_urls": ",".join(image_urls),
             "video_urls": ",".join(video_urls),

             # NOTE(review): "i" is the short key for the post ID, so short-key
             # items get their own ID as thread ID - confirm this is intended
             "thread_id": required(post, "i", "conversation_id"),
             "timestamp": created_at
         }

         if unknown_data:
             # separate the individual warnings so they remain readable
             return MappedItem(mapped_item, message="; ".join(unknown_data))
         return MappedItem(mapped_item)

Import scraped Gab data

type = 'gab-search'

category = 'Search'

title = 'Import scraped Gab data'

description = 'Import Gab data collected with an external tool such as Zeeschuimer.'

extension = 'ndjson'

is_from_zeeschuimer = True

fake = ''

accepts = [None]

def get_items(self, query): View Source

def get_items(self, query):
    """
    Refuse to run a search

    Gab data cannot be collected through a query; it has to be imported
    from elsewhere instead.

    :param query:  Search query; ignored
    :raises NotImplementedError:  On every call
    """
    reason = "Gab datasets can only be created by importing data from elsewhere"
    raise NotImplementedError(reason)

Run custom search

Not available for Gab

@staticmethod

def map_item(post): View Source

 @staticmethod
 def map_item(post):
     """
     Parse Gab post

     Most fields are looked up under an abbreviated key first (e.g. `i`,
     `c`, `ca`) and under their full-length name otherwise (e.g. `id`,
     `content`, `created_at`).

     :param dict post:  Data as received from Gab
     :return MappedItem:  Mapped item; if media attachments could not be
       mapped, a message describing them is attached
     :raises KeyError:  If a required post or account field is present
       under neither of its keys
     """
     def required(source, short_key, long_key):
         # Index the long key only when the short one is absent; passing
         # source[long_key] as a .get() default would evaluate it (and
         # raise KeyError) even for items that only have the short key
         return source[short_key] if short_key in source else source[long_key]

     unknown_data = []
     post_id = required(post, "i", "id")
     metadata = post.get("__import_meta") or {}

     collected_at = metadata.get("timestamp_collected")
     if collected_at:
         # value is divided by 1000 before conversion - presumably milliseconds
         timestamp_collected = datetime.datetime.fromtimestamp(collected_at / 1000).strftime("%Y-%m-%d %H:%M:%S")
     else:
         timestamp_collected = MissingMappedField("Unknown")

     # reaction_type seems to just be numeric keys; unsure which reactions they map to
     reactions = post.get("rc", post.get("reactions_counts"))
     if isinstance(reactions, int):
         reaction_count = reactions
     elif reactions is None:
         # neither key present: flag as missing instead of crashing
         reaction_count = MissingMappedField(0)
     else:
         reaction_count = sum(reactions.values())

     # Other dictionaries are nested in the post dictionary; `or` also
     # covers values that are explicitly null in the source data
     group = post.get("g", post.get("group")) or {}
     author = post.get("author_info", post.get("account")) or {}
     mentions = post.get("m", post.get("mentions")) or []
     tags = post.get("tg", post.get("tags")) or []
     # card or link
     card = post.get("card", post.get("link")) or {}
     # media or image_info
     media_items = post.get("image_info", post.get("media_attachments")) or []

     image_urls = []
     video_urls = []
     for media in media_items:
         media_type = media.get("t", media.get("type"))
         if media_type == "image":
             media_url, target = media.get("u", media.get("url")), image_urls
         elif media_type == "video":
             media_url, target = media.get("smp4", media.get("source_mp4")), video_urls
         else:
             unknown_data.append(f"Unknown media type: {media}")
             continue

         if media_url is None:
             # report the attachment itself; the missing URL carries no information
             unknown_data.append(f"Media missing URL: {media}")
         else:
             target.append(media_url)

     post_time = datetime.datetime.strptime(post.get("ca", post.get("created_at")), "%Y-%m-%dT%H:%M:%S.%fZ")
     created_at = post_time.strftime("%Y-%m-%d %H:%M:%S")

     mapped_item = {
         "collected_at": timestamp_collected,
         "source_url": metadata.get("source_platform_url", MissingMappedField("Unknown")),  # URL from which post was collected
         "id": post_id,
         "created_at": created_at,
         "body": required(post, "c", "content"),
         "url": required(post, "ul", "url"),
         "reaction_count": reaction_count,
         "favourites_count": post.get("fbc", post.get("favourites_count")),
         # NOTE(review): "rc" is also the short key read for the reaction
         # counts above - confirm which of the two it really abbreviates
         "replies_count": post.get("rc", post.get("replies_count")),
         "reblogs_count": post.get("rbc", post.get("reblogs_count")),
         "mentions": ",".join([mention["username"] for mention in mentions]),
         "tags": ",".join([tag["name"] for tag in tags]),

         "group_id": group["id"] if group else None,
         "group_title": group["title"] if group else None,
         "group_description": group["description"] if group else None,
         "group_member_count": group["member_count"] if group else None,
         "group_is_private": group["is_private"] if group else None,
         "group_url": group["url"] if group else None,
         "group_created_at": group.get("created_at") if group else None,

         "account_id": required(author, "i", "id"),
         "account_username": required(author, "un", "username"),
         "account_account": required(author, "ac", "acct"),
         "account_display_name": required(author, "dn", "display_name"),
         "account_note": required(author, "nt", "note"),

         "link_id": card["id"] if card else None,
         "link_url": card["url"] if card else None,
         "link_title": card["title"] if card else None,
         "link_description": card["description"] if card else None,
         "link_type": card["type"] if card else None,
         "link_image": card["image"] if card else None,

         "image_urls": ",".join(image_urls),
         "video_urls": ",".join(video_urls),

         # NOTE(review): "i" is the short key for the post ID, so short-key
         # items get their own ID as thread ID - confirm this is intended
         "thread_id": required(post, "i", "conversation_id"),
         "timestamp": created_at
     }

     if unknown_data:
         # separate the individual warnings so they remain readable
         return MappedItem(mapped_item, message="; ".join(unknown_data))
     return MappedItem(mapped_item)

Parse Gab post

Parameters

post: Data as received from Gab

Returns

Mapped item

Inherited Members

backend.lib.worker.BasicWorker: BasicWorker; INTERRUPT_NONE; INTERRUPT_RETRY; INTERRUPT_CANCEL; queue; log; manager; interrupted; modules; init_time; name; run; clean_up; request_interrupt; is_4cat_class
backend.lib.search.Search: max_workers; prefix; return_cols; import_error_count; import_warning_count; process; search; import_from_file; items_to_csv; items_to_ndjson; items_to_archive
backend.lib.processor.BasicProcessor: db; job; dataset; owner; source_dataset; source_file; config; is_running_in_preset; is_hidden; filepath; work; after_process; remove_files; abort; iterate_proxied_requests; push_proxied_request; flush_proxied_requests; iterate_archive_contents; unpack_archive_contents; extract_archived_file_by_name; write_csv_items_and_finish; write_archive_and_finish; create_standalone; save_annotations; map_item_method_available; get_mapped_item; is_filter; get_options; get_status; is_top_dataset; is_from_collector; get_extension; is_rankable; exclude_followup_processors; is_4cat_processor