Edit on GitHub

datasources.gab.search_gab

Import scraped Gab data

  1"""
  2Import scraped Gab data
  3"""
  4import datetime
  5
  6from backend.lib.search import Search
  7from common.lib.item_mapping import MappedItem, MissingMappedField
  8from common.config_manager import config
  9
 10
 11class SearchGab(Search):
 12    """
 13    Import scraped gab data
 14    """
 15    type = "gab-search"  # job ID
 16    category = "Search"  # category
 17    title = "Import scraped Gab data"  # title displayed in UI
 18    description = "Import Gab data collected with an external tool such as Zeeschuimer."  # description displayed in UI
 19    extension = "ndjson"  # extension of result file, used internally and in UI
 20    is_from_zeeschuimer = True
 21    fake = ""
 22
 23    # not available as a processor for existing datasets
 24    accepts = [None]
 25
 26    def get_items(self, query):
 27        """
 28        Run custom search
 29
 30        Not available for Gab
 31        """
 32        raise NotImplementedError("Gab datasets can only be created by importing data from elsewhere")
 33
 34    @staticmethod
 35    def map_item(post):
 36        """
 37        Parse Gab post
 38
 39        :param node:  Data as received from Gab
 40        :return dict:  Mapped item
 41        """
 42        post_id = post.get("i", post["id"])
 43        metadata = post.get("__import_meta", {})
 44        timestamp_collected = datetime.datetime.fromtimestamp(metadata.get("timestamp_collected")/1000).strftime("%Y-%m-%d %H:%M:%S") if metadata.get("timestamp_collected") else MissingMappedField("Unknown")
 45        # reaction_type seems to just be nummeric keys; unsure which reactions they map to
 46        reactions =  post.get("rc", post.get("reactions_counts"))
 47        if type(reactions) != int:
 48            reaction_count = sum([reaction_value for reaction_type, reaction_value in post.get("rc", post.get("reactions_counts")).items()])
 49        else:
 50            reaction_count = reactions
 51
 52        # Other dictionaries are nested in the post dictionary
 53        group = post.get("g", post.get("group", {}))
 54        author = post.get("author_info", post.get("account", {}))
 55        mentions = post.get("m", post.get("mentions", []))
 56        tags = post.get("tg", post.get("tags", []))
 57        # card or link
 58        card = post.get("card", post.get("link", {}))
 59        # media or image_info
 60        media_items = post.get("image_info", post.get("media_attachments", []))
 61        image_urls = [media.get("u", media.get("url")) for media in media_items if media.get("t", media.get("type")) == "image"]
 62        video_urls = [media.get("smp4", media.get("source_mp4")) for media in media_items if media.get("t", media.get("type")) == "video"]
 63        if any([media_type not in ["image", "video"] for media_type in [media.get("t", media.get("type")) for media in media_items]]):
 64            # TODO: Use MappedItem message; currently it is not called...
 65            config.with_db()
 66            config.db.log.warning(f"Unknown media type in post {post_id}: {media_items}")
 67        if any([True for vid in video_urls if vid is None]) or any([True for img in image_urls if img is None]):
 68            config.with_db()
 69            config.db.log.warning(f"Missing media URL in post {post_id}: {media_items}")
 70            image_urls = [img for img in image_urls if img is not None]
 71            video_urls = [vid for vid in video_urls if vid is not None]
 72        
 73        post_time = datetime.datetime.strptime(post.get("ca", post.get("created_at")), "%Y-%m-%dT%H:%M:%S.%fZ")
 74        mapped_item = {
 75            "collected_at": timestamp_collected,
 76            "source_url": metadata.get("source_platform_url", MissingMappedField("Unknown")), # URL from which post was collected
 77            "id": post_id,
 78            "created_at": post_time.strftime("%Y-%m-%d %H:%M:%S"),
 79            "body": post.get("c") if "c" in post else post["content"],
 80            "url": post.get("ul") if "ul" in post else post["url"],
 81            "reaction_count": reaction_count,
 82            "favourites_count": post.get("fbc", post.get("favourites_count")),
 83            "replies_count": post.get("rc", post.get("replies_count")),
 84            "reblogs_count": post.get("rbc", post.get("reblogs_count")),
 85            "mentions": ",".join([mention["username"] for mention in mentions]),
 86            "tags": ",".join([tag["name"] for tag in tags]),	
 87
 88            "group_id": group["id"] if group else None,
 89            "group_title": group["title"] if group else None,
 90            "group_description": group["description"] if group else None,
 91            "group_member_count": group["member_count"] if group else None,
 92            "group_is_private": group["is_private"] if group else None,
 93            "group_url": group["url"] if group else None,
 94            "group_created_at": group.get("created_at") if group else None,
 95
 96            "account_id": author.get("i") if "i" in author else author["id"],
 97            "account_username": author.get("un") if "un" in author else author["username"],
 98            "account_account": author.get("ac") if "ac"in author else author["acct"],
 99            "account_display_name": author.get("dn") if "dn" in author else author["display_name"],
100            "account_note": author.get("nt") if "nt" in author else author["note"],
101
102            "link_id": card["id"] if card else None,
103            "link_url": card["url"] if card else None,
104            "link_title": card["title"] if card else None,
105            "link_description": card["description"] if card else None,
106            "link_type": card["type"] if card else None,
107            "link_image": card["image"] if card else None,
108
109            "image_urls": ",".join(image_urls),
110            "video_urls": ",".join(video_urls),
111
112            "thread_id": post.get("i") if "i" in post else post["conversation_id"],
113            "timestamp": post_time.strftime("%Y-%m-%d %H:%M:%S")
114        }        
115    
116        return MappedItem(mapped_item)
class SearchGab(backend.lib.search.Search):
 12class SearchGab(Search):
 13    """
 14    Import scraped gab data
 15    """
 16    type = "gab-search"  # job ID
 17    category = "Search"  # category
 18    title = "Import scraped Gab data"  # title displayed in UI
 19    description = "Import Gab data collected with an external tool such as Zeeschuimer."  # description displayed in UI
 20    extension = "ndjson"  # extension of result file, used internally and in UI
 21    is_from_zeeschuimer = True
 22    fake = ""
 23
 24    # not available as a processor for existing datasets
 25    accepts = [None]
 26
 27    def get_items(self, query):
 28        """
 29        Run custom search
 30
 31        Not available for Gab
 32        """
 33        raise NotImplementedError("Gab datasets can only be created by importing data from elsewhere")
 34
 35    @staticmethod
 36    def map_item(post):
 37        """
 38        Parse Gab post
 39
 40        :param node:  Data as received from Gab
 41        :return dict:  Mapped item
 42        """
 43        post_id = post.get("i", post["id"])
 44        metadata = post.get("__import_meta", {})
 45        timestamp_collected = datetime.datetime.fromtimestamp(metadata.get("timestamp_collected")/1000).strftime("%Y-%m-%d %H:%M:%S") if metadata.get("timestamp_collected") else MissingMappedField("Unknown")
 46        # reaction_type seems to just be nummeric keys; unsure which reactions they map to
 47        reactions =  post.get("rc", post.get("reactions_counts"))
 48        if type(reactions) != int:
 49            reaction_count = sum([reaction_value for reaction_type, reaction_value in post.get("rc", post.get("reactions_counts")).items()])
 50        else:
 51            reaction_count = reactions
 52
 53        # Other dictionaries are nested in the post dictionary
 54        group = post.get("g", post.get("group", {}))
 55        author = post.get("author_info", post.get("account", {}))
 56        mentions = post.get("m", post.get("mentions", []))
 57        tags = post.get("tg", post.get("tags", []))
 58        # card or link
 59        card = post.get("card", post.get("link", {}))
 60        # media or image_info
 61        media_items = post.get("image_info", post.get("media_attachments", []))
 62        image_urls = [media.get("u", media.get("url")) for media in media_items if media.get("t", media.get("type")) == "image"]
 63        video_urls = [media.get("smp4", media.get("source_mp4")) for media in media_items if media.get("t", media.get("type")) == "video"]
 64        if any([media_type not in ["image", "video"] for media_type in [media.get("t", media.get("type")) for media in media_items]]):
 65            # TODO: Use MappedItem message; currently it is not called...
 66            config.with_db()
 67            config.db.log.warning(f"Unknown media type in post {post_id}: {media_items}")
 68        if any([True for vid in video_urls if vid is None]) or any([True for img in image_urls if img is None]):
 69            config.with_db()
 70            config.db.log.warning(f"Missing media URL in post {post_id}: {media_items}")
 71            image_urls = [img for img in image_urls if img is not None]
 72            video_urls = [vid for vid in video_urls if vid is not None]
 73        
 74        post_time = datetime.datetime.strptime(post.get("ca", post.get("created_at")), "%Y-%m-%dT%H:%M:%S.%fZ")
 75        mapped_item = {
 76            "collected_at": timestamp_collected,
 77            "source_url": metadata.get("source_platform_url", MissingMappedField("Unknown")), # URL from which post was collected
 78            "id": post_id,
 79            "created_at": post_time.strftime("%Y-%m-%d %H:%M:%S"),
 80            "body": post.get("c") if "c" in post else post["content"],
 81            "url": post.get("ul") if "ul" in post else post["url"],
 82            "reaction_count": reaction_count,
 83            "favourites_count": post.get("fbc", post.get("favourites_count")),
 84            "replies_count": post.get("rc", post.get("replies_count")),
 85            "reblogs_count": post.get("rbc", post.get("reblogs_count")),
 86            "mentions": ",".join([mention["username"] for mention in mentions]),
 87            "tags": ",".join([tag["name"] for tag in tags]),	
 88
 89            "group_id": group["id"] if group else None,
 90            "group_title": group["title"] if group else None,
 91            "group_description": group["description"] if group else None,
 92            "group_member_count": group["member_count"] if group else None,
 93            "group_is_private": group["is_private"] if group else None,
 94            "group_url": group["url"] if group else None,
 95            "group_created_at": group.get("created_at") if group else None,
 96
 97            "account_id": author.get("i") if "i" in author else author["id"],
 98            "account_username": author.get("un") if "un" in author else author["username"],
 99            "account_account": author.get("ac") if "ac"in author else author["acct"],
100            "account_display_name": author.get("dn") if "dn" in author else author["display_name"],
101            "account_note": author.get("nt") if "nt" in author else author["note"],
102
103            "link_id": card["id"] if card else None,
104            "link_url": card["url"] if card else None,
105            "link_title": card["title"] if card else None,
106            "link_description": card["description"] if card else None,
107            "link_type": card["type"] if card else None,
108            "link_image": card["image"] if card else None,
109
110            "image_urls": ",".join(image_urls),
111            "video_urls": ",".join(video_urls),
112
113            "thread_id": post.get("i") if "i" in post else post["conversation_id"],
114            "timestamp": post_time.strftime("%Y-%m-%d %H:%M:%S")
115        }        
116    
117        return MappedItem(mapped_item)

Import scraped Gab data

type = 'gab-search'
category = 'Search'
title = 'Import scraped Gab data'
description = 'Import Gab data collected with an external tool such as Zeeschuimer.'
extension = 'ndjson'
is_from_zeeschuimer = True
fake = ''
accepts = [None]
def get_items(self, query):
27    def get_items(self, query):
28        """
29        Run custom search
30
31        Not available for Gab
32        """
33        raise NotImplementedError("Gab datasets can only be created by importing data from elsewhere")

Run custom search

Not available for Gab

@staticmethod
def map_item(post):
 35    @staticmethod
 36    def map_item(post):
 37        """
 38        Parse Gab post
 39
 40        :param node:  Data as received from Gab
 41        :return dict:  Mapped item
 42        """
 43        post_id = post.get("i", post["id"])
 44        metadata = post.get("__import_meta", {})
 45        timestamp_collected = datetime.datetime.fromtimestamp(metadata.get("timestamp_collected")/1000).strftime("%Y-%m-%d %H:%M:%S") if metadata.get("timestamp_collected") else MissingMappedField("Unknown")
 46        # reaction_type seems to just be nummeric keys; unsure which reactions they map to
 47        reactions =  post.get("rc", post.get("reactions_counts"))
 48        if type(reactions) != int:
 49            reaction_count = sum([reaction_value for reaction_type, reaction_value in post.get("rc", post.get("reactions_counts")).items()])
 50        else:
 51            reaction_count = reactions
 52
 53        # Other dictionaries are nested in the post dictionary
 54        group = post.get("g", post.get("group", {}))
 55        author = post.get("author_info", post.get("account", {}))
 56        mentions = post.get("m", post.get("mentions", []))
 57        tags = post.get("tg", post.get("tags", []))
 58        # card or link
 59        card = post.get("card", post.get("link", {}))
 60        # media or image_info
 61        media_items = post.get("image_info", post.get("media_attachments", []))
 62        image_urls = [media.get("u", media.get("url")) for media in media_items if media.get("t", media.get("type")) == "image"]
 63        video_urls = [media.get("smp4", media.get("source_mp4")) for media in media_items if media.get("t", media.get("type")) == "video"]
 64        if any([media_type not in ["image", "video"] for media_type in [media.get("t", media.get("type")) for media in media_items]]):
 65            # TODO: Use MappedItem message; currently it is not called...
 66            config.with_db()
 67            config.db.log.warning(f"Unknown media type in post {post_id}: {media_items}")
 68        if any([True for vid in video_urls if vid is None]) or any([True for img in image_urls if img is None]):
 69            config.with_db()
 70            config.db.log.warning(f"Missing media URL in post {post_id}: {media_items}")
 71            image_urls = [img for img in image_urls if img is not None]
 72            video_urls = [vid for vid in video_urls if vid is not None]
 73        
 74        post_time = datetime.datetime.strptime(post.get("ca", post.get("created_at")), "%Y-%m-%dT%H:%M:%S.%fZ")
 75        mapped_item = {
 76            "collected_at": timestamp_collected,
 77            "source_url": metadata.get("source_platform_url", MissingMappedField("Unknown")), # URL from which post was collected
 78            "id": post_id,
 79            "created_at": post_time.strftime("%Y-%m-%d %H:%M:%S"),
 80            "body": post.get("c") if "c" in post else post["content"],
 81            "url": post.get("ul") if "ul" in post else post["url"],
 82            "reaction_count": reaction_count,
 83            "favourites_count": post.get("fbc", post.get("favourites_count")),
 84            "replies_count": post.get("rc", post.get("replies_count")),
 85            "reblogs_count": post.get("rbc", post.get("reblogs_count")),
 86            "mentions": ",".join([mention["username"] for mention in mentions]),
 87            "tags": ",".join([tag["name"] for tag in tags]),	
 88
 89            "group_id": group["id"] if group else None,
 90            "group_title": group["title"] if group else None,
 91            "group_description": group["description"] if group else None,
 92            "group_member_count": group["member_count"] if group else None,
 93            "group_is_private": group["is_private"] if group else None,
 94            "group_url": group["url"] if group else None,
 95            "group_created_at": group.get("created_at") if group else None,
 96
 97            "account_id": author.get("i") if "i" in author else author["id"],
 98            "account_username": author.get("un") if "un" in author else author["username"],
 99            "account_account": author.get("ac") if "ac"in author else author["acct"],
100            "account_display_name": author.get("dn") if "dn" in author else author["display_name"],
101            "account_note": author.get("nt") if "nt" in author else author["note"],
102
103            "link_id": card["id"] if card else None,
104            "link_url": card["url"] if card else None,
105            "link_title": card["title"] if card else None,
106            "link_description": card["description"] if card else None,
107            "link_type": card["type"] if card else None,
108            "link_image": card["image"] if card else None,
109
110            "image_urls": ",".join(image_urls),
111            "video_urls": ",".join(video_urls),
112
113            "thread_id": post.get("i") if "i" in post else post["conversation_id"],
114            "timestamp": post_time.strftime("%Y-%m-%d %H:%M:%S")
115        }        
116    
117        return MappedItem(mapped_item)

Parse Gab post

Parameters
  • post: Data as received from Gab
Returns

Mapped item