Edit on GitHub

datasources.truth.search_truth

Import scraped Truth Social data

View Source

  1"""
  2Import scraped Truth Social data
  3"""
  4import datetime
  5
  6from backend.lib.search import Search
  7from common.lib.item_mapping import MappedItem
  8
  9
 10class SearchGab(Search):
 11    """
 12    Import scraped truth social data
 13    """
 14    type = "truthsocial-search"  # job ID
 15    category = "Search"  # category
 16    title = "Import scraped Truth Social data"  # title displayed in UI
 17    description = "Import Truth Social data collected with an external tool such as Zeeschuimer."  # description displayed in UI
 18    extension = "ndjson"  # extension of result file, used internally and in UI
 19    is_from_zeeschuimer = True
 20    fake = ""
 21
 22    # not available as a processor for existing datasets
 23    accepts = [None]
 24
 25    def get_items(self, query):
 26        """
 27        Run custom search
 28
 29        Not available for Truth Social
 30        """
 31        raise NotImplementedError("Truth Social datasets can only be created by importing data from elsewhere")
 32
 33    @staticmethod
 34    def map_item(post):
 35        """
 36        Parse Truth Social post
 37
 38        :param post:  Data as received from Truth Social
 39        :return dict:  Mapped item
 40        """
 41        errors = []
 42        post_time = datetime.datetime.strptime(post["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ")
 43        images = []
 44        videos = []
 45        video_thumbs = []
 46        if "media_attachments" in post:
 47            for media in post["media_attachments"]:
 48                mtype = media.get("type")
 49                if mtype == "image":
 50                    images.append(media.get("url"))
 51                elif mtype == "video":
 52                    videos.append(media.get("url"))
 53                    video_thumbs.append(media.get("preview_url"))
 54                elif mtype == "tv":
 55                    # Truth social has "TV channels" with videos
 56                    # These do not have direct links to media
 57                    # url is a thumbnail
 58                    video_thumbs.append(media.get("url"))
 59                    # preview_url is a smaller thumb
 60                else:
 61                    errors.append(f"New media type: {mtype}")
 62
 63        group = post.get("group") if post.get("group") else {}
 64        
 65        if post.get("quote_id", None):
 66            thread_id = post.get("quote_id")
 67        elif post.get("in_reply_to", None):
 68            reply_to = post.get("in_reply_to")
 69            while reply_to:
 70                if reply_to.get("in_reply_to", None):
 71                    reply_to = reply_to.get("in_reply_to")
 72                else:
 73                    thread_id = reply_to.get("id")
 74                    break
 75        else:
 76            thread_id = post.get("id")
 77        
 78        mentions = [mention.get("username") for mention in (post.get("mentions") if post.get("mentions") else [])]
 79        hashtags = [tag.get("name") for tag in (post.get("tags") if post.get("tags") else [])]
 80    
 81        mapped_item = {
 82            "id": post["id"],
 83            "created_at": post["created_at"],
 84            "body": post["content"],
 85            "url": post.get("url", None),
 86            "reblogs_count": post.get("reblogs_count", 0),
 87            "replies_count": post.get("replies_count", 0),
 88
 89            "account_id": post["account"]["id"],
 90            "account_username": post["account"]["username"],
 91            "account_display_name": post["account"]["display_name"],
 92            "account_avatar": post["account"]["avatar"],
 93            "account_verified": post["account"]["verified"],
 94            "account_followers": post["account"]["followers_count"],
 95            "account_following": post["account"]["following_count"],
 96            
 97            "mentions": ",".join(mentions),
 98            "hashtags": ",".join(hashtags),
 99
100            # media
101            "images": ",".join(images),
102            "video_thumbs": ",".join(video_thumbs),
103            "video_urls": ",".join(videos),
104            
105            # group
106            "group_id": group.get("id", None),
107            "group_display_name": group.get("display_name", None),
108            "group_avatar": group.get("avatar", None),
109            "group_note": group.get("note", None),
110            "group_members_count": group.get("members_count", 0),
111
112            "thread_id": thread_id,
113            "timestamp": post_time.strftime("%Y-%m-%d %H:%M:%S")
114        }        
115
116        return MappedItem(mapped_item, message="; ".join(errors))

class SearchGab(backend.lib.search.Search): View Source

 11class SearchGab(Search):
 12    """
 13    Import scraped truth social data
 14    """
 15    type = "truthsocial-search"  # job ID
 16    category = "Search"  # category
 17    title = "Import scraped Truth Social data"  # title displayed in UI
 18    description = "Import Truth Social data collected with an external tool such as Zeeschuimer."  # description displayed in UI
 19    extension = "ndjson"  # extension of result file, used internally and in UI
 20    is_from_zeeschuimer = True
 21    fake = ""
 22
 23    # not available as a processor for existing datasets
 24    accepts = [None]
 25
 26    def get_items(self, query):
 27        """
 28        Run custom search
 29
 30        Not available for Truth Social
 31        """
 32        raise NotImplementedError("Truth Social datasets can only be created by importing data from elsewhere")
 33
 34    @staticmethod
 35    def map_item(post):
 36        """
 37        Parse Truth Social post
 38
 39        :param post:  Data as received from Truth Social
 40        :return dict:  Mapped item
 41        """
 42        errors = []
 43        post_time = datetime.datetime.strptime(post["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ")
 44        images = []
 45        videos = []
 46        video_thumbs = []
 47        if "media_attachments" in post:
 48            for media in post["media_attachments"]:
 49                mtype = media.get("type")
 50                if mtype == "image":
 51                    images.append(media.get("url"))
 52                elif mtype == "video":
 53                    videos.append(media.get("url"))
 54                    video_thumbs.append(media.get("preview_url"))
 55                elif mtype == "tv":
 56                    # Truth social has "TV channels" with videos
 57                    # These do not have direct links to media
 58                    # url is a thumbnail
 59                    video_thumbs.append(media.get("url"))
 60                    # preview_url is a smaller thumb
 61                else:
 62                    errors.append(f"New media type: {mtype}")
 63
 64        group = post.get("group") if post.get("group") else {}
 65        
 66        if post.get("quote_id", None):
 67            thread_id = post.get("quote_id")
 68        elif post.get("in_reply_to", None):
 69            reply_to = post.get("in_reply_to")
 70            while reply_to:
 71                if reply_to.get("in_reply_to", None):
 72                    reply_to = reply_to.get("in_reply_to")
 73                else:
 74                    thread_id = reply_to.get("id")
 75                    break
 76        else:
 77            thread_id = post.get("id")
 78        
 79        mentions = [mention.get("username") for mention in (post.get("mentions") if post.get("mentions") else [])]
 80        hashtags = [tag.get("name") for tag in (post.get("tags") if post.get("tags") else [])]
 81    
 82        mapped_item = {
 83            "id": post["id"],
 84            "created_at": post["created_at"],
 85            "body": post["content"],
 86            "url": post.get("url", None),
 87            "reblogs_count": post.get("reblogs_count", 0),
 88            "replies_count": post.get("replies_count", 0),
 89
 90            "account_id": post["account"]["id"],
 91            "account_username": post["account"]["username"],
 92            "account_display_name": post["account"]["display_name"],
 93            "account_avatar": post["account"]["avatar"],
 94            "account_verified": post["account"]["verified"],
 95            "account_followers": post["account"]["followers_count"],
 96            "account_following": post["account"]["following_count"],
 97            
 98            "mentions": ",".join(mentions),
 99            "hashtags": ",".join(hashtags),
100
101            # media
102            "images": ",".join(images),
103            "video_thumbs": ",".join(video_thumbs),
104            "video_urls": ",".join(videos),
105            
106            # group
107            "group_id": group.get("id", None),
108            "group_display_name": group.get("display_name", None),
109            "group_avatar": group.get("avatar", None),
110            "group_note": group.get("note", None),
111            "group_members_count": group.get("members_count", 0),
112
113            "thread_id": thread_id,
114            "timestamp": post_time.strftime("%Y-%m-%d %H:%M:%S")
115        }        
116
117        return MappedItem(mapped_item, message="; ".join(errors))

Import scraped Truth Social data

type = 'truthsocial-search'

category = 'Search'

title = 'Import scraped Truth Social data'

description = 'Import Truth Social data collected with an external tool such as Zeeschuimer.'

extension = 'ndjson'

is_from_zeeschuimer = True

fake = ''

accepts = [None]

def get_items(self, query): View Source

26    def get_items(self, query):
27        """
28        Run custom search
29
30        Not available for Truth Social
31        """
32        raise NotImplementedError("Truth Social datasets can only be created by importing data from elsewhere")

Run custom search

Not available for Truth Social

@staticmethod

def map_item(post): View Source

 34    @staticmethod
 35    def map_item(post):
 36        """
 37        Parse Truth Social post
 38
 39        :param post:  Data as received from Truth Social
 40        :return dict:  Mapped item
 41        """
 42        errors = []
 43        post_time = datetime.datetime.strptime(post["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ")
 44        images = []
 45        videos = []
 46        video_thumbs = []
 47        if "media_attachments" in post:
 48            for media in post["media_attachments"]:
 49                mtype = media.get("type")
 50                if mtype == "image":
 51                    images.append(media.get("url"))
 52                elif mtype == "video":
 53                    videos.append(media.get("url"))
 54                    video_thumbs.append(media.get("preview_url"))
 55                elif mtype == "tv":
 56                    # Truth social has "TV channels" with videos
 57                    # These do not have direct links to media
 58                    # url is a thumbnail
 59                    video_thumbs.append(media.get("url"))
 60                    # preview_url is a smaller thumb
 61                else:
 62                    errors.append(f"New media type: {mtype}")
 63
 64        group = post.get("group") if post.get("group") else {}
 65        
 66        if post.get("quote_id", None):
 67            thread_id = post.get("quote_id")
 68        elif post.get("in_reply_to", None):
 69            reply_to = post.get("in_reply_to")
 70            while reply_to:
 71                if reply_to.get("in_reply_to", None):
 72                    reply_to = reply_to.get("in_reply_to")
 73                else:
 74                    thread_id = reply_to.get("id")
 75                    break
 76        else:
 77            thread_id = post.get("id")
 78        
 79        mentions = [mention.get("username") for mention in (post.get("mentions") if post.get("mentions") else [])]
 80        hashtags = [tag.get("name") for tag in (post.get("tags") if post.get("tags") else [])]
 81    
 82        mapped_item = {
 83            "id": post["id"],
 84            "created_at": post["created_at"],
 85            "body": post["content"],
 86            "url": post.get("url", None),
 87            "reblogs_count": post.get("reblogs_count", 0),
 88            "replies_count": post.get("replies_count", 0),
 89
 90            "account_id": post["account"]["id"],
 91            "account_username": post["account"]["username"],
 92            "account_display_name": post["account"]["display_name"],
 93            "account_avatar": post["account"]["avatar"],
 94            "account_verified": post["account"]["verified"],
 95            "account_followers": post["account"]["followers_count"],
 96            "account_following": post["account"]["following_count"],
 97            
 98            "mentions": ",".join(mentions),
 99            "hashtags": ",".join(hashtags),
100
101            # media
102            "images": ",".join(images),
103            "video_thumbs": ",".join(video_thumbs),
104            "video_urls": ",".join(videos),
105            
106            # group
107            "group_id": group.get("id", None),
108            "group_display_name": group.get("display_name", None),
109            "group_avatar": group.get("avatar", None),
110            "group_note": group.get("note", None),
111            "group_members_count": group.get("members_count", 0),
112
113            "thread_id": thread_id,
114            "timestamp": post_time.strftime("%Y-%m-%d %H:%M:%S")
115        }        
116
117        return MappedItem(mapped_item, message="; ".join(errors))

Parse Truth Social post

Parameters

post: Data as received from Truth Social

Returns

Mapped item

Inherited Members

backend.lib.worker.BasicWorker: BasicWorker; INTERRUPT_NONE; INTERRUPT_RETRY; INTERRUPT_CANCEL; queue; log; manager; interrupted; modules; init_time; name; run; clean_up; request_interrupt; is_4cat_class
backend.lib.search.Search: max_workers; prefix; return_cols; import_error_count; import_warning_count; process; search; import_from_file; items_to_csv; items_to_ndjson; items_to_archive
backend.lib.processor.BasicProcessor: db; job; dataset; owner; source_dataset; source_file; config; is_running_in_preset; is_hidden; filepath; work; after_process; remove_files; abort; iterate_proxied_requests; push_proxied_request; flush_proxied_requests; iterate_archive_contents; unpack_archive_contents; extract_archived_file_by_name; write_csv_items_and_finish; write_archive_and_finish; create_standalone; save_annotations; map_item_method_available; get_mapped_item; is_filter; get_options; get_status; is_top_dataset; is_from_collector; get_extension; is_rankable; exclude_followup_processors; is_4cat_processor