datasources.threads.search_threads
Import scraped Threads data
It's prohibitively difficult to scrape data from Threads within 4CAT itself due to its aggressive rate limiting. Instead, import data collected elsewhere.
"""
Import scraped Threads data

It's prohibitively difficult to scrape data from Threads within 4CAT itself due
to its aggressive rate limiting. Instead, import data collected elsewhere.
"""
from datetime import datetime
from urllib.parse import urlparse, parse_qs, unquote
import re

from backend.lib.search import Search
from common.lib.item_mapping import MappedItem


class SearchThreads(Search):
    """
    Import scraped Threads data
    """
    type = "threads-search"  # job ID
    category = "Search"  # category
    title = "Import scraped Threads data"  # title displayed in UI
    description = "Import Threads data collected with an external tool such as Zeeschuimer."  # description displayed in UI
    extension = "ndjson"  # extension of result file, used internally and in UI
    is_from_zeeschuimer = True

    # not available as a processor for existing datasets
    accepts = [None]
    # NOTE(review): the worksheet link below is TikTok-specific - presumably
    # copied from the TikTok datasource; confirm whether a Threads worksheet
    # exists
    references = [
        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)"
    ]

    def get_items(self, query):
        """
        Run custom search

        Not available for Threads; datasets can only be created by importing
        data collected elsewhere (e.g. with Zeeschuimer).

        :param query:  Query parameters (unused)
        :raises NotImplementedError:  Always
        """
        raise NotImplementedError("Threads datasets can only be created by importing data from elsewhere")

    @staticmethod
    def map_item(post):
        """
        Map an imported Threads post to a flat 4CAT item

        :param dict post:  Raw post object as captured by Zeeschuimer
        :return MappedItem:  Flattened post data
        """
        post_timestamp = datetime.fromtimestamp(post["taken_at"])

        if post["carousel_media"]:
            # carousel post: take the first media candidate per slide; index
            # instead of pop(0) so the raw imported item is not mutated
            image_urls = [c["image_versions2"]["candidates"][0]["url"] for c in post["carousel_media"] if c["image_versions2"]]
            video_urls = [c["video_versions"][0]["url"] for c in post["carousel_media"] if c["video_versions"]]
        else:
            image_urls = [post["image_versions2"]["candidates"][0]["url"]] if post["image_versions2"].get("candidates") else []
            video_urls = [post["video_versions"][0]["url"]] if post["video_versions"] else []

        linked_url = ""
        link_thumbnail = ""
        if post["text_post_app_info"].get("link_preview_attachment"):
            # outgoing links go through a redirect shim; the target URL is in
            # the "u" query parameter. Fall back to the shim URL itself if the
            # parameter is missing (the old `.get("u", "").pop()` crashed with
            # AttributeError in that case, since str has no pop())
            linked_url = post["text_post_app_info"]["link_preview_attachment"]["url"]
            shim_params = parse_qs(urlparse(linked_url).query)
            if shim_params.get("u"):
                linked_url = shim_params["u"][0]
            link_thumbnail = post["text_post_app_info"]["link_preview_attachment"].get("image_url")

        return MappedItem({
            "id": post["code"],
            "url": f"https://www.threads.net/@{post['user']['username']}/post/{post['code']}",
            "body": post["caption"]["text"] if post["caption"] else "",
            "timestamp": post_timestamp.strftime("%Y-%m-%d %H:%M:%S"),
            "author": post["user"]["username"],
            "author_is_verified": "yes" if post["user"].get("is_verified") else "no",
            "author_avatar": post["user"].get("profile_pic_url"),
            "image_url": ",".join(image_urls),
            "video_url": ",".join(video_urls),
            "link_url": linked_url,
            "link_thumbnail_url": link_thumbnail if link_thumbnail else "",
            "is_paid_partnership": "yes" if post["is_paid_partnership"] else "no",
            "likes": post["like_count"],
            "reposts": post["text_post_app_info"]["repost_count"],
            "replies": post["text_post_app_info"]["direct_reply_count"],
            "quotes": post["text_post_app_info"]["quote_count"],
            "hashtags": ",".join(re.findall(r"#([^\s!@#$%ˆ&*()_+{}:\"|<>?\[\];'\,./`~']+)", post["caption"]["text"])) if post["caption"] else "",
            "unix_timestamp": int(post_timestamp.timestamp()),
        })
class SearchThreads(Search):
    """
    Import scraped Threads data
    """
    type = "threads-search"  # job ID
    category = "Search"  # category
    title = "Import scraped Threads data"  # title displayed in UI
    description = "Import Threads data collected with an external tool such as Zeeschuimer."  # description displayed in UI
    extension = "ndjson"  # extension of result file, used internally and in UI
    is_from_zeeschuimer = True

    # not available as a processor for existing datasets
    accepts = [None]
    # NOTE(review): the worksheet link below is TikTok-specific - presumably
    # copied from the TikTok datasource; confirm whether a Threads worksheet
    # exists
    references = [
        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)"
    ]

    def get_items(self, query):
        """
        Run custom search

        Not available for Threads; datasets can only be created by importing
        data collected elsewhere (e.g. with Zeeschuimer).

        :param query:  Query parameters (unused)
        :raises NotImplementedError:  Always
        """
        raise NotImplementedError("Threads datasets can only be created by importing data from elsewhere")

    @staticmethod
    def map_item(post):
        """
        Map an imported Threads post to a flat 4CAT item

        :param dict post:  Raw post object as captured by Zeeschuimer
        :return MappedItem:  Flattened post data
        """
        post_timestamp = datetime.fromtimestamp(post["taken_at"])

        if post["carousel_media"]:
            # carousel post: take the first media candidate per slide; index
            # instead of pop(0) so the raw imported item is not mutated
            image_urls = [c["image_versions2"]["candidates"][0]["url"] for c in post["carousel_media"] if c["image_versions2"]]
            video_urls = [c["video_versions"][0]["url"] for c in post["carousel_media"] if c["video_versions"]]
        else:
            image_urls = [post["image_versions2"]["candidates"][0]["url"]] if post["image_versions2"].get("candidates") else []
            video_urls = [post["video_versions"][0]["url"]] if post["video_versions"] else []

        linked_url = ""
        link_thumbnail = ""
        if post["text_post_app_info"].get("link_preview_attachment"):
            # outgoing links go through a redirect shim; the target URL is in
            # the "u" query parameter. Fall back to the shim URL itself if the
            # parameter is missing (the old `.get("u", "").pop()` crashed with
            # AttributeError in that case, since str has no pop())
            linked_url = post["text_post_app_info"]["link_preview_attachment"]["url"]
            shim_params = parse_qs(urlparse(linked_url).query)
            if shim_params.get("u"):
                linked_url = shim_params["u"][0]
            link_thumbnail = post["text_post_app_info"]["link_preview_attachment"].get("image_url")

        return MappedItem({
            "id": post["code"],
            "url": f"https://www.threads.net/@{post['user']['username']}/post/{post['code']}",
            "body": post["caption"]["text"] if post["caption"] else "",
            "timestamp": post_timestamp.strftime("%Y-%m-%d %H:%M:%S"),
            "author": post["user"]["username"],
            "author_is_verified": "yes" if post["user"].get("is_verified") else "no",
            "author_avatar": post["user"].get("profile_pic_url"),
            "image_url": ",".join(image_urls),
            "video_url": ",".join(video_urls),
            "link_url": linked_url,
            "link_thumbnail_url": link_thumbnail if link_thumbnail else "",
            "is_paid_partnership": "yes" if post["is_paid_partnership"] else "no",
            "likes": post["like_count"],
            "reposts": post["text_post_app_info"]["repost_count"],
            "replies": post["text_post_app_info"]["direct_reply_count"],
            "quotes": post["text_post_app_info"]["quote_count"],
            "hashtags": ",".join(re.findall(r"#([^\s!@#$%ˆ&*()_+{}:\"|<>?\[\];'\,./`~']+)", post["caption"]["text"])) if post["caption"] else "",
            "unix_timestamp": int(post_timestamp.timestamp()),
        })
Import scraped Threads data
references =
['[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)', '[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)']
def
get_items(self, query):
34 def get_items(self, query): 35 """ 36 Run custom search 37 38 Not available for 9gag 39 """ 40 raise NotImplementedError("Threads datasets can only be created by importing data from elsewhere")
Run custom search
Not available for Threads
@staticmethod
def
map_item(post):
@staticmethod
def map_item(post):
    """
    Map an imported Threads post to a flat 4CAT item

    :param dict post:  Raw post object as captured by Zeeschuimer
    :return MappedItem:  Flattened post data
    """
    post_timestamp = datetime.fromtimestamp(post["taken_at"])

    if post["carousel_media"]:
        # carousel post: take the first media candidate per slide; index
        # instead of pop(0) so the raw imported item is not mutated
        image_urls = [c["image_versions2"]["candidates"][0]["url"] for c in post["carousel_media"] if c["image_versions2"]]
        video_urls = [c["video_versions"][0]["url"] for c in post["carousel_media"] if c["video_versions"]]
    else:
        image_urls = [post["image_versions2"]["candidates"][0]["url"]] if post["image_versions2"].get("candidates") else []
        video_urls = [post["video_versions"][0]["url"]] if post["video_versions"] else []

    linked_url = ""
    link_thumbnail = ""
    if post["text_post_app_info"].get("link_preview_attachment"):
        # outgoing links go through a redirect shim; the target URL is in
        # the "u" query parameter. Fall back to the shim URL itself if the
        # parameter is missing (the old `.get("u", "").pop()` crashed with
        # AttributeError in that case, since str has no pop())
        linked_url = post["text_post_app_info"]["link_preview_attachment"]["url"]
        shim_params = parse_qs(urlparse(linked_url).query)
        if shim_params.get("u"):
            linked_url = shim_params["u"][0]
        link_thumbnail = post["text_post_app_info"]["link_preview_attachment"].get("image_url")

    return MappedItem({
        "id": post["code"],
        "url": f"https://www.threads.net/@{post['user']['username']}/post/{post['code']}",
        "body": post["caption"]["text"] if post["caption"] else "",
        "timestamp": post_timestamp.strftime("%Y-%m-%d %H:%M:%S"),
        "author": post["user"]["username"],
        "author_is_verified": "yes" if post["user"].get("is_verified") else "no",
        "author_avatar": post["user"].get("profile_pic_url"),
        "image_url": ",".join(image_urls),
        "video_url": ",".join(video_urls),
        "link_url": linked_url,
        "link_thumbnail_url": link_thumbnail if link_thumbnail else "",
        "is_paid_partnership": "yes" if post["is_paid_partnership"] else "no",
        "likes": post["like_count"],
        "reposts": post["text_post_app_info"]["repost_count"],
        "replies": post["text_post_app_info"]["direct_reply_count"],
        "quotes": post["text_post_app_info"]["quote_count"],
        "hashtags": ",".join(re.findall(r"#([^\s!@#$%ˆ&*()_+{}:\"|<>?\[\];'\,./`~']+)", post["caption"]["text"])) if post["caption"] else "",
        "unix_timestamp": int(post_timestamp.timestamp()),
    })
Inherited Members
- backend.lib.worker.BasicWorker
- BasicWorker
- INTERRUPT_NONE
- INTERRUPT_RETRY
- INTERRUPT_CANCEL
- queue
- log
- manager
- interrupted
- modules
- init_time
- name
- run
- clean_up
- request_interrupt
- is_4cat_class
- backend.lib.search.Search
- max_workers
- prefix
- return_cols
- import_error_count
- import_warning_count
- process
- search
- import_from_file
- items_to_csv
- items_to_ndjson
- items_to_archive
- backend.lib.processor.BasicProcessor
- db
- job
- dataset
- owner
- source_dataset
- source_file
- config
- is_running_in_preset
- filepath
- work
- after_process
- remove_files
- abort
- add_field_to_parent
- iterate_archive_contents
- unpack_archive_contents
- extract_archived_file_by_name
- write_csv_items_and_finish
- write_archive_and_finish
- create_standalone
- map_item_method_available
- get_mapped_item
- is_filter
- get_options
- get_status
- is_top_dataset
- is_from_collector
- get_extension
- is_rankable
- exclude_followup_processors
- is_4cat_processor