Edit on GitHub

datasources.threads.search_threads

Import scraped Threads data

It's prohibitively difficult to scrape data from Threads within 4CAT itself due to its aggressive rate limiting. Instead, import data collected elsewhere.

 1"""
 2Import scraped Threads data
 3
 4It's prohibitively difficult to scrape data from Threads within 4CAT itself due
 5to its aggressive rate limiting. Instead, import data collected elsewhere.
 6"""
 7from datetime import datetime
 8from urllib.parse import urlparse, parse_qs, unquote
 9import re
10
11from backend.lib.search import Search
12from common.lib.item_mapping import MappedItem
13
14
class SearchThreads(Search):
    """
    Import scraped Threads data

    This data source does not collect anything itself; it only accepts data
    captured with an external tool and maps each captured post to a flat item.
    """
    type = "threads-search"  # job ID
    category = "Search"  # category
    title = "Import scraped Threads data"  # title displayed in UI
    description = "Import Threads data collected with an external tool such as Zeeschuimer."  # description displayed in UI
    extension = "ndjson"  # extension of result file, used internally and in UI
    is_from_zeeschuimer = True

    # not available as a processor for existing datasets
    accepts = [None]
    references = [
        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)"
    ]

    def get_items(self, query):
        """
        Run custom search

        Not available for Threads; data can only be imported.

        :param query:  Query parameters (unused)
        :raises NotImplementedError:  Always
        """
        raise NotImplementedError("Threads datasets can only be created by importing data from elsewhere")

    @staticmethod
    def map_item(post):
        """
        Map a captured Threads post to a flat item

        The `post` object is only read, never modified, so mapping the same
        post repeatedly always yields the same result.

        :param dict post:  Post object as captured from Threads
        :return MappedItem:  Item with one value per output column
        """
        # NOTE(review): fromtimestamp() yields naive local time, so the
        # formatted timestamp depends on the server's time zone - confirm
        # this is intended
        post_timestamp = datetime.fromtimestamp(post["taken_at"])

        # a carousel post has one media object per slide; any other post
        # carries its media on the post object itself
        media_items = post.get("carousel_media") or [post]

        image_urls = []
        video_urls = []
        for media in media_items:
            # take the first listed version by index rather than pop(), which
            # would remove it from the caller's data
            candidates = (media.get("image_versions2") or {}).get("candidates")
            if candidates:
                image_urls.append(candidates[0]["url"])

            videos = media.get("video_versions")
            if videos:
                video_urls.append(videos[0]["url"])

        linked_url = ""
        link_thumbnail = ""
        link_preview = post["text_post_app_info"].get("link_preview_attachment")
        if link_preview:
            linked_url = link_preview["url"]
            # the link target is read from the `u` query parameter of the
            # preview URL; if there is no such parameter, keep the URL as is
            # instead of crashing
            targets = parse_qs(urlparse(linked_url).query).get("u")
            if targets:
                linked_url = targets[-1]
            link_thumbnail = link_preview.get("image_url")

        caption = post["caption"]["text"] if post["caption"] else ""

        return MappedItem({
            "id": post["code"],
            "url": f"https://www.threads.net/@{post['user']['username']}/post/{post['code']}",
            "body": caption,
            "timestamp": post_timestamp.strftime("%Y-%m-%d %H:%M:%S"),
            "author": post["user"]["username"],
            "author_is_verified": "yes" if post["user"].get("is_verified") else "no",
            "author_avatar": post["user"].get("profile_pic_url"),
            "image_url": ",".join(image_urls),
            "video_url": ",".join(video_urls),
            "link_url": linked_url,
            "link_thumbnail_url": link_thumbnail if link_thumbnail else "",
            "is_paid_partnership": "yes" if post["is_paid_partnership"] else "no",
            "likes": post["like_count"],
            "reposts": post["text_post_app_info"]["repost_count"],
            "replies": post["text_post_app_info"]["direct_reply_count"],
            "quotes": post["text_post_app_info"]["quote_count"],
            "hashtags": ",".join(re.findall(r"#([^\s!@#$%ˆ&*()_+{}:\"|<>?\[\];'\,./`~']+)", caption)),
            "unix_timestamp": int(post_timestamp.timestamp()),
        })
class SearchThreads(backend.lib.search.Search):
class SearchThreads(Search):
    """
    Import scraped Threads data

    This data source does not collect anything itself; it only accepts data
    captured with an external tool and maps each captured post to a flat item.
    """
    type = "threads-search"  # job ID
    category = "Search"  # category
    title = "Import scraped Threads data"  # title displayed in UI
    description = "Import Threads data collected with an external tool such as Zeeschuimer."  # description displayed in UI
    extension = "ndjson"  # extension of result file, used internally and in UI
    is_from_zeeschuimer = True

    # not available as a processor for existing datasets
    accepts = [None]
    references = [
        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)"
    ]

    def get_items(self, query):
        """
        Run custom search

        Not available for Threads; data can only be imported.

        :param query:  Query parameters (unused)
        :raises NotImplementedError:  Always
        """
        raise NotImplementedError("Threads datasets can only be created by importing data from elsewhere")

    @staticmethod
    def map_item(post):
        """
        Map a captured Threads post to a flat item

        The `post` object is only read, never modified, so mapping the same
        post repeatedly always yields the same result.

        :param dict post:  Post object as captured from Threads
        :return MappedItem:  Item with one value per output column
        """
        # NOTE(review): fromtimestamp() yields naive local time, so the
        # formatted timestamp depends on the server's time zone - confirm
        # this is intended
        post_timestamp = datetime.fromtimestamp(post["taken_at"])

        # a carousel post has one media object per slide; any other post
        # carries its media on the post object itself
        media_items = post.get("carousel_media") or [post]

        image_urls = []
        video_urls = []
        for media in media_items:
            # take the first listed version by index rather than pop(), which
            # would remove it from the caller's data
            candidates = (media.get("image_versions2") or {}).get("candidates")
            if candidates:
                image_urls.append(candidates[0]["url"])

            videos = media.get("video_versions")
            if videos:
                video_urls.append(videos[0]["url"])

        linked_url = ""
        link_thumbnail = ""
        link_preview = post["text_post_app_info"].get("link_preview_attachment")
        if link_preview:
            linked_url = link_preview["url"]
            # the link target is read from the `u` query parameter of the
            # preview URL; if there is no such parameter, keep the URL as is
            # instead of crashing
            targets = parse_qs(urlparse(linked_url).query).get("u")
            if targets:
                linked_url = targets[-1]
            link_thumbnail = link_preview.get("image_url")

        caption = post["caption"]["text"] if post["caption"] else ""

        return MappedItem({
            "id": post["code"],
            "url": f"https://www.threads.net/@{post['user']['username']}/post/{post['code']}",
            "body": caption,
            "timestamp": post_timestamp.strftime("%Y-%m-%d %H:%M:%S"),
            "author": post["user"]["username"],
            "author_is_verified": "yes" if post["user"].get("is_verified") else "no",
            "author_avatar": post["user"].get("profile_pic_url"),
            "image_url": ",".join(image_urls),
            "video_url": ",".join(video_urls),
            "link_url": linked_url,
            "link_thumbnail_url": link_thumbnail if link_thumbnail else "",
            "is_paid_partnership": "yes" if post["is_paid_partnership"] else "no",
            "likes": post["like_count"],
            "reposts": post["text_post_app_info"]["repost_count"],
            "replies": post["text_post_app_info"]["direct_reply_count"],
            "quotes": post["text_post_app_info"]["quote_count"],
            "hashtags": ",".join(re.findall(r"#([^\s!@#$%ˆ&*()_+{}:\"|<>?\[\];'\,./`~']+)", caption)),
            "unix_timestamp": int(post_timestamp.timestamp()),
        })

Import scraped Threads data

type = 'threads-search'
category = 'Search'
title = 'Import scraped Threads data'
description = 'Import Threads data collected with an external tool such as Zeeschuimer.'
extension = 'ndjson'
is_from_zeeschuimer = True
accepts = [None]
references = ['[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)', '[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)']
def get_items(self, query):
34    def get_items(self, query):
35        """
36        Run custom search
37
38        Not available for 9gag
39        """
40        raise NotImplementedError("Threads datasets can only be created by importing data from elsewhere")

Run custom search

Not available for Threads

@staticmethod
def map_item(post):
42    @staticmethod
43    def map_item(post):
44        post_timestamp = datetime.fromtimestamp(post["taken_at"])
45
46        if post["carousel_media"]:
47            image_urls = [c["image_versions2"]["candidates"].pop(0)["url"] for c in post["carousel_media"] if c["image_versions2"]]
48            video_urls = [c["video_versions"].pop(0)["url"] for c in post["carousel_media"] if c["video_versions"]]
49        else:
50            image_urls = [post["image_versions2"]["candidates"].pop(0)["url"]] if post["image_versions2"].get("candidates") else []
51            video_urls = [post["video_versions"].pop(0)["url"]] if post["video_versions"] else []
52
53        linked_url = ""
54        link_thumbnail = ""
55        if post["text_post_app_info"].get("link_preview_attachment"):
56            linked_url = post["text_post_app_info"]["link_preview_attachment"]["url"]
57            linked_url = parse_qs(urlparse(linked_url).query).get("u", "").pop()
58            link_thumbnail = post["text_post_app_info"]["link_preview_attachment"].get("image_url")
59
60        return MappedItem({
61            "id": post["code"],
62            "url": f"https://www.threads.net/@{post['user']['username']}/post/{post['code']}",
63            "body": post["caption"]["text"] if post["caption"] else "",
64            "timestamp": post_timestamp.strftime("%Y-%m-%d %H:%M:%S"),
65            "author": post["user"]["username"],
66            "author_is_verified": "yes" if post["user"].get("is_verified") else "no",
67            "author_avatar": post["user"].get("profile_pic_url"),
68            "image_url": ",".join(image_urls),
69            "video_url": ",".join(video_urls),
70            "link_url": linked_url,
71            "link_thumbnail_url": link_thumbnail if link_thumbnail else "",
72            "is_paid_partnership": "yes" if post["is_paid_partnership"] else "no",
73            "likes": post["like_count"],
74            "reposts": post["text_post_app_info"]["repost_count"],
75            "replies": post["text_post_app_info"]["direct_reply_count"],
76            "quotes": post["text_post_app_info"]["quote_count"],
77            "hashtags": ",".join(re.findall(r"#([^\s!@#$%ˆ&*()_+{}:\"|<>?\[\];'\,./`~']+)", post["caption"]["text"])) if post["caption"] else "",
78            "unix_timestamp": int(post_timestamp.timestamp()),
79        })