datasources.threads.search_threads
Import scraped Threads data
It's prohibitively difficult to scrape data from Threads within 4CAT itself due to its aggressive rate limiting. Instead, import data collected elsewhere.
"""
Import scraped Threads data

It's prohibitively difficult to scrape data from Threads within 4CAT itself due
to its aggressive rate limiting. Instead, import data collected elsewhere.
"""
from datetime import datetime
from urllib.parse import urlparse, parse_qs, unquote
import re

from backend.lib.search import Search
from common.lib.item_mapping import MappedItem


class SearchThreads(Search):
    """
    Import scraped Threads data
    """
    type = "threads-search"  # job ID
    category = "Search"  # category
    title = "Import scraped Threads data"  # title displayed in UI
    description = "Import Threads data collected with an external tool such as Zeeschuimer."  # description displayed in UI
    extension = "ndjson"  # extension of result file, used internally and in UI
    is_from_zeeschuimer = True

    # not available as a processor for existing datasets
    accepts = [None]
    # NOTE(review): the worksheet link below is TikTok-specific - presumably
    # copied from the TikTok datasource; confirm whether a Threads worksheet
    # exists
    references = [
        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)"
    ]

    def get_items(self, query):
        """
        Run custom search

        Not available for Threads; datasets can only be created by importing
        data collected elsewhere (e.g. with Zeeschuimer).

        :param query:  Query parameters (unused)
        :raises NotImplementedError:  Always
        """
        raise NotImplementedError("Threads datasets can only be created by importing data from elsewhere")

    @staticmethod
    def map_item(post):
        """
        Map an imported Threads post to a flat 4CAT item

        :param dict post:  Raw post object as captured by Zeeschuimer
        :return MappedItem:  Flattened post data
        """
        post_timestamp = datetime.fromtimestamp(post["taken_at"])

        if post["carousel_media"]:
            # carousel post: take the first media candidate per slide; index
            # instead of pop(0) so the raw imported item is not mutated
            image_urls = [c["image_versions2"]["candidates"][0]["url"] for c in post["carousel_media"] if c["image_versions2"]]
            video_urls = [c["video_versions"][0]["url"] for c in post["carousel_media"] if c["video_versions"]]
        else:
            image_urls = [post["image_versions2"]["candidates"][0]["url"]] if post["image_versions2"].get("candidates") else []
            video_urls = [post["video_versions"][0]["url"]] if post["video_versions"] else []

        linked_url = ""
        link_thumbnail = ""
        if post["text_post_app_info"].get("link_preview_attachment"):
            # outgoing links go through a redirect shim; the target URL is in
            # the "u" query parameter. Fall back to the shim URL itself if the
            # parameter is missing (the old `.get("u", "").pop()` crashed with
            # AttributeError in that case, since str has no pop())
            linked_url = post["text_post_app_info"]["link_preview_attachment"]["url"]
            shim_params = parse_qs(urlparse(linked_url).query)
            if shim_params.get("u"):
                linked_url = shim_params["u"][0]
            link_thumbnail = post["text_post_app_info"]["link_preview_attachment"].get("image_url")

        return MappedItem({
            "id": post["code"],
            "url": f"https://www.threads.net/@{post['user']['username']}/post/{post['code']}",
            "body": post["caption"]["text"] if post["caption"] else "",
            "timestamp": post_timestamp.strftime("%Y-%m-%d %H:%M:%S"),
            "author": post["user"]["username"],
            "author_is_verified": "yes" if post["user"].get("is_verified") else "no",
            "author_avatar": post["user"].get("profile_pic_url"),
            "image_url": ",".join(image_urls),
            "video_url": ",".join(video_urls),
            "link_url": linked_url,
            "link_thumbnail_url": link_thumbnail if link_thumbnail else "",
            "is_paid_partnership": "yes" if post["is_paid_partnership"] else "no",
            "likes": post["like_count"],
            "reposts": post["text_post_app_info"]["repost_count"],
            "replies": post["text_post_app_info"]["direct_reply_count"],
            "quotes": post["text_post_app_info"]["quote_count"],
            "hashtags": ",".join(re.findall(r"#([^\s!@#$%ˆ&*()_+{}:\"|<>?\[\];'\,./`~']+)", post["caption"]["text"])) if post["caption"] else "",
            "unix_timestamp": int(post_timestamp.timestamp()),
        })
class SearchThreads(Search):
    """
    Import scraped Threads data
    """
    type = "threads-search"  # job ID
    category = "Search"  # category
    title = "Import scraped Threads data"  # title displayed in UI
    description = "Import Threads data collected with an external tool such as Zeeschuimer."  # description displayed in UI
    extension = "ndjson"  # extension of result file, used internally and in UI
    is_from_zeeschuimer = True

    # not available as a processor for existing datasets
    accepts = [None]
    # NOTE(review): the worksheet link below is TikTok-specific - presumably
    # copied from the TikTok datasource; confirm whether a Threads worksheet
    # exists
    references = [
        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)"
    ]

    def get_items(self, query):
        """
        Run custom search

        Not available for Threads; datasets can only be created by importing
        data collected elsewhere (e.g. with Zeeschuimer).

        :param query:  Query parameters (unused)
        :raises NotImplementedError:  Always
        """
        raise NotImplementedError("Threads datasets can only be created by importing data from elsewhere")

    @staticmethod
    def map_item(post):
        """
        Map an imported Threads post to a flat 4CAT item

        :param dict post:  Raw post object as captured by Zeeschuimer
        :return MappedItem:  Flattened post data
        """
        post_timestamp = datetime.fromtimestamp(post["taken_at"])

        if post["carousel_media"]:
            # carousel post: take the first media candidate per slide; index
            # instead of pop(0) so the raw imported item is not mutated
            image_urls = [c["image_versions2"]["candidates"][0]["url"] for c in post["carousel_media"] if c["image_versions2"]]
            video_urls = [c["video_versions"][0]["url"] for c in post["carousel_media"] if c["video_versions"]]
        else:
            image_urls = [post["image_versions2"]["candidates"][0]["url"]] if post["image_versions2"].get("candidates") else []
            video_urls = [post["video_versions"][0]["url"]] if post["video_versions"] else []

        linked_url = ""
        link_thumbnail = ""
        if post["text_post_app_info"].get("link_preview_attachment"):
            # outgoing links go through a redirect shim; the target URL is in
            # the "u" query parameter. Fall back to the shim URL itself if the
            # parameter is missing (the old `.get("u", "").pop()` crashed with
            # AttributeError in that case, since str has no pop())
            linked_url = post["text_post_app_info"]["link_preview_attachment"]["url"]
            shim_params = parse_qs(urlparse(linked_url).query)
            if shim_params.get("u"):
                linked_url = shim_params["u"][0]
            link_thumbnail = post["text_post_app_info"]["link_preview_attachment"].get("image_url")

        return MappedItem({
            "id": post["code"],
            "url": f"https://www.threads.net/@{post['user']['username']}/post/{post['code']}",
            "body": post["caption"]["text"] if post["caption"] else "",
            "timestamp": post_timestamp.strftime("%Y-%m-%d %H:%M:%S"),
            "author": post["user"]["username"],
            "author_is_verified": "yes" if post["user"].get("is_verified") else "no",
            "author_avatar": post["user"].get("profile_pic_url"),
            "image_url": ",".join(image_urls),
            "video_url": ",".join(video_urls),
            "link_url": linked_url,
            "link_thumbnail_url": link_thumbnail if link_thumbnail else "",
            "is_paid_partnership": "yes" if post["is_paid_partnership"] else "no",
            "likes": post["like_count"],
            "reposts": post["text_post_app_info"]["repost_count"],
            "replies": post["text_post_app_info"]["direct_reply_count"],
            "quotes": post["text_post_app_info"]["quote_count"],
            "hashtags": ",".join(re.findall(r"#([^\s!@#$%ˆ&*()_+{}:\"|<>?\[\];'\,./`~']+)", post["caption"]["text"])) if post["caption"] else "",
            "unix_timestamp": int(post_timestamp.timestamp()),
        })
Import scraped Threads data
references =
['[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)', '[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)']
def
get_items(self, query):
34 def get_items(self, query): 35 """ 36 Run custom search 37 38 Not available for 9gag 39 """ 40 raise NotImplementedError("Threads datasets can only be created by importing data from elsewhere")
Run custom search
Not available for Threads
@staticmethod
def
map_item(post):
@staticmethod
def map_item(post):
    """
    Map an imported Threads post to a flat 4CAT item

    :param dict post:  Raw post object as captured by Zeeschuimer
    :return MappedItem:  Flattened post data
    """
    post_timestamp = datetime.fromtimestamp(post["taken_at"])

    if post["carousel_media"]:
        # carousel post: take the first media candidate per slide; index
        # instead of pop(0) so the raw imported item is not mutated
        image_urls = [c["image_versions2"]["candidates"][0]["url"] for c in post["carousel_media"] if c["image_versions2"]]
        video_urls = [c["video_versions"][0]["url"] for c in post["carousel_media"] if c["video_versions"]]
    else:
        image_urls = [post["image_versions2"]["candidates"][0]["url"]] if post["image_versions2"].get("candidates") else []
        video_urls = [post["video_versions"][0]["url"]] if post["video_versions"] else []

    linked_url = ""
    link_thumbnail = ""
    if post["text_post_app_info"].get("link_preview_attachment"):
        # outgoing links go through a redirect shim; the target URL is in
        # the "u" query parameter. Fall back to the shim URL itself if the
        # parameter is missing (the old `.get("u", "").pop()` crashed with
        # AttributeError in that case, since str has no pop())
        linked_url = post["text_post_app_info"]["link_preview_attachment"]["url"]
        shim_params = parse_qs(urlparse(linked_url).query)
        if shim_params.get("u"):
            linked_url = shim_params["u"][0]
        link_thumbnail = post["text_post_app_info"]["link_preview_attachment"].get("image_url")

    return MappedItem({
        "id": post["code"],
        "url": f"https://www.threads.net/@{post['user']['username']}/post/{post['code']}",
        "body": post["caption"]["text"] if post["caption"] else "",
        "timestamp": post_timestamp.strftime("%Y-%m-%d %H:%M:%S"),
        "author": post["user"]["username"],
        "author_is_verified": "yes" if post["user"].get("is_verified") else "no",
        "author_avatar": post["user"].get("profile_pic_url"),
        "image_url": ",".join(image_urls),
        "video_url": ",".join(video_urls),
        "link_url": linked_url,
        "link_thumbnail_url": link_thumbnail if link_thumbnail else "",
        "is_paid_partnership": "yes" if post["is_paid_partnership"] else "no",
        "likes": post["like_count"],
        "reposts": post["text_post_app_info"]["repost_count"],
        "replies": post["text_post_app_info"]["direct_reply_count"],
        "quotes": post["text_post_app_info"]["quote_count"],
        "hashtags": ",".join(re.findall(r"#([^\s!@#$%ˆ&*()_+{}:\"|<>?\[\];'\,./`~']+)", post["caption"]["text"])) if post["caption"] else "",
        "unix_timestamp": int(post_timestamp.timestamp()),
    })
Inherited Members
- backend.lib.worker.BasicWorker
- BasicWorker
- INTERRUPT_NONE
- INTERRUPT_RETRY
- INTERRUPT_CANCEL
- queue
- log
- manager
- interrupted
- modules
- init_time
- name
- run
- clean_up
- request_interrupt
- is_4cat_class
- backend.lib.search.Search
- max_workers
- prefix
- return_cols
- import_error_count
- import_warning_count
- process
- search
- import_from_file
- items_to_csv
- items_to_ndjson
- items_to_archive
- backend.lib.processor.BasicProcessor
- db
- job
- dataset
- owner
- source_dataset
- source_file
- config
- is_running_in_preset
- filepath
- work
- after_process
- remove_files
- abort
- add_field_to_parent
- iterate_archive_contents
- unpack_archive_contents
- extract_archived_file_by_name
- write_csv_items_and_finish
- write_archive_and_finish
- create_standalone
- map_item_method_available
- get_mapped_item
- is_filter
- get_options
- get_status
- is_top_dataset
- is_from_collector
- get_extension
- is_rankable
- exclude_followup_processors
- is_4cat_processor