datasources.tiktok_comments.search_tiktok_comments
Import scraped TikTok comment data
It's prohibitively difficult to scrape data from TikTok within 4CAT itself due to TikTok's aggressive rate limiting. Instead, import data collected elsewhere.
"""
Import scraped TikTok comment data

It's prohibitively difficult to scrape data from TikTok within 4CAT itself due
to its aggressive rate limiting. Instead, import data collected elsewhere.
"""
from datetime import datetime

from backend.lib.search import Search
from common.lib.item_mapping import MappedItem


class SearchTikTokComments(Search):
    """
    Import scraped TikTok comment data
    """
    type = "tiktok-comments-search"  # job ID
    category = "Search"  # category
    title = "Import scraped Tiktok comment data"  # title displayed in UI
    description = "Import Tiktok comment data collected with an external tool such as Zeeschuimer."  # description displayed in UI
    extension = "ndjson"  # extension of result file, used internally and in UI
    is_from_zeeschuimer = True

    # not available as a processor for existing datasets
    accepts = [None]
    references = [
        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)"
    ]

    def get_items(self, query):
        """
        Run custom search

        Not available for TikTok comments

        :param dict query:  Query parameters (unused)
        :raises NotImplementedError:  Always; data can only be imported
        """
        raise NotImplementedError("TikTok comment datasets can only be created by importing data from elsewhere")

    @staticmethod
    def map_item(item):
        """
        Map a raw TikTok comment object to a flat 4CAT item

        :param dict item:  Raw comment object as captured (e.g. by Zeeschuimer)
        :return MappedItem:  Flattened comment record
        """
        item_datetime = datetime.fromtimestamp(item["create_time"]).strftime("%Y-%m-%d %H:%M:%S")

        # a reply_id of "0" means this is a top-level comment on the post
        # itself; otherwise reply_id is the comment being replied to, which
        # then serves as the thread ID
        is_top_level = item["reply_id"] == "0"
        thread_id = item["aweme_id"] if is_top_level else item["reply_id"]
        avatar_url = item["user"]["avatar_thumb"]["url_list"][0]

        return MappedItem({
            "id": item["cid"],
            "thread_id": thread_id,
            "author": item["user"]["unique_id"],
            "author_full": item["user"]["nickname"],
            "author_avatar_url": avatar_url,
            "body": item["text"],
            "timestamp": item_datetime,
            "unix_timestamp": item["create_time"],
            "likes": item["digg_count"],
            "replies": item.get("reply_comment_total", 0),
            "post_id": item["aweme_id"],
            # the share URL ends in ".html" (plus query string); strip that
            # suffix to get the canonical post URL
            "post_url": item["share_info"]["url"].split(".html")[0],
            "post_body": item["share_info"]["title"],
            "comment_url": item["share_info"]["url"],
            "is_liked_by_post_author": "yes" if item.get("author_pin") else "no",
            "is_sticky": "yes" if item["stick_position"] else "no",
            "is_comment_on_comment": "no" if is_top_level else "yes",
            "language_guess": item["comment_language"]
        })
class SearchTikTokComments(Search):
    """
    Import scraped TikTok comment data
    """
    type = "tiktok-comments-search"  # job ID
    category = "Search"  # category
    title = "Import scraped Tiktok comment data"  # title displayed in UI
    description = "Import Tiktok comment data collected with an external tool such as Zeeschuimer."  # description displayed in UI
    extension = "ndjson"  # extension of result file, used internally and in UI
    is_from_zeeschuimer = True

    # not available as a processor for existing datasets
    accepts = [None]
    references = [
        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)"
    ]

    def get_items(self, query):
        """
        Run custom search

        Not available for TikTok comments
        """
        # this data source only ingests externally captured data
        raise NotImplementedError("TikTok comment datasets can only be created by importing data from elsewhere")

    @staticmethod
    def map_item(item):
        """
        Turn one raw comment object into a flat 4CAT record
        """
        commenter = item["user"]
        share = item["share_info"]
        reply_to = item["reply_id"]
        # "0" marks a top-level comment on the post rather than a reply
        top_level = reply_to == "0"
        posted_at = datetime.fromtimestamp(item["create_time"])

        return MappedItem({
            "id": item["cid"],
            "thread_id": item["aweme_id"] if top_level else reply_to,
            "author": commenter["unique_id"],
            "author_full": commenter["nickname"],
            "author_avatar_url": commenter["avatar_thumb"]["url_list"][0],
            "body": item["text"],
            "timestamp": posted_at.strftime("%Y-%m-%d %H:%M:%S"),
            "unix_timestamp": item["create_time"],
            "likes": item["digg_count"],
            "replies": item.get("reply_comment_total", 0),
            "post_id": item["aweme_id"],
            "post_url": share["url"].split(".html")[0],
            "post_body": share["title"],
            "comment_url": share["url"],
            "is_liked_by_post_author": "yes" if item.get("author_pin") else "no",
            "is_sticky": "yes" if item["stick_position"] else "no",
            "is_comment_on_comment": "no" if top_level else "yes",
            "language_guess": item["comment_language"]
        })
Import scraped TikTok comment data
references =
['[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)', '[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)']
def
get_items(self, query):
def get_items(self, query):
    """
    Run custom search

    Not available for TikTok comments
    """
    # datasets of this type are always imported from external captures,
    # never collected by 4CAT itself
    raise NotImplementedError("TikTok comment datasets can only be created by importing data from elsewhere")
Run custom search
Not available for TikTok comments
@staticmethod
def
map_item(item):
@staticmethod
def map_item(item):
    """
    Flatten a raw TikTok comment object into a 4CAT item
    """
    author = item["user"]
    share_info = item["share_info"]
    # reply_id "0" means the comment sits directly under the post; any
    # other value is the parent comment, which doubles as the thread ID
    parent = item["reply_id"]
    on_post = parent == "0"

    fields = {
        "id": item["cid"],
        "thread_id": item["aweme_id"] if on_post else parent,
        "author": author["unique_id"],
        "author_full": author["nickname"],
        "author_avatar_url": author["avatar_thumb"]["url_list"][0],
        "body": item["text"],
        "timestamp": datetime.fromtimestamp(item["create_time"]).strftime("%Y-%m-%d %H:%M:%S"),
        "unix_timestamp": item["create_time"],
        "likes": item["digg_count"],
        "replies": item.get("reply_comment_total", 0),
        "post_id": item["aweme_id"],
        "post_url": share_info["url"].split(".html")[0],
        "post_body": share_info["title"],
        "comment_url": share_info["url"],
        "is_liked_by_post_author": "yes" if item.get("author_pin") else "no",
        "is_sticky": "yes" if item["stick_position"] else "no",
        "is_comment_on_comment": "no" if on_post else "yes",
        "language_guess": item["comment_language"]
    }
    return MappedItem(fields)
Inherited Members
- backend.lib.worker.BasicWorker
- BasicWorker
- INTERRUPT_NONE
- INTERRUPT_RETRY
- INTERRUPT_CANCEL
- queue
- log
- manager
- interrupted
- modules
- init_time
- name
- run
- clean_up
- request_interrupt
- is_4cat_class
- backend.lib.search.Search
- max_workers
- prefix
- return_cols
- import_error_count
- import_warning_count
- process
- search
- import_from_file
- items_to_csv
- items_to_ndjson
- items_to_archive
- backend.lib.processor.BasicProcessor
- db
- job
- dataset
- owner
- source_dataset
- source_file
- config
- is_running_in_preset
- filepath
- work
- after_process
- remove_files
- abort
- iterate_proxied_requests
- push_proxied_request
- flush_proxied_requests
- iterate_archive_contents
- unpack_archive_contents
- extract_archived_file_by_name
- write_csv_items_and_finish
- write_archive_and_finish
- create_standalone
- save_annotations
- map_item_method_available
- get_mapped_item
- is_filter
- get_options
- get_status
- is_top_dataset
- is_from_collector
- get_extension
- is_rankable
- exclude_followup_processors
- is_4cat_processor