datasources.xiaohongshu_comments.search_rednote_comments
Import scraped RedNote comments
It's prohibitively difficult to scrape data from RedNote within 4CAT itself due to its aggressive rate limiting. Instead, import data collected elsewhere.
"""
Import scraped RedNote comments

It's prohibitively difficult to scrape data from RedNote within 4CAT itself due
to its aggressive rate limiting. Instead, import data collected elsewhere.
"""
from datetime import datetime

from backend.lib.search import Search
from common.lib.item_mapping import MappedItem, MissingMappedField
from common.lib.helpers import normalize_url_encoding


class SearchRedNoteComments(Search):
    """
    Import scraped RedNote/Xiaohongshu/XSH comment data

    This worker does not collect comments itself (`get_items` always raises);
    `map_item` flattens an imported comment object into a 4CAT item.
    """
    type = "xiaohongshu-comments-search"  # job ID
    category = "Search"  # category
    title = "Import scraped RedNote comment data"  # title displayed in UI
    description = "Import RedNote comment data collected with an external tool such as Zeeschuimer."  # description displayed in UI
    extension = "ndjson"  # extension of result file, used internally and in UI
    is_from_zeeschuimer = True

    # not available as a processor for existing datasets
    accepts = [None]
    references = [
        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)"
    ]

    def get_items(self, query):
        """
        Run custom search

        Not available for RedNote

        :param query:  Query parameters (unused)
        :raises NotImplementedError:  Always
        """
        raise NotImplementedError("RedNote/Xiaohongshu comment datasets can only be created by importing data from elsewhere")

    @staticmethod
    def map_item(item):
        """
        Map XSH comment object to 4CAT item

        `create_time` is read as a Unix timestamp in milliseconds; it may be a
        number or a numeric string. `content`, `ip_location` and the
        `__import_meta` metadata are optional, all other fields are accessed
        directly.

        :param dict item:  Imported comment object
        :return MappedItem:  Mapped comment
        :raises KeyError:  If a field that is accessed directly (e.g. `id`,
        `note_id`, `user_info`, `create_time`) is absent
        """
        # parse once, so that both timestamp fields below accept the same
        # input types (previously only one of them tolerated a string)
        created_ms = int(item["create_time"])

        # fromtimestamp() without a tz argument yields local time
        timestamp = datetime.fromtimestamp(created_ms / 1000).strftime("%Y-%m-%d %H:%M:%S")

        return MappedItem({
            "collected_from_url": normalize_url_encoding(item.get("__import_meta", {}).get("source_platform_url", "")),  # Zeeschuimer metadata
            "id": item["id"],
            "thread_id": item["note_id"],
            "url": f"https://www.xiaohongshu.com/explore/{item['note_id']}",
            "body": item.get("content", ""),
            "timestamp": timestamp,
            "author": item["user_info"]["nickname"],
            "author_avatar_url": item["user_info"]["image"],
            # an absent or empty location is flagged as missing rather than
            # passed through as an empty value
            "ip_location": item["ip_location"] if item.get("ip_location") else MissingMappedField(""),
            "likes": item["like_count"],
            "replies": item["sub_comment_count"],
            "unix_timestamp": int(created_ms / 1000)
        })
class SearchRedNoteComments(Search):
    """
    Import scraped RedNote/Xiaohongshu/XSH comment data

    This worker does not collect comments itself (`get_items` always raises);
    `map_item` flattens an imported comment object into a 4CAT item.
    """
    type = "xiaohongshu-comments-search"  # job ID
    category = "Search"  # category
    title = "Import scraped RedNote comment data"  # title displayed in UI
    description = "Import RedNote comment data collected with an external tool such as Zeeschuimer."  # description displayed in UI
    extension = "ndjson"  # extension of result file, used internally and in UI
    is_from_zeeschuimer = True

    # not available as a processor for existing datasets
    accepts = [None]
    references = [
        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)"
    ]

    def get_items(self, query):
        """
        Run custom search

        Not available for RedNote

        :param query:  Query parameters (unused)
        :raises NotImplementedError:  Always
        """
        raise NotImplementedError("RedNote/Xiaohongshu comment datasets can only be created by importing data from elsewhere")

    @staticmethod
    def map_item(item):
        """
        Map XSH comment object to 4CAT item

        `create_time` is read as a Unix timestamp in milliseconds; it may be a
        number or a numeric string. `content`, `ip_location` and the
        `__import_meta` metadata are optional, all other fields are accessed
        directly.

        :param dict item:  Imported comment object
        :return MappedItem:  Mapped comment
        :raises KeyError:  If a field that is accessed directly (e.g. `id`,
        `note_id`, `user_info`, `create_time`) is absent
        """
        # parse once, so that both timestamp fields below accept the same
        # input types (previously only one of them tolerated a string)
        created_ms = int(item["create_time"])

        # fromtimestamp() without a tz argument yields local time
        timestamp = datetime.fromtimestamp(created_ms / 1000).strftime("%Y-%m-%d %H:%M:%S")

        return MappedItem({
            "collected_from_url": normalize_url_encoding(item.get("__import_meta", {}).get("source_platform_url", "")),  # Zeeschuimer metadata
            "id": item["id"],
            "thread_id": item["note_id"],
            "url": f"https://www.xiaohongshu.com/explore/{item['note_id']}",
            "body": item.get("content", ""),
            "timestamp": timestamp,
            "author": item["user_info"]["nickname"],
            "author_avatar_url": item["user_info"]["image"],
            # an absent or empty location is flagged as missing rather than
            # passed through as an empty value
            "ip_location": item["ip_location"] if item.get("ip_location") else MissingMappedField(""),
            "likes": item["like_count"],
            "replies": item["sub_comment_count"],
            "unix_timestamp": int(created_ms / 1000)
        })
Import scraped RedNote/Xiaohongshu/XSH comment data
references =
['[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)', '[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)']
def
get_items(self, query):
def get_items(self, query):
    """
    Run custom search

    Searching is not available for RedNote, so this always fails; the error
    message tells the caller that data has to be imported instead.

    :param query:  Query parameters (unused)
    :raises NotImplementedError:  Always
    """
    raise NotImplementedError("RedNote/Xiaohongshu comment datasets can only be created by importing data from elsewhere")
Run custom search
Not available for RedNote
@staticmethod
def
map_item(item):
@staticmethod
def map_item(item):
    """
    Map XSH comment object to 4CAT item

    `create_time` is read as a Unix timestamp in milliseconds; it may be a
    number or a numeric string. `content`, `ip_location` and the
    `__import_meta` metadata are optional, all other fields are accessed
    directly.

    :param dict item:  Imported comment object
    :return MappedItem:  Mapped comment
    :raises KeyError:  If a field that is accessed directly (e.g. `id`,
    `note_id`, `user_info`, `create_time`) is absent
    """
    # parse once, so that both timestamp fields below accept the same input
    # types (previously only one of them tolerated a string)
    created_ms = int(item["create_time"])

    # fromtimestamp() without a tz argument yields local time
    timestamp = datetime.fromtimestamp(created_ms / 1000).strftime("%Y-%m-%d %H:%M:%S")

    return MappedItem({
        "collected_from_url": normalize_url_encoding(item.get("__import_meta", {}).get("source_platform_url", "")),  # Zeeschuimer metadata
        "id": item["id"],
        "thread_id": item["note_id"],
        "url": f"https://www.xiaohongshu.com/explore/{item['note_id']}",
        "body": item.get("content", ""),
        "timestamp": timestamp,
        "author": item["user_info"]["nickname"],
        "author_avatar_url": item["user_info"]["image"],
        # an absent or empty location is flagged as missing rather than passed
        # through as an empty value
        "ip_location": item["ip_location"] if item.get("ip_location") else MissingMappedField(""),
        "likes": item["like_count"],
        "replies": item["sub_comment_count"],
        "unix_timestamp": int(created_ms / 1000)
    })
Map XSH comment object to 4CAT item
Depending on whether the object was captured from JSON or HTML, treat it differently. A lot of data is missing from HTML objects.
Parameters
- item: Imported RedNote comment object (a dict) to map
Returns
- MappedItem containing the comment's 4CAT fields
Inherited Members
- backend.lib.worker.BasicWorker
- BasicWorker
- INTERRUPT_NONE
- INTERRUPT_RETRY
- INTERRUPT_CANCEL
- queue
- log
- manager
- interrupted
- modules
- init_time
- name
- run
- clean_up
- request_interrupt
- run_interruptable_process
- get_queue_id
- is_4cat_class
- backend.lib.search.Search
- max_workers
- prefix
- return_cols
- import_error_count
- import_warning_count
- process
- search
- import_from_file
- items_to_csv
- items_to_ndjson
- items_to_archive
- backend.lib.processor.BasicProcessor
- db
- job
- dataset
- owner
- source_dataset
- source_file
- config
- is_running_in_preset
- filepath
- for_cleanup
- work
- after_process
- clean_up_on_error
- abort
- iterate_proxied_requests
- push_proxied_request
- flush_proxied_requests
- unpack_archive_contents
- extract_archived_file_by_name
- write_csv_items_and_finish
- write_archive_and_finish
- create_standalone
- save_annotations
- map_item_method_available
- get_mapped_item
- is_filter
- get_options
- get_status
- is_top_dataset
- is_from_collector
- get_extension
- is_rankable
- exclude_followup_processors
- is_4cat_processor