Edit on GitHub

datasources.xiaohongshu_comments.search_rednote_comments

Import scraped RedNote comments

It's prohibitively difficult to scrape data from RedNote within 4CAT itself due to its aggressive rate limiting. Instead, import data collected elsewhere.

 1"""
 2Import scraped RedNote comments
 3
 4It's prohibitively difficult to scrape data from RedNote within 4CAT itself due
 5to its aggressive rate limiting. Instead, import data collected elsewhere.
 6"""
 7from datetime import datetime
 8
 9from backend.lib.search import Search
10from common.lib.item_mapping import MappedItem, MissingMappedField
11from common.lib.helpers import normalize_url_encoding
12
13
class SearchRedNoteComments(Search):
    """
    Import scraped RedNote/Xiaohongshu/XSH comment data

    Datasets of this type are created by importing data captured elsewhere;
    `get_items()` always raises, and `map_item()` converts one imported
    comment object into a 4CAT item.
    """
    type = "xiaohongshu-comments-search"  # job ID
    category = "Search"  # category
    title = "Import scraped RedNote comment data"  # title displayed in UI
    description = "Import RedNote comment data collected with an external tool such as Zeeschuimer."  # description displayed in UI
    extension = "ndjson"  # extension of result file, used internally and in UI
    is_from_zeeschuimer = True

    # not available as a processor for existing datasets
    accepts = [None]
    # NOTE(review): the worksheet below is the TikTok one - confirm that it is
    # the intended reference for RedNote comments
    references = [
        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)"
    ]

    def get_items(self, query):
        """
        Run custom search

        Not available for RedNote

        :param query:  Search parameters; not used
        :raises NotImplementedError:  Always
        """
        raise NotImplementedError("RedNote/Xiaohongshu comment datasets can only be created by importing data from elsewhere")

    @staticmethod
    def map_item(item):
        """
        Map XSH comment object to 4CAT item

        `create_time` is divided by 1000 before being used as a Unix
        timestamp, i.e. it is treated as a value in milliseconds. It is cast
        to an integer exactly once, so a numeric string is handled the same
        way as a number for both timestamp fields.

        :param dict item:  Comment object to map
        :return MappedItem:  Mapped item
        :raises KeyError:  If a required key is missing from the object
        """
        # cast before dividing; dividing a string by 1000 raises a TypeError
        created_ms = int(item["create_time"])

        # fromtimestamp() without a timezone returns naive local time
        timestamp = datetime.fromtimestamp(created_ms / 1000).strftime("%Y-%m-%d %H:%M:%S")

        return MappedItem({
            "collected_from_url": normalize_url_encoding(item.get("__import_meta", {}).get("source_platform_url", "")),  # Zeeschuimer metadata
            "id": item["id"],
            "thread_id": item["note_id"],
            "url": f"https://www.xiaohongshu.com/explore/{item['note_id']}",
            "body": item.get("content", ""),
            "timestamp": timestamp,
            "author": item["user_info"]["nickname"],
            "author_avatar_url": item["user_info"]["image"],
            # an absent or empty location is flagged as missing
            "ip_location": item["ip_location"] if item.get("ip_location") else MissingMappedField(""),
            "likes": item["like_count"],
            "replies": item["sub_comment_count"],
            "unix_timestamp": int(created_ms / 1000)
        })
class SearchRedNoteComments(backend.lib.search.Search):
class SearchRedNoteComments(Search):
    """
    Import scraped RedNote/Xiaohongshu/XSH comment data

    Datasets of this type are created by importing data captured elsewhere;
    `get_items()` always raises, and `map_item()` converts one imported
    comment object into a 4CAT item.
    """
    type = "xiaohongshu-comments-search"  # job ID
    category = "Search"  # category
    title = "Import scraped RedNote comment data"  # title displayed in UI
    description = "Import RedNote comment data collected with an external tool such as Zeeschuimer."  # description displayed in UI
    extension = "ndjson"  # extension of result file, used internally and in UI
    is_from_zeeschuimer = True

    # not available as a processor for existing datasets
    accepts = [None]
    # NOTE(review): the worksheet below is the TikTok one - confirm that it is
    # the intended reference for RedNote comments
    references = [
        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)"
    ]

    def get_items(self, query):
        """
        Run custom search

        Not available for RedNote

        :param query:  Search parameters; not used
        :raises NotImplementedError:  Always
        """
        raise NotImplementedError("RedNote/Xiaohongshu comment datasets can only be created by importing data from elsewhere")

    @staticmethod
    def map_item(item):
        """
        Map XSH comment object to 4CAT item

        `create_time` is divided by 1000 before being used as a Unix
        timestamp, i.e. it is treated as a value in milliseconds. It is cast
        to an integer exactly once, so a numeric string is handled the same
        way as a number for both timestamp fields.

        :param dict item:  Comment object to map
        :return MappedItem:  Mapped item
        :raises KeyError:  If a required key is missing from the object
        """
        # cast before dividing; dividing a string by 1000 raises a TypeError
        created_ms = int(item["create_time"])

        # fromtimestamp() without a timezone returns naive local time
        timestamp = datetime.fromtimestamp(created_ms / 1000).strftime("%Y-%m-%d %H:%M:%S")

        return MappedItem({
            "collected_from_url": normalize_url_encoding(item.get("__import_meta", {}).get("source_platform_url", "")),  # Zeeschuimer metadata
            "id": item["id"],
            "thread_id": item["note_id"],
            "url": f"https://www.xiaohongshu.com/explore/{item['note_id']}",
            "body": item.get("content", ""),
            "timestamp": timestamp,
            "author": item["user_info"]["nickname"],
            "author_avatar_url": item["user_info"]["image"],
            # an absent or empty location is flagged as missing
            "ip_location": item["ip_location"] if item.get("ip_location") else MissingMappedField(""),
            "likes": item["like_count"],
            "replies": item["sub_comment_count"],
            "unix_timestamp": int(created_ms / 1000)
        })

Import scraped RedNote/Xiaohongshu/XSH comment data

type = 'xiaohongshu-comments-search'
category = 'Search'
title = 'Import scraped RedNote comment data'
description = 'Import RedNote comment data collected with an external tool such as Zeeschuimer.'
extension = 'ndjson'
is_from_zeeschuimer = True
accepts = [None]
references = ['[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)', '[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)']
def get_items(self, query):
def get_items(self, query):
    """
    Refuse to collect items

    Collecting directly is not available for RedNote, so this method never
    returns and always raises instead.

    :param query:  Search parameters; not used
    :raises NotImplementedError:  Always
    """
    message = "RedNote/Xiaohongshu comment datasets can only be created by importing data from elsewhere"
    raise NotImplementedError(message)

Run custom search

Not available for RedNote

@staticmethod
def map_item(item):
@staticmethod
def map_item(item):
    """
    Map XSH comment object to 4CAT item

    `create_time` is divided by 1000 before being used as a Unix timestamp,
    i.e. it is treated as a value in milliseconds. It is cast to an integer
    exactly once, so a numeric string is handled the same way as a number
    for both timestamp fields.

    :param dict item:  Comment object to map
    :return MappedItem:  Mapped item
    :raises KeyError:  If a required key is missing from the object
    """
    # cast before dividing; dividing a string by 1000 raises a TypeError
    created_ms = int(item["create_time"])

    # fromtimestamp() without a timezone returns naive local time
    timestamp = datetime.fromtimestamp(created_ms / 1000).strftime("%Y-%m-%d %H:%M:%S")

    return MappedItem({
        "collected_from_url": normalize_url_encoding(item.get("__import_meta", {}).get("source_platform_url", "")),  # Zeeschuimer metadata
        "id": item["id"],
        "thread_id": item["note_id"],
        "url": f"https://www.xiaohongshu.com/explore/{item['note_id']}",
        "body": item.get("content", ""),
        "timestamp": timestamp,
        "author": item["user_info"]["nickname"],
        "author_avatar_url": item["user_info"]["image"],
        # an absent or empty location is flagged as missing
        "ip_location": item["ip_location"] if item.get("ip_location") else MissingMappedField(""),
        "likes": item["like_count"],
        "replies": item["sub_comment_count"],
        "unix_timestamp": int(created_ms / 1000)
    })

Map XSH comment object to 4CAT item

Depending on whether the object was captured from JSON or HTML, treat it differently. A lot of data is missing from HTML objects.

Parameters
  • item: XSH comment object to map
Returns