Edit on GitHub

datasources.xiaohongshu.search_rednote

Import scraped RedNote data

It's prohibitively difficult to scrape data from RedNote within 4CAT itself due to its aggressive rate limiting. Instead, import data collected elsewhere.

View Source

  1"""
  2Import scraped RedNote data
  3
  4It's prohibitively difficult to scrape data from RedNote within 4CAT itself due
  5to its aggressive rate limiting. Instead, import data collected elsewhere.
  6"""
  7from datetime import datetime
  8
  9from backend.lib.search import Search
 10from common.lib.item_mapping import MappedItem, MissingMappedField
 11
 12
 13class SearchRedNote(Search):
 14    """
 15    Import scraped RedNote/Xiaohongshu/XSH data
 16    """
 17    type = "xiaohongshu-search"  # job ID
 18    category = "Search"  # category
 19    title = "Import scraped RedNote data"  # title displayed in UI
 20    description = "Import RedNote data collected with an external tool such as Zeeschuimer."  # description displayed in UI
 21    extension = "ndjson"  # extension of result file, used internally and in UI
 22    is_from_zeeschuimer = True
 23
 24    # not available as a processor for existing datasets
 25    accepts = [None]
 26    references = [
 27        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
 28        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)"
 29    ]
 30
 31    def get_items(self, query):
 32        """
 33        Run custom search
 34
 35        Not available for RedNote
 36        """
 37        raise NotImplementedError("RedNote/Xiaohongshu datasets can only be created by importing data from elsewhere")
 38
 39
 40    @staticmethod
 41    def map_item(post):
 42        """
 43        Map XSH object to 4CAT item
 44
 45        Depending on whether the object was captured from JSON or HTML, treat it
 46        differently. A lot of data is missing from HTML objects.
 47
 48        :param post:
 49        :return:
 50        """
 51        if post.get("_zs-origin") == "html":
 52            return SearchRedNote.map_item_from_html(post)
 53        else:
 54            if "note" in post:
 55                return SearchRedNote.map_item_from_json_embedded(post)
 56            else:
 57                return SearchRedNote.map_item_from_json_api_explore(post)
 58
 59    @staticmethod
 60    def map_item_from_json_api_explore(post):
 61        """
 62        Map API-sourced XSH object to 4CAT item
 63
 64        Most straightforward - JSON objects from the XSH web API, which do
 65        however not always contain the same fields.
 66
 67        :param dict post:
 68        :return MappedItem:
 69        """
 70        item = post["note_card"] if post.get("type") != "video" else post
 71        item_id = post.get("id", post.get("note_id"))
 72
 73
 74        # Images
 75        images = []
 76        if item.get("image_list"):
 77            for image in item["image_list"]:
 78                if "url_default" in image and image["url_default"]:
 79                    images.append(image["url_default"])
 80                elif "info_list" in image and image["info_list"]:
 81                    for img_info in image["info_list"]:
 82                        found = False
 83                        if img_info.get("image_scene") == "WB_DFT":
 84                            images.append(img_info["url"])
 85                            found = True
 86                            break
 87                    if not found:
 88                        images.append(image["info_list"][0]["url"])
 89        elif item.get("cover"):
 90            images.append(item["cover"]["url_default"])
 91        else:
 92            # no image found;
 93            images = MissingMappedField("")       
 94
 95        # permalinks need this token to work, else you get a 404 not found
 96        xsec_bit = f"?xsec_token={post['xsec_token']}" if post.get("xsec_token") else ""
 97        if item.get("video", {}).get("media"):
 98            video_url = item["video"]["media"]["stream"]["h264"][0]["master_url"]
 99        else:
100            video_url = MissingMappedField("")
101
102        timestamp = item.get("time", None)
103        return MappedItem({
104            "id": item_id,
105            "thread_id": item_id,
106            "url": f"https://www.xiaohongshu.com/explore/{post['id']}{xsec_bit}",
107            "title": item.get("display_title", ""),
108            "body": item.get("desc", "") if "desc" in item else MissingMappedField(""),
109            "timestamp": datetime.fromtimestamp(timestamp / 1000).strftime("%Y-%m-%d %H:%M:%S") if timestamp else MissingMappedField(""),
110            "author": item["user"]["nickname"],
111            "author_avatar_url": item["user"]["avatar"],
112            "image_urls": ",".join(images) if type(images) is list else images,
113            "video_url": video_url,
114            # only available when loading an individual post page, so skip
115            # "tags": ",".join(t["name"] for t in item["tag_list"]),
116            "likes": item["interact_info"]["liked_count"],
117            # "collects": item["interact_info"]["collected_count"],
118            # "comments": item["interact_info"]["comment_count"],
119            # "shares": item["interact_info"]["share_count"],
120            "unix_timestamp": int(timestamp / 1000) if timestamp else MissingMappedField(""),
121        })
122
123    @staticmethod
124    def map_item_from_json_embedded(item):
125        """
126        Map JSON object from an XHS HTML page
127
128        JSON objects from the HTML are formatted slightly differently, mostly
129        in that they use camelCase instead of underscores, but we can also
130        make a few more assumptions about the data
131
132        :param dict item:
133        :return MappedItem:
134        """
135        note = item["note"]
136        image = note["imageList"][0]["urlDefault"]
137        # permalinks need this token to work, else you get a 404 not found
138        xsec_bit = f"?xsec_token={note['xsecToken']}"
139        timestamp = item.get("time", None)
140
141        return MappedItem({
142            "id": item["id"],
143            "thread_id": item["id"],
144            "url": f"https://www.xiaohongshu.com/explore/{item['id']}{xsec_bit}",
145            "title": note.get("title", ""),
146            "body": note.get("desc", "") if "desc" in note else MissingMappedField(""),
147            "timestamp": datetime.fromtimestamp(timestamp / 1000).strftime("%Y-%m-%d %H:%M:%S") if timestamp else MissingMappedField(""),
148            "author": note["user"]["nickname"],
149            "author_avatar_url": note["user"]["avatar"],
150            "image_url": image,
151            "video_url": MissingMappedField(""),
152            # only available when loading an individual post page, so skip
153            # "tags": ",".join(t["name"] for t in item["tag_list"]),
154            "likes": item["interactInfo"]["likedCount"],
155            # "collects": item["interact_info"]["collected_count"],
156            # "comments": item["interact_info"]["comment_count"],
157            # "shares": item["interact_info"]["share_count"],
158            "unix_timestamp": int(timestamp / 1000) if timestamp else MissingMappedField(""),
159        })
160
161    def map_item_from_html(item):
162        """
163        Map pre-mapped item
164
165        These have been mapped by Zeeschuimer from the page HTML and contain
166        less data than JSON objects (but enough to be useful in some cases).
167
168        :param dict item:
169        :return MappedItem:
170        """
171        return MappedItem({
172            "id": item["id"],
173            "thread_id": item["id"],
174            "url": f"https://www.xiaohongshu.com{item['url']}",
175            "title": item["title"],
176            "body": MissingMappedField(""),
177            "timestamp": MissingMappedField(""),
178            "author": item["author_name"],
179            "author_avatar_url": item["author_avatar_url"],
180            "image_url": item["thumbnail_url"],
181            "video_url": MissingMappedField(""),
182            # "tags": MissingMappedField(""),
183            "likes": item["likes"],
184            # "collects": MissingMappedField(""),
185            # "comments": MissingMappedField(""),
186            # "shares": MissingMappedField(""),
187            "unix_timestamp": MissingMappedField(""),
188        })

class SearchRedNote(backend.lib.search.Search): View Source

 14class SearchRedNote(Search):
 15    """
 16    Import scraped RedNote/Xiaohongshu/XSH data
 17    """
 18    type = "xiaohongshu-search"  # job ID
 19    category = "Search"  # category
 20    title = "Import scraped RedNote data"  # title displayed in UI
 21    description = "Import RedNote data collected with an external tool such as Zeeschuimer."  # description displayed in UI
 22    extension = "ndjson"  # extension of result file, used internally and in UI
 23    is_from_zeeschuimer = True
 24
 25    # not available as a processor for existing datasets
 26    accepts = [None]
 27    references = [
 28        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
 29        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)"
 30    ]
 31
 32    def get_items(self, query):
 33        """
 34        Run custom search
 35
 36        Not available for RedNote
 37        """
 38        raise NotImplementedError("RedNote/Xiaohongshu datasets can only be created by importing data from elsewhere")
 39
 40
 41    @staticmethod
 42    def map_item(post):
 43        """
 44        Map XSH object to 4CAT item
 45
 46        Depending on whether the object was captured from JSON or HTML, treat it
 47        differently. A lot of data is missing from HTML objects.
 48
 49        :param post:
 50        :return:
 51        """
 52        if post.get("_zs-origin") == "html":
 53            return SearchRedNote.map_item_from_html(post)
 54        else:
 55            if "note" in post:
 56                return SearchRedNote.map_item_from_json_embedded(post)
 57            else:
 58                return SearchRedNote.map_item_from_json_api_explore(post)
 59
 60    @staticmethod
 61    def map_item_from_json_api_explore(post):
 62        """
 63        Map API-sourced XSH object to 4CAT item
 64
 65        Most straightforward - JSON objects from the XSH web API, which do
 66        however not always contain the same fields.
 67
 68        :param dict post:
 69        :return MappedItem:
 70        """
 71        item = post["note_card"] if post.get("type") != "video" else post
 72        item_id = post.get("id", post.get("note_id"))
 73
 74
 75        # Images
 76        images = []
 77        if item.get("image_list"):
 78            for image in item["image_list"]:
 79                if "url_default" in image and image["url_default"]:
 80                    images.append(image["url_default"])
 81                elif "info_list" in image and image["info_list"]:
 82                    for img_info in image["info_list"]:
 83                        found = False
 84                        if img_info.get("image_scene") == "WB_DFT":
 85                            images.append(img_info["url"])
 86                            found = True
 87                            break
 88                    if not found:
 89                        images.append(image["info_list"][0]["url"])
 90        elif item.get("cover"):
 91            images.append(item["cover"]["url_default"])
 92        else:
 93            # no image found;
 94            images = MissingMappedField("")       
 95
 96        # permalinks need this token to work, else you get a 404 not found
 97        xsec_bit = f"?xsec_token={post['xsec_token']}" if post.get("xsec_token") else ""
 98        if item.get("video", {}).get("media"):
 99            video_url = item["video"]["media"]["stream"]["h264"][0]["master_url"]
100        else:
101            video_url = MissingMappedField("")
102
103        timestamp = item.get("time", None)
104        return MappedItem({
105            "id": item_id,
106            "thread_id": item_id,
107            "url": f"https://www.xiaohongshu.com/explore/{post['id']}{xsec_bit}",
108            "title": item.get("display_title", ""),
109            "body": item.get("desc", "") if "desc" in item else MissingMappedField(""),
110            "timestamp": datetime.fromtimestamp(timestamp / 1000).strftime("%Y-%m-%d %H:%M:%S") if timestamp else MissingMappedField(""),
111            "author": item["user"]["nickname"],
112            "author_avatar_url": item["user"]["avatar"],
113            "image_urls": ",".join(images) if type(images) is list else images,
114            "video_url": video_url,
115            # only available when loading an individual post page, so skip
116            # "tags": ",".join(t["name"] for t in item["tag_list"]),
117            "likes": item["interact_info"]["liked_count"],
118            # "collects": item["interact_info"]["collected_count"],
119            # "comments": item["interact_info"]["comment_count"],
120            # "shares": item["interact_info"]["share_count"],
121            "unix_timestamp": int(timestamp / 1000) if timestamp else MissingMappedField(""),
122        })
123
124    @staticmethod
125    def map_item_from_json_embedded(item):
126        """
127        Map JSON object from an XHS HTML page
128
129        JSON objects from the HTML are formatted slightly differently, mostly
130        in that they use camelCase instead of underscores, but we can also
131        make a few more assumptions about the data
132
133        :param dict item:
134        :return MappedItem:
135        """
136        note = item["note"]
137        image = note["imageList"][0]["urlDefault"]
138        # permalinks need this token to work, else you get a 404 not found
139        xsec_bit = f"?xsec_token={note['xsecToken']}"
140        timestamp = item.get("time", None)
141
142        return MappedItem({
143            "id": item["id"],
144            "thread_id": item["id"],
145            "url": f"https://www.xiaohongshu.com/explore/{item['id']}{xsec_bit}",
146            "title": note.get("title", ""),
147            "body": note.get("desc", "") if "desc" in note else MissingMappedField(""),
148            "timestamp": datetime.fromtimestamp(timestamp / 1000).strftime("%Y-%m-%d %H:%M:%S") if timestamp else MissingMappedField(""),
149            "author": note["user"]["nickname"],
150            "author_avatar_url": note["user"]["avatar"],
151            "image_url": image,
152            "video_url": MissingMappedField(""),
153            # only available when loading an individual post page, so skip
154            # "tags": ",".join(t["name"] for t in item["tag_list"]),
155            "likes": item["interactInfo"]["likedCount"],
156            # "collects": item["interact_info"]["collected_count"],
157            # "comments": item["interact_info"]["comment_count"],
158            # "shares": item["interact_info"]["share_count"],
159            "unix_timestamp": int(timestamp / 1000) if timestamp else MissingMappedField(""),
160        })
161
162    def map_item_from_html(item):
163        """
164        Map pre-mapped item
165
166        These have been mapped by Zeeschuimer from the page HTML and contain
167        less data than JSON objects (but enough to be useful in some cases).
168
169        :param dict item:
170        :return MappedItem:
171        """
172        return MappedItem({
173            "id": item["id"],
174            "thread_id": item["id"],
175            "url": f"https://www.xiaohongshu.com{item['url']}",
176            "title": item["title"],
177            "body": MissingMappedField(""),
178            "timestamp": MissingMappedField(""),
179            "author": item["author_name"],
180            "author_avatar_url": item["author_avatar_url"],
181            "image_url": item["thumbnail_url"],
182            "video_url": MissingMappedField(""),
183            # "tags": MissingMappedField(""),
184            "likes": item["likes"],
185            # "collects": MissingMappedField(""),
186            # "comments": MissingMappedField(""),
187            # "shares": MissingMappedField(""),
188            "unix_timestamp": MissingMappedField(""),
189        })

Import scraped RedNote/Xiaohongshu/XHS data

type = 'xiaohongshu-search'

category = 'Search'

title = 'Import scraped RedNote data'

description = 'Import RedNote data collected with an external tool such as Zeeschuimer.'

extension = 'ndjson'

is_from_zeeschuimer = True

accepts = [None]

references = ['[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)', '[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)']

def get_items(self, query): View Source

32    def get_items(self, query):
33        """
34        Run custom search
35
36        Not available for RedNote
37        """
38        raise NotImplementedError("RedNote/Xiaohongshu datasets can only be created by importing data from elsewhere")

Run custom search

Not available for RedNote

@staticmethod

def map_item(post): View Source

41    @staticmethod
42    def map_item(post):
43        """
44        Map XSH object to 4CAT item
45
46        Depending on whether the object was captured from JSON or HTML, treat it
47        differently. A lot of data is missing from HTML objects.
48
49        :param post:
50        :return:
51        """
52        if post.get("_zs-origin") == "html":
53            return SearchRedNote.map_item_from_html(post)
54        else:
55            if "note" in post:
56                return SearchRedNote.map_item_from_json_embedded(post)
57            else:
58                return SearchRedNote.map_item_from_json_api_explore(post)

Map XHS object to 4CAT item

Depending on whether the object was captured from JSON or HTML, treat it differently. A lot of data is missing from HTML objects.

Parameters

post:

Returns

@staticmethod

def map_item_from_json_api_explore(post): View Source

 60    @staticmethod
 61    def map_item_from_json_api_explore(post):
 62        """
 63        Map API-sourced XSH object to 4CAT item
 64
 65        Most straightforward - JSON objects from the XSH web API, which do
 66        however not always contain the same fields.
 67
 68        :param dict post:
 69        :return MappedItem:
 70        """
 71        item = post["note_card"] if post.get("type") != "video" else post
 72        item_id = post.get("id", post.get("note_id"))
 73
 74
 75        # Images
 76        images = []
 77        if item.get("image_list"):
 78            for image in item["image_list"]:
 79                if "url_default" in image and image["url_default"]:
 80                    images.append(image["url_default"])
 81                elif "info_list" in image and image["info_list"]:
 82                    for img_info in image["info_list"]:
 83                        found = False
 84                        if img_info.get("image_scene") == "WB_DFT":
 85                            images.append(img_info["url"])
 86                            found = True
 87                            break
 88                    if not found:
 89                        images.append(image["info_list"][0]["url"])
 90        elif item.get("cover"):
 91            images.append(item["cover"]["url_default"])
 92        else:
 93            # no image found;
 94            images = MissingMappedField("")       
 95
 96        # permalinks need this token to work, else you get a 404 not found
 97        xsec_bit = f"?xsec_token={post['xsec_token']}" if post.get("xsec_token") else ""
 98        if item.get("video", {}).get("media"):
 99            video_url = item["video"]["media"]["stream"]["h264"][0]["master_url"]
100        else:
101            video_url = MissingMappedField("")
102
103        timestamp = item.get("time", None)
104        return MappedItem({
105            "id": item_id,
106            "thread_id": item_id,
107            "url": f"https://www.xiaohongshu.com/explore/{post['id']}{xsec_bit}",
108            "title": item.get("display_title", ""),
109            "body": item.get("desc", "") if "desc" in item else MissingMappedField(""),
110            "timestamp": datetime.fromtimestamp(timestamp / 1000).strftime("%Y-%m-%d %H:%M:%S") if timestamp else MissingMappedField(""),
111            "author": item["user"]["nickname"],
112            "author_avatar_url": item["user"]["avatar"],
113            "image_urls": ",".join(images) if type(images) is list else images,
114            "video_url": video_url,
115            # only available when loading an individual post page, so skip
116            # "tags": ",".join(t["name"] for t in item["tag_list"]),
117            "likes": item["interact_info"]["liked_count"],
118            # "collects": item["interact_info"]["collected_count"],
119            # "comments": item["interact_info"]["comment_count"],
120            # "shares": item["interact_info"]["share_count"],
121            "unix_timestamp": int(timestamp / 1000) if timestamp else MissingMappedField(""),
122        })

Map API-sourced XHS object to 4CAT item

Most straightforward - JSON objects from the XHS web API, which do not, however, always contain the same fields.

Parameters

dict post:

Returns

@staticmethod

def map_item_from_json_embedded(item): View Source

124    @staticmethod
125    def map_item_from_json_embedded(item):
126        """
127        Map JSON object from an XHS HTML page
128
129        JSON objects from the HTML are formatted slightly differently, mostly
130        in that they use camelCase instead of underscores, but we can also
131        make a few more assumptions about the data
132
133        :param dict item:
134        :return MappedItem:
135        """
136        note = item["note"]
137        image = note["imageList"][0]["urlDefault"]
138        # permalinks need this token to work, else you get a 404 not found
139        xsec_bit = f"?xsec_token={note['xsecToken']}"
140        timestamp = item.get("time", None)
141
142        return MappedItem({
143            "id": item["id"],
144            "thread_id": item["id"],
145            "url": f"https://www.xiaohongshu.com/explore/{item['id']}{xsec_bit}",
146            "title": note.get("title", ""),
147            "body": note.get("desc", "") if "desc" in note else MissingMappedField(""),
148            "timestamp": datetime.fromtimestamp(timestamp / 1000).strftime("%Y-%m-%d %H:%M:%S") if timestamp else MissingMappedField(""),
149            "author": note["user"]["nickname"],
150            "author_avatar_url": note["user"]["avatar"],
151            "image_url": image,
152            "video_url": MissingMappedField(""),
153            # only available when loading an individual post page, so skip
154            # "tags": ",".join(t["name"] for t in item["tag_list"]),
155            "likes": item["interactInfo"]["likedCount"],
156            # "collects": item["interact_info"]["collected_count"],
157            # "comments": item["interact_info"]["comment_count"],
158            # "shares": item["interact_info"]["share_count"],
159            "unix_timestamp": int(timestamp / 1000) if timestamp else MissingMappedField(""),
160        })

Map JSON object from an XHS HTML page

JSON objects from the HTML are formatted slightly differently, mostly in that they use camelCase instead of underscores, but we can also make a few more assumptions about the data

Parameters

dict item:

Returns

def map_item_from_html(item): View Source

162    def map_item_from_html(item):
163        """
164        Map pre-mapped item
165
166        These have been mapped by Zeeschuimer from the page HTML and contain
167        less data than JSON objects (but enough to be useful in some cases).
168
169        :param dict item:
170        :return MappedItem:
171        """
172        return MappedItem({
173            "id": item["id"],
174            "thread_id": item["id"],
175            "url": f"https://www.xiaohongshu.com{item['url']}",
176            "title": item["title"],
177            "body": MissingMappedField(""),
178            "timestamp": MissingMappedField(""),
179            "author": item["author_name"],
180            "author_avatar_url": item["author_avatar_url"],
181            "image_url": item["thumbnail_url"],
182            "video_url": MissingMappedField(""),
183            # "tags": MissingMappedField(""),
184            "likes": item["likes"],
185            # "collects": MissingMappedField(""),
186            # "comments": MissingMappedField(""),
187            # "shares": MissingMappedField(""),
188            "unix_timestamp": MissingMappedField(""),
189        })

Map pre-mapped item

These have been mapped by Zeeschuimer from the page HTML and contain less data than JSON objects (but enough to be useful in some cases).

Parameters

dict item:

Returns

Inherited Members

backend.lib.worker.BasicWorker: BasicWorker; INTERRUPT_NONE; INTERRUPT_RETRY; INTERRUPT_CANCEL; queue; log; manager; interrupted; modules; init_time; name; run; clean_up; request_interrupt; is_4cat_class
backend.lib.search.Search: max_workers; prefix; return_cols; import_error_count; import_warning_count; process; search; import_from_file; items_to_csv; items_to_ndjson; items_to_archive
backend.lib.processor.BasicProcessor: db; job; dataset; owner; source_dataset; source_file; config; is_running_in_preset; is_hidden; filepath; work; after_process; remove_files; abort; iterate_proxied_requests; push_proxied_request; flush_proxied_requests; iterate_archive_contents; unpack_archive_contents; extract_archived_file_by_name; write_csv_items_and_finish; write_archive_and_finish; create_standalone; save_annotations; map_item_method_available; get_mapped_item; is_filter; get_options; get_status; is_top_dataset; is_from_collector; get_extension; is_rankable; exclude_followup_processors; is_4cat_processor