Edit on GitHub

datasources.xiaohongshu.search_rednote

Import scraped RedNote data

It's prohibitively difficult to scrape data from RedNote within 4CAT itself due to its aggressive rate limiting. Instead, import data collected elsewhere.

  1"""
  2Import scraped RedNote data
  3
  4It's prohibitively difficult to scrape data from RedNote within 4CAT itself due
  5to its aggressive rate limiting. Instead, import data collected elsewhere.
  6"""
  7from datetime import datetime
  8
  9from backend.lib.search import Search
 10from common.lib.item_mapping import MappedItem, MissingMappedField
 11
 12
 13class SearchRedNote(Search):
 14    """
 15    Import scraped RedNote/Xiaohongshu/XSH data
 16    """
 17    type = "xiaohongshu-search"  # job ID
 18    category = "Search"  # category
 19    title = "Import scraped RedNote data"  # title displayed in UI
 20    description = "Import RedNote data collected with an external tool such as Zeeschuimer."  # description displayed in UI
 21    extension = "ndjson"  # extension of result file, used internally and in UI
 22    is_from_zeeschuimer = True
 23
 24    # not available as a processor for existing datasets
 25    accepts = [None]
 26    references = [
 27        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
 28        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)"
 29    ]
 30
 31    def get_items(self, query):
 32        """
 33        Run custom search
 34
 35        Not available for RedNote
 36        """
 37        raise NotImplementedError("RedNote/Xiaohongshu datasets can only be created by importing data from elsewhere")
 38
 39
 40    @staticmethod
 41    def map_item(post):
 42        """
 43        Map XSH object to 4CAT item
 44
 45        Depending on whether the object was captured from JSON or HTML, treat it
 46        differently. A lot of data is missing from HTML objects.
 47
 48        :param post:
 49        :return:
 50        """
 51        if post.get("_zs-origin") == "html":
 52            return SearchRedNote.map_item_from_html(post)
 53        else:
 54            if "note" in post:
 55                return SearchRedNote.map_item_from_json_embedded(post)
 56            else:
 57                return SearchRedNote.map_item_from_json_api_explore(post)
 58
 59    @staticmethod
 60    def map_item_from_json_api_explore(post):
 61        """
 62        Map API-sourced XSH object to 4CAT item
 63
 64        Most straightforward - JSON objects from the XSH web API, which do
 65        however not always contain the same fields.
 66
 67        :param dict post:
 68        :return MappedItem:
 69        """
 70        item = post["note_card"] if post.get("type") != "video" else post
 71        item_id = post.get("id", post.get("note_id"))
 72
 73        import json
 74
 75        # Images
 76        images = []
 77        if item.get("image_list"):
 78            for image in item["image_list"]:
 79                if "url_default" in image and image["url_default"]:
 80                    images.append(image["url_default"])
 81                elif "info_list" in image and image["info_list"]:
 82                    for img_info in image["info_list"]:
 83                        found = False
 84                        if img_info.get("image_scene") == "WB_DFT":
 85                            images.append(img_info["url"])
 86                            found = True
 87                            break
 88                    if not found:
 89                        images.append(image["info_list"][0]["url"])
 90        elif item.get("cover"):
 91            images.append(item["cover"]["url_default"])
 92        else:
 93            # no image found;
 94            images = MissingMappedField("")       
 95
 96        # permalinks need this token to work, else you get a 404 not found
 97        xsec_bit = f"?xsec_token={post['xsec_token']}" if post.get("xsec_token") else ""
 98        if item.get("video", {}).get("media"):
 99            video_url = item["video"]["media"]["stream"]["h264"][0]["master_url"]
100        else:
101            video_url = MissingMappedField("")
102
103        timestamp = item.get("time", None)
104        return MappedItem({
105            "id": item_id,
106            "thread_id": item_id,
107            "url": f"https://www.xiaohongshu.com/explore/{post['id']}{xsec_bit}",
108            "title": item.get("display_title", ""),
109            "body": item.get("desc", "") if "desc" in item else MissingMappedField(""),
110            "timestamp": datetime.fromtimestamp(timestamp / 1000).strftime("%Y-%m-%d %H:%M:%S") if timestamp else MissingMappedField(""),
111            "author": item["user"]["nickname"],
112            "author_avatar_url": item["user"]["avatar"],
113            "image_urls": ",".join(images) if type(images) is list else images,
114            "video_url": video_url,
115            # only available when loading an individual post page, so skip
116            # "tags": ",".join(t["name"] for t in item["tag_list"]),
117            "likes": item["interact_info"]["liked_count"],
118            # "collects": item["interact_info"]["collected_count"],
119            # "comments": item["interact_info"]["comment_count"],
120            # "shares": item["interact_info"]["share_count"],
121            "unix_timestamp": int(timestamp / 1000) if timestamp else MissingMappedField(""),
122        })
123
124    @staticmethod
125    def map_item_from_json_embedded(item):
126        """
127        Map JSON object from an XHS HTML page
128
129        JSON objects from the HTML are formatted slightly differently, mostly
130        in that they use camelCase instead of underscores, but we can also
131        make a few more assumptions about the data
132
133        :param dict item:
134        :return MappedItem:
135        """
136        note = item["note"]
137        image = note["imageList"][0]["urlDefault"]
138        # permalinks need this token to work, else you get a 404 not found
139        xsec_bit = f"?xsec_token={note['xsecToken']}"
140        timestamp = item.get("time", None)
141
142        return MappedItem({
143            "id": item["id"],
144            "thread_id": item["id"],
145            "url": f"https://www.xiaohongshu.com/explore/{item['id']}{xsec_bit}",
146            "title": note.get("title", ""),
147            "body": note.get("desc", "") if "desc" in note else MissingMappedField(""),
148            "timestamp": datetime.fromtimestamp(timestamp / 1000).strftime("%Y-%m-%d %H:%M:%S") if timestamp else MissingMappedField(""),
149            "author": note["user"]["nickname"],
150            "author_avatar_url": note["user"]["avatar"],
151            "image_url": image,
152            "video_url": MissingMappedField(""),
153            # only available when loading an individual post page, so skip
154            # "tags": ",".join(t["name"] for t in item["tag_list"]),
155            "likes": item["interactInfo"]["likedCount"],
156            # "collects": item["interact_info"]["collected_count"],
157            # "comments": item["interact_info"]["comment_count"],
158            # "shares": item["interact_info"]["share_count"],
159            "unix_timestamp": int(timestamp / 1000) if timestamp else MissingMappedField(""),
160        })
161
162    def map_item_from_html(item):
163        """
164        Map pre-mapped item
165
166        These have been mapped by Zeeschuimer from the page HTML and contain
167        less data than JSON objects (but enough to be useful in some cases).
168
169        :param dict item:
170        :return MappedItem:
171        """
172        return MappedItem({
173            "id": item["id"],
174            "thread_id": item["id"],
175            "url": f"https://www.xiaohongshu.com{item['url']}",
176            "title": item["title"],
177            "body": MissingMappedField(""),
178            "timestamp": MissingMappedField(""),
179            "author": item["author_name"],
180            "author_avatar_url": item["author_avatar_url"],
181            "image_url": item["thumbnail_url"],
182            "video_url": MissingMappedField(""),
183            # "tags": MissingMappedField(""),
184            "likes": item["likes"],
185            # "collects": MissingMappedField(""),
186            # "comments": MissingMappedField(""),
187            # "shares": MissingMappedField(""),
188            "unix_timestamp": MissingMappedField(""),
189        })
class SearchRedNote(backend.lib.search.Search):
 14class SearchRedNote(Search):
 15    """
 16    Import scraped RedNote/Xiaohongshu/XSH data
 17    """
 18    type = "xiaohongshu-search"  # job ID
 19    category = "Search"  # category
 20    title = "Import scraped RedNote data"  # title displayed in UI
 21    description = "Import RedNote data collected with an external tool such as Zeeschuimer."  # description displayed in UI
 22    extension = "ndjson"  # extension of result file, used internally and in UI
 23    is_from_zeeschuimer = True
 24
 25    # not available as a processor for existing datasets
 26    accepts = [None]
 27    references = [
 28        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
 29        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)"
 30    ]
 31
 32    def get_items(self, query):
 33        """
 34        Run custom search
 35
 36        Not available for RedNote
 37        """
 38        raise NotImplementedError("RedNote/Xiaohongshu datasets can only be created by importing data from elsewhere")
 39
 40
 41    @staticmethod
 42    def map_item(post):
 43        """
 44        Map XSH object to 4CAT item
 45
 46        Depending on whether the object was captured from JSON or HTML, treat it
 47        differently. A lot of data is missing from HTML objects.
 48
 49        :param post:
 50        :return:
 51        """
 52        if post.get("_zs-origin") == "html":
 53            return SearchRedNote.map_item_from_html(post)
 54        else:
 55            if "note" in post:
 56                return SearchRedNote.map_item_from_json_embedded(post)
 57            else:
 58                return SearchRedNote.map_item_from_json_api_explore(post)
 59
 60    @staticmethod
 61    def map_item_from_json_api_explore(post):
 62        """
 63        Map API-sourced XSH object to 4CAT item
 64
 65        Most straightforward - JSON objects from the XSH web API, which do
 66        however not always contain the same fields.
 67
 68        :param dict post:
 69        :return MappedItem:
 70        """
 71        item = post["note_card"] if post.get("type") != "video" else post
 72        item_id = post.get("id", post.get("note_id"))
 73
 74        import json
 75
 76        # Images
 77        images = []
 78        if item.get("image_list"):
 79            for image in item["image_list"]:
 80                if "url_default" in image and image["url_default"]:
 81                    images.append(image["url_default"])
 82                elif "info_list" in image and image["info_list"]:
 83                    for img_info in image["info_list"]:
 84                        found = False
 85                        if img_info.get("image_scene") == "WB_DFT":
 86                            images.append(img_info["url"])
 87                            found = True
 88                            break
 89                    if not found:
 90                        images.append(image["info_list"][0]["url"])
 91        elif item.get("cover"):
 92            images.append(item["cover"]["url_default"])
 93        else:
 94            # no image found;
 95            images = MissingMappedField("")       
 96
 97        # permalinks need this token to work, else you get a 404 not found
 98        xsec_bit = f"?xsec_token={post['xsec_token']}" if post.get("xsec_token") else ""
 99        if item.get("video", {}).get("media"):
100            video_url = item["video"]["media"]["stream"]["h264"][0]["master_url"]
101        else:
102            video_url = MissingMappedField("")
103
104        timestamp = item.get("time", None)
105        return MappedItem({
106            "id": item_id,
107            "thread_id": item_id,
108            "url": f"https://www.xiaohongshu.com/explore/{post['id']}{xsec_bit}",
109            "title": item.get("display_title", ""),
110            "body": item.get("desc", "") if "desc" in item else MissingMappedField(""),
111            "timestamp": datetime.fromtimestamp(timestamp / 1000).strftime("%Y-%m-%d %H:%M:%S") if timestamp else MissingMappedField(""),
112            "author": item["user"]["nickname"],
113            "author_avatar_url": item["user"]["avatar"],
114            "image_urls": ",".join(images) if type(images) is list else images,
115            "video_url": video_url,
116            # only available when loading an individual post page, so skip
117            # "tags": ",".join(t["name"] for t in item["tag_list"]),
118            "likes": item["interact_info"]["liked_count"],
119            # "collects": item["interact_info"]["collected_count"],
120            # "comments": item["interact_info"]["comment_count"],
121            # "shares": item["interact_info"]["share_count"],
122            "unix_timestamp": int(timestamp / 1000) if timestamp else MissingMappedField(""),
123        })
124
125    @staticmethod
126    def map_item_from_json_embedded(item):
127        """
128        Map JSON object from an XHS HTML page
129
130        JSON objects from the HTML are formatted slightly differently, mostly
131        in that they use camelCase instead of underscores, but we can also
132        make a few more assumptions about the data
133
134        :param dict item:
135        :return MappedItem:
136        """
137        note = item["note"]
138        image = note["imageList"][0]["urlDefault"]
139        # permalinks need this token to work, else you get a 404 not found
140        xsec_bit = f"?xsec_token={note['xsecToken']}"
141        timestamp = item.get("time", None)
142
143        return MappedItem({
144            "id": item["id"],
145            "thread_id": item["id"],
146            "url": f"https://www.xiaohongshu.com/explore/{item['id']}{xsec_bit}",
147            "title": note.get("title", ""),
148            "body": note.get("desc", "") if "desc" in note else MissingMappedField(""),
149            "timestamp": datetime.fromtimestamp(timestamp / 1000).strftime("%Y-%m-%d %H:%M:%S") if timestamp else MissingMappedField(""),
150            "author": note["user"]["nickname"],
151            "author_avatar_url": note["user"]["avatar"],
152            "image_url": image,
153            "video_url": MissingMappedField(""),
154            # only available when loading an individual post page, so skip
155            # "tags": ",".join(t["name"] for t in item["tag_list"]),
156            "likes": item["interactInfo"]["likedCount"],
157            # "collects": item["interact_info"]["collected_count"],
158            # "comments": item["interact_info"]["comment_count"],
159            # "shares": item["interact_info"]["share_count"],
160            "unix_timestamp": int(timestamp / 1000) if timestamp else MissingMappedField(""),
161        })
162
163    def map_item_from_html(item):
164        """
165        Map pre-mapped item
166
167        These have been mapped by Zeeschuimer from the page HTML and contain
168        less data than JSON objects (but enough to be useful in some cases).
169
170        :param dict item:
171        :return MappedItem:
172        """
173        return MappedItem({
174            "id": item["id"],
175            "thread_id": item["id"],
176            "url": f"https://www.xiaohongshu.com{item['url']}",
177            "title": item["title"],
178            "body": MissingMappedField(""),
179            "timestamp": MissingMappedField(""),
180            "author": item["author_name"],
181            "author_avatar_url": item["author_avatar_url"],
182            "image_url": item["thumbnail_url"],
183            "video_url": MissingMappedField(""),
184            # "tags": MissingMappedField(""),
185            "likes": item["likes"],
186            # "collects": MissingMappedField(""),
187            # "comments": MissingMappedField(""),
188            # "shares": MissingMappedField(""),
189            "unix_timestamp": MissingMappedField(""),
190        })

Import scraped RedNote/Xiaohongshu/XHS data

type = 'xiaohongshu-search'
category = 'Search'
title = 'Import scraped RedNote data'
description = 'Import RedNote data collected with an external tool such as Zeeschuimer.'
extension = 'ndjson'
is_from_zeeschuimer = True
accepts = [None]
references = ['[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)', '[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)']
def get_items(self, query):
32    def get_items(self, query):
33        """
34        Run custom search
35
36        Not available for RedNote
37        """
38        raise NotImplementedError("RedNote/Xiaohongshu datasets can only be created by importing data from elsewhere")

Run custom search

Not available for RedNote

@staticmethod
def map_item(post):
41    @staticmethod
42    def map_item(post):
43        """
44        Map XSH object to 4CAT item
45
46        Depending on whether the object was captured from JSON or HTML, treat it
47        differently. A lot of data is missing from HTML objects.
48
49        :param post:
50        :return:
51        """
52        if post.get("_zs-origin") == "html":
53            return SearchRedNote.map_item_from_html(post)
54        else:
55            if "note" in post:
56                return SearchRedNote.map_item_from_json_embedded(post)
57            else:
58                return SearchRedNote.map_item_from_json_api_explore(post)

Map XHS object to 4CAT item

Depending on whether the object was captured from JSON or HTML, treat it differently. A lot of data is missing from HTML objects.

Parameters
  • post:
Returns
@staticmethod
def map_item_from_json_api_explore(post):
 60    @staticmethod
 61    def map_item_from_json_api_explore(post):
 62        """
 63        Map API-sourced XSH object to 4CAT item
 64
 65        Most straightforward - JSON objects from the XSH web API, which do
 66        however not always contain the same fields.
 67
 68        :param dict post:
 69        :return MappedItem:
 70        """
 71        item = post["note_card"] if post.get("type") != "video" else post
 72        item_id = post.get("id", post.get("note_id"))
 73
 74        import json
 75
 76        # Images
 77        images = []
 78        if item.get("image_list"):
 79            for image in item["image_list"]:
 80                if "url_default" in image and image["url_default"]:
 81                    images.append(image["url_default"])
 82                elif "info_list" in image and image["info_list"]:
 83                    for img_info in image["info_list"]:
 84                        found = False
 85                        if img_info.get("image_scene") == "WB_DFT":
 86                            images.append(img_info["url"])
 87                            found = True
 88                            break
 89                    if not found:
 90                        images.append(image["info_list"][0]["url"])
 91        elif item.get("cover"):
 92            images.append(item["cover"]["url_default"])
 93        else:
 94            # no image found;
 95            images = MissingMappedField("")       
 96
 97        # permalinks need this token to work, else you get a 404 not found
 98        xsec_bit = f"?xsec_token={post['xsec_token']}" if post.get("xsec_token") else ""
 99        if item.get("video", {}).get("media"):
100            video_url = item["video"]["media"]["stream"]["h264"][0]["master_url"]
101        else:
102            video_url = MissingMappedField("")
103
104        timestamp = item.get("time", None)
105        return MappedItem({
106            "id": item_id,
107            "thread_id": item_id,
108            "url": f"https://www.xiaohongshu.com/explore/{post['id']}{xsec_bit}",
109            "title": item.get("display_title", ""),
110            "body": item.get("desc", "") if "desc" in item else MissingMappedField(""),
111            "timestamp": datetime.fromtimestamp(timestamp / 1000).strftime("%Y-%m-%d %H:%M:%S") if timestamp else MissingMappedField(""),
112            "author": item["user"]["nickname"],
113            "author_avatar_url": item["user"]["avatar"],
114            "image_urls": ",".join(images) if type(images) is list else images,
115            "video_url": video_url,
116            # only available when loading an individual post page, so skip
117            # "tags": ",".join(t["name"] for t in item["tag_list"]),
118            "likes": item["interact_info"]["liked_count"],
119            # "collects": item["interact_info"]["collected_count"],
120            # "comments": item["interact_info"]["comment_count"],
121            # "shares": item["interact_info"]["share_count"],
122            "unix_timestamp": int(timestamp / 1000) if timestamp else MissingMappedField(""),
123        })

Map API-sourced XHS object to 4CAT item

Most straightforward: JSON objects from the XHS web API, which do not, however, always contain the same fields.

Parameters
  • dict post:
Returns
@staticmethod
def map_item_from_json_embedded(item):
125    @staticmethod
126    def map_item_from_json_embedded(item):
127        """
128        Map JSON object from an XHS HTML page
129
130        JSON objects from the HTML are formatted slightly differently, mostly
131        in that they use camelCase instead of underscores, but we can also
132        make a few more assumptions about the data
133
134        :param dict item:
135        :return MappedItem:
136        """
137        note = item["note"]
138        image = note["imageList"][0]["urlDefault"]
139        # permalinks need this token to work, else you get a 404 not found
140        xsec_bit = f"?xsec_token={note['xsecToken']}"
141        timestamp = item.get("time", None)
142
143        return MappedItem({
144            "id": item["id"],
145            "thread_id": item["id"],
146            "url": f"https://www.xiaohongshu.com/explore/{item['id']}{xsec_bit}",
147            "title": note.get("title", ""),
148            "body": note.get("desc", "") if "desc" in note else MissingMappedField(""),
149            "timestamp": datetime.fromtimestamp(timestamp / 1000).strftime("%Y-%m-%d %H:%M:%S") if timestamp else MissingMappedField(""),
150            "author": note["user"]["nickname"],
151            "author_avatar_url": note["user"]["avatar"],
152            "image_url": image,
153            "video_url": MissingMappedField(""),
154            # only available when loading an individual post page, so skip
155            # "tags": ",".join(t["name"] for t in item["tag_list"]),
156            "likes": item["interactInfo"]["likedCount"],
157            # "collects": item["interact_info"]["collected_count"],
158            # "comments": item["interact_info"]["comment_count"],
159            # "shares": item["interact_info"]["share_count"],
160            "unix_timestamp": int(timestamp / 1000) if timestamp else MissingMappedField(""),
161        })

Map JSON object from an XHS HTML page

JSON objects from the HTML are formatted slightly differently, mostly in that they use camelCase instead of underscores, but we can also make a few more assumptions about the data.

Parameters
  • dict item:
Returns
def map_item_from_html(item):
163    def map_item_from_html(item):
164        """
165        Map pre-mapped item
166
167        These have been mapped by Zeeschuimer from the page HTML and contain
168        less data than JSON objects (but enough to be useful in some cases).
169
170        :param dict item:
171        :return MappedItem:
172        """
173        return MappedItem({
174            "id": item["id"],
175            "thread_id": item["id"],
176            "url": f"https://www.xiaohongshu.com{item['url']}",
177            "title": item["title"],
178            "body": MissingMappedField(""),
179            "timestamp": MissingMappedField(""),
180            "author": item["author_name"],
181            "author_avatar_url": item["author_avatar_url"],
182            "image_url": item["thumbnail_url"],
183            "video_url": MissingMappedField(""),
184            # "tags": MissingMappedField(""),
185            "likes": item["likes"],
186            # "collects": MissingMappedField(""),
187            # "comments": MissingMappedField(""),
188            # "shares": MissingMappedField(""),
189            "unix_timestamp": MissingMappedField(""),
190        })

Map pre-mapped item

These have been mapped by Zeeschuimer from the page HTML and contain less data than JSON objects (but enough to be useful in some cases).

Parameters
  • dict item:
Returns