datasources.xiaohongshu.search_rednote
Import scraped RedNote data
It's prohibitively difficult to scrape data from RedNote within 4CAT itself due to its aggressive rate limiting. Instead, import data collected elsewhere.
"""
Import scraped RedNote data

It's prohibitively difficult to scrape data from RedNote within 4CAT itself due
to its aggressive rate limiting. Instead, import data collected elsewhere.
"""
from datetime import datetime

from backend.lib.search import Search
from common.lib.item_mapping import MappedItem, MissingMappedField


class SearchRedNote(Search):
    """
    Import scraped RedNote/Xiaohongshu/XHS data

    This data source cannot run searches itself (see `get_items`); it only
    maps externally captured objects to 4CAT items via `map_item`.
    """
    type = "xiaohongshu-search"  # job ID
    category = "Search"  # category
    title = "Import scraped RedNote data"  # title displayed in UI
    description = "Import RedNote data collected with an external tool such as Zeeschuimer."  # description displayed in UI
    extension = "ndjson"  # extension of result file, used internally and in UI
    is_from_zeeschuimer = True

    # not available as a processor for existing datasets
    accepts = [None]
    references = [
        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)"
    ]

    def get_items(self, query):
        """
        Run custom search

        Not available for RedNote; data can only be imported.

        :param query:  Search query (unused)
        :raises NotImplementedError:  Always
        """
        raise NotImplementedError("RedNote/Xiaohongshu datasets can only be created by importing data from elsewhere")

    @staticmethod
    def map_item(post):
        """
        Map XHS object to 4CAT item

        Depending on whether the object was captured from JSON or HTML, treat
        it differently. A lot of data is missing from HTML objects.

        :param dict post:  Captured object
        :return MappedItem:  Mapped item
        """
        if post.get("_zs-origin") == "html":
            mapper = SearchRedNote.map_item_from_html
        elif "note" in post:
            # JSON that was embedded in a page nests its data under "note"
            mapper = SearchRedNote.map_item_from_json_embedded
        else:
            mapper = SearchRedNote.map_item_from_json_api_explore

        return mapper(post)

    @staticmethod
    def map_item_from_json_api_explore(post):
        """
        Map API-sourced XHS object to 4CAT item

        JSON objects from the web API do not always contain the same fields,
        so optional fields are mapped to a `MissingMappedField` when absent.

        :param dict post:  Captured object. Unless its `type` is `video`, the
          note data is read from `post["note_card"]`; otherwise from `post`
          itself.
        :return MappedItem:  Mapped item
        :raises KeyError:  If a required field (e.g. `note_card`, `user` or
          `interact_info`) is absent
        """
        item = post["note_card"] if post.get("type") != "video" else post
        # some objects carry their identifier as "note_id" instead of "id"
        item_id = post.get("id", post.get("note_id"))

        # Images
        images = []
        if item.get("image_list"):
            for image in item["image_list"]:
                if image.get("url_default"):
                    images.append(image["url_default"])
                elif image.get("info_list"):
                    # prefer the entry whose scene is "WB_DFT"; if there is
                    # none, fall back to the first entry
                    for img_info in image["info_list"]:
                        if img_info.get("image_scene") == "WB_DFT":
                            images.append(img_info["url"])
                            break
                    else:
                        images.append(image["info_list"][0]["url"])
        elif item.get("cover"):
            images.append(item["cover"]["url_default"])
        else:
            # no image found
            images = MissingMappedField("")

        # permalinks need this token to work, else you get a 404 not found
        xsec_bit = f"?xsec_token={post['xsec_token']}" if post.get("xsec_token") else ""
        if item.get("video", {}).get("media"):
            video_url = item["video"]["media"]["stream"]["h264"][0]["master_url"]
        else:
            video_url = MissingMappedField("")

        # divided by 1000 below, so presumably a millisecond epoch timestamp
        timestamp = item.get("time", None)
        return MappedItem({
            "id": item_id,
            "thread_id": item_id,
            # use item_id so that objects that only have "note_id" also work
            "url": f"https://www.xiaohongshu.com/explore/{item_id}{xsec_bit}",
            "title": item.get("display_title", ""),
            "body": item["desc"] if "desc" in item else MissingMappedField(""),
            "timestamp": datetime.fromtimestamp(timestamp / 1000).strftime("%Y-%m-%d %H:%M:%S") if timestamp else MissingMappedField(""),
            "author": item["user"]["nickname"],
            "author_avatar_url": item["user"]["avatar"],
            "image_urls": ",".join(images) if isinstance(images, list) else images,
            "video_url": video_url,
            # only available when loading an individual post page, so skip
            # "tags": ",".join(t["name"] for t in item["tag_list"]),
            "likes": item["interact_info"]["liked_count"],
            # "collects": item["interact_info"]["collected_count"],
            # "comments": item["interact_info"]["comment_count"],
            # "shares": item["interact_info"]["share_count"],
            "unix_timestamp": int(timestamp / 1000) if timestamp else MissingMappedField(""),
        })

    @staticmethod
    def map_item_from_json_embedded(item):
        """
        Map JSON object from an XHS HTML page

        JSON objects from the HTML are formatted slightly differently, mostly
        in that they use camelCase instead of underscores, but we can also
        make a few more assumptions about the data.

        :param dict item:  Captured object with its data nested under `note`
        :return MappedItem:  Mapped item
        """
        note = item["note"]
        image = note["imageList"][0]["urlDefault"]
        # permalinks need this token to work, else you get a 404 not found
        xsec_bit = f"?xsec_token={note['xsecToken']}"
        # NOTE(review): "time" is read from the top-level object, not from
        # `note` - confirm that is where embedded objects store it
        timestamp = item.get("time", None)

        return MappedItem({
            "id": item["id"],
            "thread_id": item["id"],
            "url": f"https://www.xiaohongshu.com/explore/{item['id']}{xsec_bit}",
            "title": note.get("title", ""),
            "body": note.get("desc", "") if "desc" in note else MissingMappedField(""),
            "timestamp": datetime.fromtimestamp(timestamp / 1000).strftime("%Y-%m-%d %H:%M:%S") if timestamp else MissingMappedField(""),
            "author": note["user"]["nickname"],
            "author_avatar_url": note["user"]["avatar"],
            "image_url": image,
            "video_url": MissingMappedField(""),
            # only available when loading an individual post page, so skip
            # "tags": ",".join(t["name"] for t in item["tag_list"]),
            "likes": item["interactInfo"]["likedCount"],
            # "collects": item["interact_info"]["collected_count"],
            # "comments": item["interact_info"]["comment_count"],
            # "shares": item["interact_info"]["share_count"],
            "unix_timestamp": int(timestamp / 1000) if timestamp else MissingMappedField(""),
        })

    @staticmethod
    def map_item_from_html(item):
        """
        Map pre-mapped item

        These have been mapped by Zeeschuimer from the page HTML and contain
        less data than JSON objects (but enough to be useful in some cases).

        :param dict item:  Pre-mapped object
        :return MappedItem:  Mapped item
        """
        return MappedItem({
            "id": item["id"],
            "thread_id": item["id"],
            "url": f"https://www.xiaohongshu.com{item['url']}",
            "title": item["title"],
            "body": MissingMappedField(""),
            "timestamp": MissingMappedField(""),
            "author": item["author_name"],
            "author_avatar_url": item["author_avatar_url"],
            "image_url": item["thumbnail_url"],
            "video_url": MissingMappedField(""),
            # "tags": MissingMappedField(""),
            "likes": item["likes"],
            # "collects": MissingMappedField(""),
            # "comments": MissingMappedField(""),
            # "shares": MissingMappedField(""),
            "unix_timestamp": MissingMappedField(""),
        })
class SearchRedNote(Search):
    """
    Import scraped RedNote/Xiaohongshu/XHS data

    This data source cannot run searches itself (see `get_items`); it only
    maps externally captured objects to 4CAT items via `map_item`.
    """
    type = "xiaohongshu-search"  # job ID
    category = "Search"  # category
    title = "Import scraped RedNote data"  # title displayed in UI
    description = "Import RedNote data collected with an external tool such as Zeeschuimer."  # description displayed in UI
    extension = "ndjson"  # extension of result file, used internally and in UI
    is_from_zeeschuimer = True

    # not available as a processor for existing datasets
    accepts = [None]
    references = [
        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)"
    ]

    def get_items(self, query):
        """
        Run custom search

        Not available for RedNote; data can only be imported.

        :param query:  Search query (unused)
        :raises NotImplementedError:  Always
        """
        raise NotImplementedError("RedNote/Xiaohongshu datasets can only be created by importing data from elsewhere")

    @staticmethod
    def map_item(post):
        """
        Map XHS object to 4CAT item

        Depending on whether the object was captured from JSON or HTML, treat
        it differently. A lot of data is missing from HTML objects.

        :param dict post:  Captured object
        :return MappedItem:  Mapped item
        """
        if post.get("_zs-origin") == "html":
            mapper = SearchRedNote.map_item_from_html
        elif "note" in post:
            # JSON that was embedded in a page nests its data under "note"
            mapper = SearchRedNote.map_item_from_json_embedded
        else:
            mapper = SearchRedNote.map_item_from_json_api_explore

        return mapper(post)

    @staticmethod
    def map_item_from_json_api_explore(post):
        """
        Map API-sourced XHS object to 4CAT item

        JSON objects from the web API do not always contain the same fields,
        so optional fields are mapped to a `MissingMappedField` when absent.

        :param dict post:  Captured object. Unless its `type` is `video`, the
          note data is read from `post["note_card"]`; otherwise from `post`
          itself.
        :return MappedItem:  Mapped item
        :raises KeyError:  If a required field (e.g. `note_card`, `user` or
          `interact_info`) is absent
        """
        item = post["note_card"] if post.get("type") != "video" else post
        # some objects carry their identifier as "note_id" instead of "id"
        item_id = post.get("id", post.get("note_id"))

        # Images
        images = []
        if item.get("image_list"):
            for image in item["image_list"]:
                if image.get("url_default"):
                    images.append(image["url_default"])
                elif image.get("info_list"):
                    # prefer the entry whose scene is "WB_DFT"; if there is
                    # none, fall back to the first entry
                    for img_info in image["info_list"]:
                        if img_info.get("image_scene") == "WB_DFT":
                            images.append(img_info["url"])
                            break
                    else:
                        images.append(image["info_list"][0]["url"])
        elif item.get("cover"):
            images.append(item["cover"]["url_default"])
        else:
            # no image found
            images = MissingMappedField("")

        # permalinks need this token to work, else you get a 404 not found
        xsec_bit = f"?xsec_token={post['xsec_token']}" if post.get("xsec_token") else ""
        if item.get("video", {}).get("media"):
            video_url = item["video"]["media"]["stream"]["h264"][0]["master_url"]
        else:
            video_url = MissingMappedField("")

        # divided by 1000 below, so presumably a millisecond epoch timestamp
        timestamp = item.get("time", None)
        return MappedItem({
            "id": item_id,
            "thread_id": item_id,
            # use item_id so that objects that only have "note_id" also work
            "url": f"https://www.xiaohongshu.com/explore/{item_id}{xsec_bit}",
            "title": item.get("display_title", ""),
            "body": item["desc"] if "desc" in item else MissingMappedField(""),
            "timestamp": datetime.fromtimestamp(timestamp / 1000).strftime("%Y-%m-%d %H:%M:%S") if timestamp else MissingMappedField(""),
            "author": item["user"]["nickname"],
            "author_avatar_url": item["user"]["avatar"],
            "image_urls": ",".join(images) if isinstance(images, list) else images,
            "video_url": video_url,
            # only available when loading an individual post page, so skip
            # "tags": ",".join(t["name"] for t in item["tag_list"]),
            "likes": item["interact_info"]["liked_count"],
            # "collects": item["interact_info"]["collected_count"],
            # "comments": item["interact_info"]["comment_count"],
            # "shares": item["interact_info"]["share_count"],
            "unix_timestamp": int(timestamp / 1000) if timestamp else MissingMappedField(""),
        })

    @staticmethod
    def map_item_from_json_embedded(item):
        """
        Map JSON object from an XHS HTML page

        JSON objects from the HTML are formatted slightly differently, mostly
        in that they use camelCase instead of underscores, but we can also
        make a few more assumptions about the data.

        :param dict item:  Captured object with its data nested under `note`
        :return MappedItem:  Mapped item
        """
        note = item["note"]
        image = note["imageList"][0]["urlDefault"]
        # permalinks need this token to work, else you get a 404 not found
        xsec_bit = f"?xsec_token={note['xsecToken']}"
        # NOTE(review): "time" is read from the top-level object, not from
        # `note` - confirm that is where embedded objects store it
        timestamp = item.get("time", None)

        return MappedItem({
            "id": item["id"],
            "thread_id": item["id"],
            "url": f"https://www.xiaohongshu.com/explore/{item['id']}{xsec_bit}",
            "title": note.get("title", ""),
            "body": note.get("desc", "") if "desc" in note else MissingMappedField(""),
            "timestamp": datetime.fromtimestamp(timestamp / 1000).strftime("%Y-%m-%d %H:%M:%S") if timestamp else MissingMappedField(""),
            "author": note["user"]["nickname"],
            "author_avatar_url": note["user"]["avatar"],
            "image_url": image,
            "video_url": MissingMappedField(""),
            # only available when loading an individual post page, so skip
            # "tags": ",".join(t["name"] for t in item["tag_list"]),
            "likes": item["interactInfo"]["likedCount"],
            # "collects": item["interact_info"]["collected_count"],
            # "comments": item["interact_info"]["comment_count"],
            # "shares": item["interact_info"]["share_count"],
            "unix_timestamp": int(timestamp / 1000) if timestamp else MissingMappedField(""),
        })

    @staticmethod
    def map_item_from_html(item):
        """
        Map pre-mapped item

        These have been mapped by Zeeschuimer from the page HTML and contain
        less data than JSON objects (but enough to be useful in some cases).

        :param dict item:  Pre-mapped object
        :return MappedItem:  Mapped item
        """
        return MappedItem({
            "id": item["id"],
            "thread_id": item["id"],
            "url": f"https://www.xiaohongshu.com{item['url']}",
            "title": item["title"],
            "body": MissingMappedField(""),
            "timestamp": MissingMappedField(""),
            "author": item["author_name"],
            "author_avatar_url": item["author_avatar_url"],
            "image_url": item["thumbnail_url"],
            "video_url": MissingMappedField(""),
            # "tags": MissingMappedField(""),
            "likes": item["likes"],
            # "collects": MissingMappedField(""),
            # "comments": MissingMappedField(""),
            # "shares": MissingMappedField(""),
            "unix_timestamp": MissingMappedField(""),
        })
Import scraped RedNote/Xiaohongshu/XHS data
def get_items(self, query):
    """
    Refuse to run a search

    This data source has no search of its own, so calling this method
    always fails with an explanatory error.

    :param query:  Search query (unused)
    :raises NotImplementedError:  Always
    """
    reason = "RedNote/Xiaohongshu datasets can only be created by importing data from elsewhere"
    raise NotImplementedError(reason)
Run custom search
Not available for RedNote
@staticmethod
def map_item(post):
    """
    Convert a captured object into a 4CAT item

    Picks the mapper that matches how the object was captured: objects
    whose `_zs-origin` is `html` go to the HTML mapper, objects with a
    `note` key to the embedded-JSON mapper, and everything else to the
    API mapper. A lot of data is missing from HTML objects.

    :param dict post:  Captured object
    :return MappedItem:  Mapped item
    """
    if post.get("_zs-origin") == "html":
        mapper = SearchRedNote.map_item_from_html
    elif "note" in post:
        mapper = SearchRedNote.map_item_from_json_embedded
    else:
        mapper = SearchRedNote.map_item_from_json_api_explore

    return mapper(post)
Map XHS object to 4CAT item
Depending on whether the object was captured from JSON or HTML, treat it differently. A lot of data is missing from HTML objects.
Parameters
- post:
Returns
@staticmethod
def map_item_from_json_api_explore(post):
    """
    Map API-sourced XHS object to 4CAT item

    JSON objects from the web API do not always contain the same fields,
    so optional fields are mapped to a `MissingMappedField` when absent.

    :param dict post:  Captured object. Unless its `type` is `video`, the
      note data is read from `post["note_card"]`; otherwise from `post`
      itself.
    :return MappedItem:  Mapped item
    :raises KeyError:  If a required field (e.g. `note_card`, `user` or
      `interact_info`) is absent
    """
    item = post["note_card"] if post.get("type") != "video" else post
    # some objects carry their identifier as "note_id" instead of "id"
    item_id = post.get("id", post.get("note_id"))

    # Images
    images = []
    if item.get("image_list"):
        for image in item["image_list"]:
            if image.get("url_default"):
                images.append(image["url_default"])
            elif image.get("info_list"):
                # prefer the entry whose scene is "WB_DFT"; if there is
                # none, fall back to the first entry
                for img_info in image["info_list"]:
                    if img_info.get("image_scene") == "WB_DFT":
                        images.append(img_info["url"])
                        break
                else:
                    images.append(image["info_list"][0]["url"])
    elif item.get("cover"):
        images.append(item["cover"]["url_default"])
    else:
        # no image found
        images = MissingMappedField("")

    # permalinks need this token to work, else you get a 404 not found
    xsec_bit = f"?xsec_token={post['xsec_token']}" if post.get("xsec_token") else ""
    if item.get("video", {}).get("media"):
        video_url = item["video"]["media"]["stream"]["h264"][0]["master_url"]
    else:
        video_url = MissingMappedField("")

    # divided by 1000 below, so presumably a millisecond epoch timestamp
    timestamp = item.get("time", None)
    return MappedItem({
        "id": item_id,
        "thread_id": item_id,
        # use item_id so that objects that only have "note_id" also work
        "url": f"https://www.xiaohongshu.com/explore/{item_id}{xsec_bit}",
        "title": item.get("display_title", ""),
        "body": item["desc"] if "desc" in item else MissingMappedField(""),
        "timestamp": datetime.fromtimestamp(timestamp / 1000).strftime("%Y-%m-%d %H:%M:%S") if timestamp else MissingMappedField(""),
        "author": item["user"]["nickname"],
        "author_avatar_url": item["user"]["avatar"],
        "image_urls": ",".join(images) if isinstance(images, list) else images,
        "video_url": video_url,
        # only available when loading an individual post page, so skip
        # "tags": ",".join(t["name"] for t in item["tag_list"]),
        "likes": item["interact_info"]["liked_count"],
        # "collects": item["interact_info"]["collected_count"],
        # "comments": item["interact_info"]["comment_count"],
        # "shares": item["interact_info"]["share_count"],
        "unix_timestamp": int(timestamp / 1000) if timestamp else MissingMappedField(""),
    })
Map API-sourced XHS object to 4CAT item
Most straightforward - JSON objects from the XHS web API, which, however, do not always contain the same fields.
Parameters
- dict post:
Returns
@staticmethod
def map_item_from_json_embedded(item):
    """
    Map a JSON object that was embedded in an XHS HTML page

    These objects use camelCase keys rather than underscores and nest
    most of their data under `note`. Only the first image is mapped.

    :param dict item:  Captured object
    :return MappedItem:  Mapped item
    """
    note = item["note"]
    first_image_url = note["imageList"][0]["urlDefault"]
    # without this token the permalink yields a 404 not found
    token_suffix = "?xsec_token={}".format(note["xsecToken"])
    # NOTE(review): "time" is read from the top-level object, not from
    # `note` - confirm that is where embedded objects store it
    created = item.get("time")

    post_id = item["id"]
    permalink = f"https://www.xiaohongshu.com/explore/{post_id}{token_suffix}"
    title = note.get("title", "")

    if "desc" in note:
        body = note["desc"]
    else:
        body = MissingMappedField("")

    # divided by 1000, so presumably a millisecond epoch timestamp
    if created:
        readable_time = datetime.fromtimestamp(created / 1000).strftime("%Y-%m-%d %H:%M:%S")
    else:
        readable_time = MissingMappedField("")

    author = note["user"]
    nickname = author["nickname"]
    avatar = author["avatar"]
    likes = item["interactInfo"]["likedCount"]

    if created:
        unix_time = int(created / 1000)
    else:
        unix_time = MissingMappedField("")

    mapped = {
        "id": post_id,
        "thread_id": post_id,
        "url": permalink,
        "title": title,
        "body": body,
        "timestamp": readable_time,
        "author": nickname,
        "author_avatar_url": avatar,
        "image_url": first_image_url,
        "video_url": MissingMappedField(""),
        # tags, collects, comments and shares are not mapped for this format
        "likes": likes,
        "unix_timestamp": unix_time,
    }
    return MappedItem(mapped)
Map JSON object from an XHS HTML page
JSON objects from the HTML are formatted slightly differently, mostly in that they use camelCase instead of underscores, but we can also make a few more assumptions about the data
Parameters
- dict item:
Returns
@staticmethod
def map_item_from_html(item):
    """
    Map pre-mapped item

    These have been mapped by Zeeschuimer from the page HTML and contain
    less data than JSON objects (but enough to be useful in some cases).
    Body, timestamps and video URL are not available and are marked as
    missing.

    Declared as a static method, like the other mappers, so that it can be
    called on the class as well as on an instance.

    :param dict item:  Pre-mapped object
    :return MappedItem:  Mapped item
    :raises KeyError:  If one of the expected fields is absent
    """
    return MappedItem({
        "id": item["id"],
        "thread_id": item["id"],
        # the captured URL is appended directly to the site origin
        "url": f"https://www.xiaohongshu.com{item['url']}",
        "title": item["title"],
        "body": MissingMappedField(""),
        "timestamp": MissingMappedField(""),
        "author": item["author_name"],
        "author_avatar_url": item["author_avatar_url"],
        "image_url": item["thumbnail_url"],
        "video_url": MissingMappedField(""),
        # "tags": MissingMappedField(""),
        "likes": item["likes"],
        # "collects": MissingMappedField(""),
        # "comments": MissingMappedField(""),
        # "shares": MissingMappedField(""),
        "unix_timestamp": MissingMappedField(""),
    })
Map pre-mapped item
These have been mapped by Zeeschuimer from the page HTML and contain less data than JSON objects (but enough to be useful in some cases).
Parameters
- dict item:
Returns
Inherited Members
- backend.lib.worker.BasicWorker
- BasicWorker
- INTERRUPT_NONE
- INTERRUPT_RETRY
- INTERRUPT_CANCEL
- queue
- log
- manager
- interrupted
- modules
- init_time
- name
- run
- clean_up
- request_interrupt
- is_4cat_class
- backend.lib.search.Search
- max_workers
- prefix
- return_cols
- import_error_count
- import_warning_count
- process
- search
- import_from_file
- items_to_csv
- items_to_ndjson
- items_to_archive
- backend.lib.processor.BasicProcessor
- db
- job
- dataset
- owner
- source_dataset
- source_file
- config
- is_running_in_preset
- filepath
- work
- after_process
- remove_files
- abort
- add_field_to_parent
- iterate_archive_contents
- unpack_archive_contents
- extract_archived_file_by_name
- write_csv_items_and_finish
- write_archive_and_finish
- create_standalone
- map_item_method_available
- get_mapped_item
- is_filter
- get_options
- get_status
- is_top_dataset
- is_from_collector
- get_extension
- is_rankable
- exclude_followup_processors
- is_4cat_processor