datasources.xiaohongshu.search_rednote
Import scraped RedNote data
It's prohibitively difficult to scrape data from RedNote within 4CAT itself due to its aggressive rate limiting. Instead, import data collected elsewhere.
"""
Import scraped RedNote data

It's prohibitively difficult to scrape data from RedNote within 4CAT itself due
to its aggressive rate limiting. Instead, import data collected elsewhere.
"""
from datetime import datetime

from backend.lib.search import Search
from common.lib.item_mapping import MappedItem, MissingMappedField


class SearchRedNote(Search):
    """
    Import scraped RedNote/Xiaohongshu (XHS) data

    Searching is not implemented; the class only maps imported objects to
    4CAT items via `map_item()`.
    """
    type = "xiaohongshu-search"  # job ID
    category = "Search"  # category
    title = "Import scraped RedNote data"  # title displayed in UI
    description = "Import RedNote data collected with an external tool such as Zeeschuimer."  # description displayed in UI
    extension = "ndjson"  # extension of result file, used internally and in UI
    is_from_zeeschuimer = True

    # not available as a processor for existing datasets
    accepts = [None]
    # NOTE(review): the worksheet linked below is about capturing TikTok
    # data - confirm it is the intended reference for this data source
    references = [
        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)"
    ]

    def get_items(self, query):
        """
        Refuse to run a search

        RedNote data is imported rather than collected by 4CAT, so this
        method unconditionally raises.

        :param query:  Search parameters; not used
        :raises NotImplementedError:  Always
        """
        message = "RedNote/Xiaohongshu datasets can only be created by importing data from elsewhere"
        raise NotImplementedError(message)

    @staticmethod
    def map_item(post):
        """
        Map a captured Xiaohongshu (XHS) object to a 4CAT item

        Picks the mapper matching the way the object was captured: objects
        marked with `_zs-origin` = `html` go to the HTML mapper, objects with
        a `note` key to the embedded-JSON mapper, and everything else to the
        web API mapper. A lot of data is missing from HTML objects.

        :param dict post:  Captured object
        :return MappedItem:  Whatever the selected mapper returns
        """
        if post.get("_zs-origin") == "html":
            mapper = SearchRedNote.map_item_from_html
        elif "note" in post:
            mapper = SearchRedNote.map_item_from_json_embedded
        else:
            mapper = SearchRedNote.map_item_from_json_api_explore

        return mapper(post)

    @staticmethod
    def map_item_from_json_api_explore(post):
        """
        Map an object captured from the Xiaohongshu (XHS) web API to a 4CAT item

        These objects do not always contain the same fields, so several
        values fall back to a `MissingMappedField` when absent. The `time`
        field is divided by 1000 before conversion (i.e. handled as
        milliseconds) and formatted with `datetime.fromtimestamp()`, which
        uses the local timezone of the machine running 4CAT.

        :param dict post:  Object as captured from the web API
        :return MappedItem:  Mapped item
        :raises KeyError:  If a required field (e.g. `user`, `interact_info`)
        is absent
        """
        # objects of type "video" are used as-is; for all others the fields
        # are read from the nested "note_card" object
        item = post["note_card"] if post.get("type") != "video" else post
        # the ID is read from "id", falling back to "note_id"
        item_id = post.get("id", post.get("note_id"))

        # Images
        if item.get("image_list"):
            images = []
            for image in item["image_list"]:
                if image.get("url_default"):
                    images.append(image["url_default"])
                elif image.get("info_list"):
                    # prefer the "WB_DFT" rendition; when there is none, use
                    # the first rendition listed
                    for img_info in image["info_list"]:
                        if img_info.get("image_scene") == "WB_DFT":
                            images.append(img_info["url"])
                            break
                    else:
                        images.append(image["info_list"][0]["url"])
        elif item.get("cover"):
            images = [item["cover"]["url_default"]]
        else:
            # no image found
            images = MissingMappedField("")

        # permalinks need this token to work, else you get a 404 not found
        xsec_bit = f"?xsec_token={post['xsec_token']}" if post.get("xsec_token") else ""

        # tolerate a null "video" value and a missing or empty h264 stream
        # list instead of raising
        media = (item.get("video") or {}).get("media") or {}
        h264_streams = (media.get("stream") or {}).get("h264") or []
        video_url = h264_streams[0]["master_url"] if h264_streams else MissingMappedField("")

        timestamp = item.get("time", None)
        return MappedItem({
            "id": item_id,
            "thread_id": item_id,
            # use the resolved ID so objects that only have "note_id" do not
            # raise a KeyError here
            "url": f"https://www.xiaohongshu.com/explore/{item_id}{xsec_bit}",
            "title": item.get("display_title", ""),
            "body": item["desc"] if "desc" in item else MissingMappedField(""),
            "timestamp": datetime.fromtimestamp(timestamp / 1000).strftime("%Y-%m-%d %H:%M:%S") if timestamp else MissingMappedField(""),
            "author": item["user"]["nickname"],
            "author_avatar_url": item["user"]["avatar"],
            "image_urls": ",".join(images) if isinstance(images, list) else images,
            "video_url": video_url,
            # only available when loading an individual post page, so skip
            # "tags": ",".join(t["name"] for t in item["tag_list"]),
            "likes": item["interact_info"]["liked_count"],
            # "collects": item["interact_info"]["collected_count"],
            # "comments": item["interact_info"]["comment_count"],
            # "shares": item["interact_info"]["share_count"],
            "unix_timestamp": int(timestamp / 1000) if timestamp else MissingMappedField(""),
        })

    @staticmethod
    def map_item_from_json_embedded(item):
        """
        Map a JSON object embedded in an XHS HTML page to a 4CAT item

        These objects use camelCase keys rather than underscores. More is
        assumed about their contents than for API objects: the first entry
        of `imageList`, the `xsecToken`, the `user` and `interactInfo`
        objects are all read without a fallback.

        :param dict item:  Object with a nested `note`
        :return MappedItem:  Mapped item
        """
        note = item["note"]
        first_image = note["imageList"][0]["urlDefault"]
        # permalinks need this token to work, else you get a 404 not found
        token_suffix = f"?xsec_token={note['xsecToken']}"
        # NOTE(review): the timestamp is read from the outer object, not from
        # the note - confirm that is where "time" lives
        timestamp = item.get("time", None)

        post_id = item["id"]
        body = note["desc"] if "desc" in note else MissingMappedField("")

        if timestamp:
            # "time" is divided by 1000 before conversion
            seconds = timestamp / 1000
            created_at = datetime.fromtimestamp(seconds).strftime("%Y-%m-%d %H:%M:%S")
            unix_timestamp = int(seconds)
        else:
            created_at = MissingMappedField("")
            unix_timestamp = MissingMappedField("")

        author = note["user"]

        return MappedItem({
            "id": post_id,
            "thread_id": post_id,
            "url": f"https://www.xiaohongshu.com/explore/{post_id}{token_suffix}",
            "title": note.get("title", ""),
            "body": body,
            "timestamp": created_at,
            "author": author["nickname"],
            "author_avatar_url": author["avatar"],
            # NOTE(review): the API mapper emits "image_urls" (plural) while
            # this key is singular - confirm the difference is intended
            "image_url": first_image,
            "video_url": MissingMappedField(""),
            # only available when loading an individual post page, so skip
            # "tags": ",".join(t["name"] for t in item["tag_list"]),
            "likes": item["interactInfo"]["likedCount"],
            # "collects": item["interact_info"]["collected_count"],
            # "comments": item["interact_info"]["comment_count"],
            # "shares": item["interact_info"]["share_count"],
            "unix_timestamp": unix_timestamp,
        })

    @staticmethod
    def map_item_from_html(item):
        """
        Map a pre-mapped item

        These have been mapped by Zeeschuimer from the page HTML and contain
        less data than JSON objects (but enough to be useful in some cases).
        Body, timestamps and video URL are always marked as missing.

        :param dict item:  Pre-mapped object
        :return MappedItem:  Mapped item
        """
        return MappedItem({
            "id": item["id"],
            "thread_id": item["id"],
            # "url" is appended directly to the site root
            "url": f"https://www.xiaohongshu.com{item['url']}",
            "title": item["title"],
            "body": MissingMappedField(""),
            "timestamp": MissingMappedField(""),
            "author": item["author_name"],
            "author_avatar_url": item["author_avatar_url"],
            "image_url": item["thumbnail_url"],
            "video_url": MissingMappedField(""),
            # "tags": MissingMappedField(""),
            "likes": item["likes"],
            # "collects": MissingMappedField(""),
            # "comments": MissingMappedField(""),
            # "shares": MissingMappedField(""),
            "unix_timestamp": MissingMappedField(""),
        })
class SearchRedNote(Search):
    """
    Import scraped RedNote/Xiaohongshu (XHS) data

    Searching is not implemented; the class only maps imported objects to
    4CAT items via `map_item()`.
    """
    type = "xiaohongshu-search"  # job ID
    category = "Search"  # category
    title = "Import scraped RedNote data"  # title displayed in UI
    description = "Import RedNote data collected with an external tool such as Zeeschuimer."  # description displayed in UI
    extension = "ndjson"  # extension of result file, used internally and in UI
    is_from_zeeschuimer = True

    # not available as a processor for existing datasets
    accepts = [None]
    # NOTE(review): the worksheet linked below is about capturing TikTok
    # data - confirm it is the intended reference for this data source
    references = [
        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)"
    ]

    def get_items(self, query):
        """
        Refuse to run a search

        RedNote data is imported rather than collected by 4CAT, so this
        method unconditionally raises.

        :param query:  Search parameters; not used
        :raises NotImplementedError:  Always
        """
        message = "RedNote/Xiaohongshu datasets can only be created by importing data from elsewhere"
        raise NotImplementedError(message)

    @staticmethod
    def map_item(post):
        """
        Map a captured Xiaohongshu (XHS) object to a 4CAT item

        Picks the mapper matching the way the object was captured: objects
        marked with `_zs-origin` = `html` go to the HTML mapper, objects with
        a `note` key to the embedded-JSON mapper, and everything else to the
        web API mapper. A lot of data is missing from HTML objects.

        :param dict post:  Captured object
        :return MappedItem:  Whatever the selected mapper returns
        """
        if post.get("_zs-origin") == "html":
            mapper = SearchRedNote.map_item_from_html
        elif "note" in post:
            mapper = SearchRedNote.map_item_from_json_embedded
        else:
            mapper = SearchRedNote.map_item_from_json_api_explore

        return mapper(post)

    @staticmethod
    def map_item_from_json_api_explore(post):
        """
        Map an object captured from the Xiaohongshu (XHS) web API to a 4CAT item

        These objects do not always contain the same fields, so several
        values fall back to a `MissingMappedField` when absent. The `time`
        field is divided by 1000 before conversion (i.e. handled as
        milliseconds) and formatted with `datetime.fromtimestamp()`, which
        uses the local timezone of the machine running 4CAT.

        :param dict post:  Object as captured from the web API
        :return MappedItem:  Mapped item
        :raises KeyError:  If a required field (e.g. `user`, `interact_info`)
        is absent
        """
        # objects of type "video" are used as-is; for all others the fields
        # are read from the nested "note_card" object
        item = post["note_card"] if post.get("type") != "video" else post
        # the ID is read from "id", falling back to "note_id"
        item_id = post.get("id", post.get("note_id"))

        # Images
        if item.get("image_list"):
            images = []
            for image in item["image_list"]:
                if image.get("url_default"):
                    images.append(image["url_default"])
                elif image.get("info_list"):
                    # prefer the "WB_DFT" rendition; when there is none, use
                    # the first rendition listed
                    for img_info in image["info_list"]:
                        if img_info.get("image_scene") == "WB_DFT":
                            images.append(img_info["url"])
                            break
                    else:
                        images.append(image["info_list"][0]["url"])
        elif item.get("cover"):
            images = [item["cover"]["url_default"]]
        else:
            # no image found
            images = MissingMappedField("")

        # permalinks need this token to work, else you get a 404 not found
        xsec_bit = f"?xsec_token={post['xsec_token']}" if post.get("xsec_token") else ""

        # tolerate a null "video" value and a missing or empty h264 stream
        # list instead of raising
        media = (item.get("video") or {}).get("media") or {}
        h264_streams = (media.get("stream") or {}).get("h264") or []
        video_url = h264_streams[0]["master_url"] if h264_streams else MissingMappedField("")

        timestamp = item.get("time", None)
        return MappedItem({
            "id": item_id,
            "thread_id": item_id,
            # use the resolved ID so objects that only have "note_id" do not
            # raise a KeyError here
            "url": f"https://www.xiaohongshu.com/explore/{item_id}{xsec_bit}",
            "title": item.get("display_title", ""),
            "body": item["desc"] if "desc" in item else MissingMappedField(""),
            "timestamp": datetime.fromtimestamp(timestamp / 1000).strftime("%Y-%m-%d %H:%M:%S") if timestamp else MissingMappedField(""),
            "author": item["user"]["nickname"],
            "author_avatar_url": item["user"]["avatar"],
            "image_urls": ",".join(images) if isinstance(images, list) else images,
            "video_url": video_url,
            # only available when loading an individual post page, so skip
            # "tags": ",".join(t["name"] for t in item["tag_list"]),
            "likes": item["interact_info"]["liked_count"],
            # "collects": item["interact_info"]["collected_count"],
            # "comments": item["interact_info"]["comment_count"],
            # "shares": item["interact_info"]["share_count"],
            "unix_timestamp": int(timestamp / 1000) if timestamp else MissingMappedField(""),
        })

    @staticmethod
    def map_item_from_json_embedded(item):
        """
        Map a JSON object embedded in an XHS HTML page to a 4CAT item

        These objects use camelCase keys rather than underscores. More is
        assumed about their contents than for API objects: the first entry
        of `imageList`, the `xsecToken`, the `user` and `interactInfo`
        objects are all read without a fallback.

        :param dict item:  Object with a nested `note`
        :return MappedItem:  Mapped item
        """
        note = item["note"]
        first_image = note["imageList"][0]["urlDefault"]
        # permalinks need this token to work, else you get a 404 not found
        token_suffix = f"?xsec_token={note['xsecToken']}"
        # NOTE(review): the timestamp is read from the outer object, not from
        # the note - confirm that is where "time" lives
        timestamp = item.get("time", None)

        post_id = item["id"]
        body = note["desc"] if "desc" in note else MissingMappedField("")

        if timestamp:
            # "time" is divided by 1000 before conversion
            seconds = timestamp / 1000
            created_at = datetime.fromtimestamp(seconds).strftime("%Y-%m-%d %H:%M:%S")
            unix_timestamp = int(seconds)
        else:
            created_at = MissingMappedField("")
            unix_timestamp = MissingMappedField("")

        author = note["user"]

        return MappedItem({
            "id": post_id,
            "thread_id": post_id,
            "url": f"https://www.xiaohongshu.com/explore/{post_id}{token_suffix}",
            "title": note.get("title", ""),
            "body": body,
            "timestamp": created_at,
            "author": author["nickname"],
            "author_avatar_url": author["avatar"],
            # NOTE(review): the API mapper emits "image_urls" (plural) while
            # this key is singular - confirm the difference is intended
            "image_url": first_image,
            "video_url": MissingMappedField(""),
            # only available when loading an individual post page, so skip
            # "tags": ",".join(t["name"] for t in item["tag_list"]),
            "likes": item["interactInfo"]["likedCount"],
            # "collects": item["interact_info"]["collected_count"],
            # "comments": item["interact_info"]["comment_count"],
            # "shares": item["interact_info"]["share_count"],
            "unix_timestamp": unix_timestamp,
        })

    @staticmethod
    def map_item_from_html(item):
        """
        Map a pre-mapped item

        These have been mapped by Zeeschuimer from the page HTML and contain
        less data than JSON objects (but enough to be useful in some cases).
        Body, timestamps and video URL are always marked as missing.

        :param dict item:  Pre-mapped object
        :return MappedItem:  Mapped item
        """
        return MappedItem({
            "id": item["id"],
            "thread_id": item["id"],
            # "url" is appended directly to the site root
            "url": f"https://www.xiaohongshu.com{item['url']}",
            "title": item["title"],
            "body": MissingMappedField(""),
            "timestamp": MissingMappedField(""),
            "author": item["author_name"],
            "author_avatar_url": item["author_avatar_url"],
            "image_url": item["thumbnail_url"],
            "video_url": MissingMappedField(""),
            # "tags": MissingMappedField(""),
            "likes": item["likes"],
            # "collects": MissingMappedField(""),
            # "comments": MissingMappedField(""),
            # "shares": MissingMappedField(""),
            "unix_timestamp": MissingMappedField(""),
        })
Import scraped RedNote/Xiaohongshu/XHS data
def get_items(self, query):
    """
    Refuse to run a search

    RedNote data is imported rather than collected by 4CAT, so this
    method unconditionally raises.

    :param query:  Search parameters; not used
    :raises NotImplementedError:  Always
    """
    message = "RedNote/Xiaohongshu datasets can only be created by importing data from elsewhere"
    raise NotImplementedError(message)
Run custom search
Not available for RedNote
@staticmethod
def map_item(post):
    """
    Map a captured Xiaohongshu (XHS) object to a 4CAT item

    Picks the mapper matching the way the object was captured: objects
    marked with `_zs-origin` = `html` go to the HTML mapper, objects with
    a `note` key to the embedded-JSON mapper, and everything else to the
    web API mapper. A lot of data is missing from HTML objects.

    :param dict post:  Captured object
    :return MappedItem:  Whatever the selected mapper returns
    """
    if post.get("_zs-origin") == "html":
        mapper = SearchRedNote.map_item_from_html
    elif "note" in post:
        mapper = SearchRedNote.map_item_from_json_embedded
    else:
        mapper = SearchRedNote.map_item_from_json_api_explore

    return mapper(post)
Map XHS object to 4CAT item
Depending on whether the object was captured from JSON or HTML, treat it differently. A lot of data is missing from HTML objects.
Parameters
- post: The captured object (a dict) to map
Returns
@staticmethod
def map_item_from_json_api_explore(post):
    """
    Map an object captured from the Xiaohongshu (XHS) web API to a 4CAT item

    These objects do not always contain the same fields, so several
    values fall back to a `MissingMappedField` when absent. The `time`
    field is divided by 1000 before conversion (i.e. handled as
    milliseconds) and formatted with `datetime.fromtimestamp()`, which
    uses the local timezone of the machine running 4CAT.

    :param dict post:  Object as captured from the web API
    :return MappedItem:  Mapped item
    :raises KeyError:  If a required field (e.g. `user`, `interact_info`)
    is absent
    """
    # objects of type "video" are used as-is; for all others the fields
    # are read from the nested "note_card" object
    item = post["note_card"] if post.get("type") != "video" else post
    # the ID is read from "id", falling back to "note_id"
    item_id = post.get("id", post.get("note_id"))

    # Images
    if item.get("image_list"):
        images = []
        for image in item["image_list"]:
            if image.get("url_default"):
                images.append(image["url_default"])
            elif image.get("info_list"):
                # prefer the "WB_DFT" rendition; when there is none, use
                # the first rendition listed
                for img_info in image["info_list"]:
                    if img_info.get("image_scene") == "WB_DFT":
                        images.append(img_info["url"])
                        break
                else:
                    images.append(image["info_list"][0]["url"])
    elif item.get("cover"):
        images = [item["cover"]["url_default"]]
    else:
        # no image found
        images = MissingMappedField("")

    # permalinks need this token to work, else you get a 404 not found
    xsec_bit = f"?xsec_token={post['xsec_token']}" if post.get("xsec_token") else ""

    # tolerate a null "video" value and a missing or empty h264 stream
    # list instead of raising
    media = (item.get("video") or {}).get("media") or {}
    h264_streams = (media.get("stream") or {}).get("h264") or []
    video_url = h264_streams[0]["master_url"] if h264_streams else MissingMappedField("")

    timestamp = item.get("time", None)
    return MappedItem({
        "id": item_id,
        "thread_id": item_id,
        # use the resolved ID so objects that only have "note_id" do not
        # raise a KeyError here
        "url": f"https://www.xiaohongshu.com/explore/{item_id}{xsec_bit}",
        "title": item.get("display_title", ""),
        "body": item["desc"] if "desc" in item else MissingMappedField(""),
        "timestamp": datetime.fromtimestamp(timestamp / 1000).strftime("%Y-%m-%d %H:%M:%S") if timestamp else MissingMappedField(""),
        "author": item["user"]["nickname"],
        "author_avatar_url": item["user"]["avatar"],
        "image_urls": ",".join(images) if isinstance(images, list) else images,
        "video_url": video_url,
        # only available when loading an individual post page, so skip
        # "tags": ",".join(t["name"] for t in item["tag_list"]),
        "likes": item["interact_info"]["liked_count"],
        # "collects": item["interact_info"]["collected_count"],
        # "comments": item["interact_info"]["comment_count"],
        # "shares": item["interact_info"]["share_count"],
        "unix_timestamp": int(timestamp / 1000) if timestamp else MissingMappedField(""),
    })
Map API-sourced XHS object to 4CAT item
Most straightforward - JSON objects from the XHS web API, which, however, do not always contain the same fields.
Parameters
- post (dict): The object captured from the web API
Returns
@staticmethod
def map_item_from_json_embedded(item):
    """
    Map a JSON object embedded in an XHS HTML page to a 4CAT item

    These objects use camelCase keys rather than underscores. More is
    assumed about their contents than for API objects: the first entry
    of `imageList`, the `xsecToken`, the `user` and `interactInfo`
    objects are all read without a fallback.

    :param dict item:  Object with a nested `note`
    :return MappedItem:  Mapped item
    """
    note = item["note"]
    first_image = note["imageList"][0]["urlDefault"]
    # permalinks need this token to work, else you get a 404 not found
    token_suffix = f"?xsec_token={note['xsecToken']}"
    # NOTE(review): the timestamp is read from the outer object, not from
    # the note - confirm that is where "time" lives
    timestamp = item.get("time", None)

    post_id = item["id"]
    body = note["desc"] if "desc" in note else MissingMappedField("")

    if timestamp:
        # "time" is divided by 1000 before conversion
        seconds = timestamp / 1000
        created_at = datetime.fromtimestamp(seconds).strftime("%Y-%m-%d %H:%M:%S")
        unix_timestamp = int(seconds)
    else:
        created_at = MissingMappedField("")
        unix_timestamp = MissingMappedField("")

    author = note["user"]

    return MappedItem({
        "id": post_id,
        "thread_id": post_id,
        "url": f"https://www.xiaohongshu.com/explore/{post_id}{token_suffix}",
        "title": note.get("title", ""),
        "body": body,
        "timestamp": created_at,
        "author": author["nickname"],
        "author_avatar_url": author["avatar"],
        # NOTE(review): the API mapper emits "image_urls" (plural) while
        # this key is singular - confirm the difference is intended
        "image_url": first_image,
        "video_url": MissingMappedField(""),
        # only available when loading an individual post page, so skip
        # "tags": ",".join(t["name"] for t in item["tag_list"]),
        "likes": item["interactInfo"]["likedCount"],
        # "collects": item["interact_info"]["collected_count"],
        # "comments": item["interact_info"]["comment_count"],
        # "shares": item["interact_info"]["share_count"],
        "unix_timestamp": unix_timestamp,
    })
Map JSON object from an XHS HTML page
JSON objects from the HTML are formatted slightly differently, mostly in that they use camelCase instead of underscores, but we can also make a few more assumptions about the data
Parameters
- item (dict): The JSON object embedded in the page, with a nested `note`
Returns
@staticmethod
def map_item_from_html(item):
    """
    Map a pre-mapped item

    These have been mapped by Zeeschuimer from the page HTML and contain
    less data than JSON objects (but enough to be useful in some cases).
    Body, timestamps and video URL are always marked as missing.

    Declared as a static method, like the other mappers: it takes no
    `self` and `map_item()` calls it through the class.

    :param dict item:  Pre-mapped object
    :return MappedItem:  Mapped item
    """
    return MappedItem({
        "id": item["id"],
        "thread_id": item["id"],
        # "url" is appended directly to the site root
        "url": f"https://www.xiaohongshu.com{item['url']}",
        "title": item["title"],
        "body": MissingMappedField(""),
        "timestamp": MissingMappedField(""),
        "author": item["author_name"],
        "author_avatar_url": item["author_avatar_url"],
        "image_url": item["thumbnail_url"],
        "video_url": MissingMappedField(""),
        # "tags": MissingMappedField(""),
        "likes": item["likes"],
        # "collects": MissingMappedField(""),
        # "comments": MissingMappedField(""),
        # "shares": MissingMappedField(""),
        "unix_timestamp": MissingMappedField(""),
    })
Map pre-mapped item
These have been mapped by Zeeschuimer from the page HTML and contain less data than JSON objects (but enough to be useful in some cases).
Parameters
- item (dict): The item as pre-mapped by Zeeschuimer
Returns
Inherited Members
- backend.lib.worker.BasicWorker
- BasicWorker
- INTERRUPT_NONE
- INTERRUPT_RETRY
- INTERRUPT_CANCEL
- queue
- log
- manager
- interrupted
- modules
- init_time
- name
- run
- clean_up
- request_interrupt
- is_4cat_class
- backend.lib.search.Search
- max_workers
- prefix
- return_cols
- import_error_count
- import_warning_count
- process
- search
- import_from_file
- items_to_csv
- items_to_ndjson
- items_to_archive
- backend.lib.processor.BasicProcessor
- db
- job
- dataset
- owner
- source_dataset
- source_file
- config
- is_running_in_preset
- filepath
- work
- after_process
- remove_files
- abort
- iterate_proxied_requests
- push_proxied_request
- flush_proxied_requests
- iterate_archive_contents
- unpack_archive_contents
- extract_archived_file_by_name
- write_csv_items_and_finish
- write_archive_and_finish
- create_standalone
- save_annotations
- map_item_method_available
- get_mapped_item
- is_filter
- get_options
- get_status
- is_top_dataset
- is_from_collector
- get_extension
- is_rankable
- exclude_followup_processors
- is_4cat_processor