datasources.xiaohongshu.search_rednote
Import scraped RedNote data
It's prohibitively difficult to scrape data from RedNote within 4CAT itself due to its aggressive rate limiting. Instead, import data collected elsewhere.
"""
Import scraped RedNote data

It's prohibitively difficult to scrape data from RedNote within 4CAT itself due
to its aggressive rate limiting. Instead, import data collected elsewhere.
"""
import re

from datetime import datetime

from backend.lib.search import Search
from common.lib.item_mapping import MappedItem, MissingMappedField
from common.lib.helpers import normalize_url_encoding


class SearchRedNote(Search):
    """
    Import scraped RedNote/Xiaohongshu/XHS data
    """
    type = "xiaohongshu-search"  # job ID
    category = "Search"  # category
    title = "Import scraped RedNote data"  # title displayed in UI
    description = "Import RedNote data collected with an external tool such as Zeeschuimer."  # description displayed in UI
    extension = "ndjson"  # extension of result file, used internally and in UI
    is_from_zeeschuimer = True

    # not available as a processor for existing datasets
    accepts = [None]
    references = [
        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)"
    ]

    def get_items(self, query):
        """
        Run custom search

        Not available for RedNote

        :param query:  Search parameters (unused)
        :raises NotImplementedError:  Always
        """
        raise NotImplementedError("RedNote/Xiaohongshu datasets can only be created by importing data from elsewhere")

    @staticmethod
    def map_item(post):
        """
        Map XHS object to 4CAT item

        Depending on whether the object was captured from JSON or HTML, treat it
        differently. A lot of data is missing from HTML objects.

        :param dict post:  Captured object. Objects with `_zs-origin` set to
          `html` go to the HTML mapper, objects with a `note` key to the
          embedded-JSON mapper, everything else to the API mapper.
        :return MappedItem:  Result of the selected mapper
        """
        if post.get("_zs-origin") == "html":
            return SearchRedNote.map_item_from_html(post)

        if "note" in post:
            return SearchRedNote.map_item_from_json_embedded(post)

        return SearchRedNote.map_item_from_json_api_explore(post)

    @staticmethod
    def map_item_from_json_api_explore(post):
        """
        Map API-sourced XHS object to 4CAT item

        Most straightforward - JSON objects from the XHS web API, which,
        however, do not always contain the same fields.

        :param dict post:  Object as captured from the API. Note data is read
          from `post["note_card"]`, unless `post["type"]` is `"video"`, in
          which case it is read from `post` itself.
        :return MappedItem:  Mapped item; optional fields that are absent are
          mapped to a `MissingMappedField`
        :raises KeyError:  If the object has neither an `id` nor a `note_id`,
          or lacks the author or interaction data
        """
        # an object without any identifier cannot be mapped; fail the same
        # way the lookup of `post["id"]` used to
        if "id" not in post and "note_id" not in post:
            raise KeyError("id")

        item = post if post.get("type") == "video" else post["note_card"]
        item_id = post.get("id", post.get("note_id"))

        # Images
        images = []
        if item.get("image_list"):
            for image in item["image_list"]:
                if image.get("url_default"):
                    images.append(image["url_default"])
                elif image.get("info_list"):
                    # prefer the variant whose image_scene is "WB_DFT", else
                    # fall back to the first variant listed
                    preferred = next(
                        (info for info in image["info_list"] if info.get("image_scene") == "WB_DFT"),
                        image["info_list"][0]
                    )
                    images.append(preferred["url"])
        elif item.get("cover"):
            images.append(item["cover"]["url_default"])
        else:
            # no image found
            images = MissingMappedField("")

        # permalinks need this token to work, else you get a 404 not found
        xsec_bit = f"?xsec_token={post['xsec_token']}" if post.get("xsec_token") else ""

        # only the first h264 stream is used; objects without one get a
        # missing field instead of failing the whole item
        media = (item.get("video") or {}).get("media") or {}
        h264_streams = (media.get("stream") or {}).get("h264") or []
        if h264_streams:
            video_url = h264_streams[0]["master_url"]
        else:
            video_url = MissingMappedField("")

        # `time` is divided by 1000 before use, i.e. treated as milliseconds
        timestamp = item.get("time")
        if timestamp:
            seconds = timestamp / 1000
            formatted_time = datetime.fromtimestamp(seconds).strftime("%Y-%m-%d %H:%M:%S")
            unix_timestamp = int(seconds)
        else:
            formatted_time = MissingMappedField("")
            unix_timestamp = MissingMappedField("")

        if "desc" in item:
            body = item["desc"]
            hashtags = ",".join(re.findall(r"#([^\s!@#$%^&*()_+{}:\"|<>?\[\];'\,./`~]+)", body))
        else:
            body = MissingMappedField("")
            hashtags = MissingMappedField("")

        return MappedItem({
            "collected_from_url": normalize_url_encoding(post.get("__import_meta", {}).get("source_platform_url", "")),  # Zeeschuimer metadata
            "id": item_id,
            "thread_id": item_id,
            # use the resolved ID, so objects that only have `note_id` work too
            "url": f"https://www.xiaohongshu.com/explore/{item_id}{xsec_bit}",
            "title": item.get("display_title", ""),
            "body": body,
            "hashtags": hashtags,
            "timestamp": formatted_time,
            "author": item["user"]["nickname"],
            "author_avatar_url": item["user"]["avatar"],
            # NOTE(review): the other mappers emit "image_url" (singular) - confirm which key is intended
            "image_urls": ",".join(images) if isinstance(images, list) else images,
            "video_url": video_url,
            # only available when loading an individual post page, so skip
            # "tags": ",".join(t["name"] for t in item["tag_list"]),
            "likes": item["interact_info"]["liked_count"],
            # "collects": item["interact_info"]["collected_count"],
            # "comments": item["interact_info"]["comment_count"],
            # "shares": item["interact_info"]["share_count"],
            "unix_timestamp": unix_timestamp,
        })

    @staticmethod
    def map_item_from_json_embedded(item):
        """
        Map JSON object from an XHS HTML page

        JSON objects from the HTML are formatted slightly differently, mostly
        in that they use camelCase instead of underscores, but we can also
        make a few more assumptions about the data.

        :param dict item:  Captured object; the note data is read from
          `item["note"]`
        :return MappedItem:  Mapped item; optional fields that are absent are
          mapped to a `MissingMappedField`
        :raises KeyError:  If `id`, `note` or the author data is missing
        """
        note = item["note"]

        # only the first image is mapped; notes without one get a missing
        # field instead of failing the whole item
        image_list = note.get("imageList") or []
        if image_list and "urlDefault" in image_list[0]:
            image = image_list[0]["urlDefault"]
        else:
            image = MissingMappedField("")

        # permalinks need this token to work, else you get a 404 not found;
        # tolerate its absence like the API mapper does
        xsec_bit = f"?xsec_token={note['xsecToken']}" if note.get("xsecToken") else ""

        # `time` is divided by 1000 before use, i.e. treated as milliseconds
        timestamp = note.get("time")
        if timestamp:
            seconds = timestamp / 1000
            formatted_time = datetime.fromtimestamp(seconds).strftime("%Y-%m-%d %H:%M:%S")
            unix_timestamp = int(seconds)
        else:
            formatted_time = MissingMappedField("")
            unix_timestamp = MissingMappedField("")

        # the like count has been seen under several key spellings
        if "interactInfo" in note:
            likes = note["interactInfo"]["likedCount"]
        elif "interact_info" in note:
            likes = note["interact_info"]["liked_count"]
        elif "likes" in note:
            likes = note["likes"]
        else:
            likes = MissingMappedField("")

        if "desc" in note:
            body = note["desc"]
            hashtags = ",".join(re.findall(r"#([^\s!@#$%^&*()_+{}:\"|<>?\[\];'\,./`~]+)", body))
        else:
            body = MissingMappedField("")
            hashtags = MissingMappedField("")

        return MappedItem({
            "collected_from_url": normalize_url_encoding(item.get("__import_meta", {}).get("source_platform_url", "")),  # Zeeschuimer metadata
            "id": item["id"],
            "thread_id": item["id"],
            "url": f"https://www.xiaohongshu.com/explore/{item['id']}{xsec_bit}",
            "title": note.get("title", ""),
            "body": body,
            "hashtags": hashtags,
            "timestamp": formatted_time,
            "author": note["user"]["nickname"],
            "author_avatar_url": note["user"]["avatar"],
            # NOTE(review): the API mapper emits "image_urls" (plural) - confirm which key is intended
            "image_url": image,
            "video_url": MissingMappedField(""),
            # only available when loading an individual post page, so skip
            # "tags": ",".join(t["name"] for t in item["tag_list"]),
            "likes": likes,
            # "collects": item["interact_info"]["collected_count"],
            # "comments": item["interact_info"]["comment_count"],
            # "shares": item["interact_info"]["share_count"],
            "unix_timestamp": unix_timestamp,
        })

    @staticmethod
    def map_item_from_html(item):
        """
        Map pre-mapped item

        These have been mapped by Zeeschuimer from the page HTML and contain
        less data than JSON objects (but enough to be useful in some cases).

        :param dict item:  Pre-mapped object; `id`, `url`, `title`,
          `author_name`, `author_avatar_url`, `thumbnail_url` and `likes` are
          required
        :return MappedItem:  Mapped item; body, hashtags, timestamps and video
          URL are always a `MissingMappedField`
        :raises KeyError:  If one of the required keys is missing
        """
        return MappedItem({
            "collected_from_url": normalize_url_encoding(item.get("__import_meta", {}).get("source_platform_url", "")),  # Zeeschuimer metadata
            "id": item["id"],
            "thread_id": item["id"],
            # `url` is appended directly to the host name
            "url": f"https://www.xiaohongshu.com{item['url']}",
            "title": item["title"],
            "body": MissingMappedField(""),
            "hashtags": MissingMappedField(""),
            "timestamp": MissingMappedField(""),
            "author": item["author_name"],
            "author_avatar_url": item["author_avatar_url"],
            "image_url": item["thumbnail_url"],
            "video_url": MissingMappedField(""),
            # "tags": MissingMappedField(""),
            "likes": item["likes"],
            # "collects": MissingMappedField(""),
            # "comments": MissingMappedField(""),
            # "shares": MissingMappedField(""),
            "unix_timestamp": MissingMappedField(""),
        })
class SearchRedNote(Search):
    """
    Import scraped RedNote/Xiaohongshu/XHS data
    """
    type = "xiaohongshu-search"  # job ID
    category = "Search"  # category
    title = "Import scraped RedNote data"  # title displayed in UI
    description = "Import RedNote data collected with an external tool such as Zeeschuimer."  # description displayed in UI
    extension = "ndjson"  # extension of result file, used internally and in UI
    is_from_zeeschuimer = True

    # not available as a processor for existing datasets
    accepts = [None]
    references = [
        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)"
    ]

    def get_items(self, query):
        """
        Run custom search

        Not available for RedNote

        :param query:  Search parameters (unused)
        :raises NotImplementedError:  Always
        """
        raise NotImplementedError("RedNote/Xiaohongshu datasets can only be created by importing data from elsewhere")

    @staticmethod
    def map_item(post):
        """
        Map XHS object to 4CAT item

        Depending on whether the object was captured from JSON or HTML, treat it
        differently. A lot of data is missing from HTML objects.

        :param dict post:  Captured object. Objects with `_zs-origin` set to
          `html` go to the HTML mapper, objects with a `note` key to the
          embedded-JSON mapper, everything else to the API mapper.
        :return MappedItem:  Result of the selected mapper
        """
        if post.get("_zs-origin") == "html":
            return SearchRedNote.map_item_from_html(post)

        if "note" in post:
            return SearchRedNote.map_item_from_json_embedded(post)

        return SearchRedNote.map_item_from_json_api_explore(post)

    @staticmethod
    def map_item_from_json_api_explore(post):
        """
        Map API-sourced XHS object to 4CAT item

        Most straightforward - JSON objects from the XHS web API, which,
        however, do not always contain the same fields.

        :param dict post:  Object as captured from the API. Note data is read
          from `post["note_card"]`, unless `post["type"]` is `"video"`, in
          which case it is read from `post` itself.
        :return MappedItem:  Mapped item; optional fields that are absent are
          mapped to a `MissingMappedField`
        :raises KeyError:  If the object has neither an `id` nor a `note_id`,
          or lacks the author or interaction data
        """
        # an object without any identifier cannot be mapped; fail the same
        # way the lookup of `post["id"]` used to
        if "id" not in post and "note_id" not in post:
            raise KeyError("id")

        item = post if post.get("type") == "video" else post["note_card"]
        item_id = post.get("id", post.get("note_id"))

        # Images
        images = []
        if item.get("image_list"):
            for image in item["image_list"]:
                if image.get("url_default"):
                    images.append(image["url_default"])
                elif image.get("info_list"):
                    # prefer the variant whose image_scene is "WB_DFT", else
                    # fall back to the first variant listed
                    preferred = next(
                        (info for info in image["info_list"] if info.get("image_scene") == "WB_DFT"),
                        image["info_list"][0]
                    )
                    images.append(preferred["url"])
        elif item.get("cover"):
            images.append(item["cover"]["url_default"])
        else:
            # no image found
            images = MissingMappedField("")

        # permalinks need this token to work, else you get a 404 not found
        xsec_bit = f"?xsec_token={post['xsec_token']}" if post.get("xsec_token") else ""

        # only the first h264 stream is used; objects without one get a
        # missing field instead of failing the whole item
        media = (item.get("video") or {}).get("media") or {}
        h264_streams = (media.get("stream") or {}).get("h264") or []
        if h264_streams:
            video_url = h264_streams[0]["master_url"]
        else:
            video_url = MissingMappedField("")

        # `time` is divided by 1000 before use, i.e. treated as milliseconds
        timestamp = item.get("time")
        if timestamp:
            seconds = timestamp / 1000
            formatted_time = datetime.fromtimestamp(seconds).strftime("%Y-%m-%d %H:%M:%S")
            unix_timestamp = int(seconds)
        else:
            formatted_time = MissingMappedField("")
            unix_timestamp = MissingMappedField("")

        if "desc" in item:
            body = item["desc"]
            hashtags = ",".join(re.findall(r"#([^\s!@#$%^&*()_+{}:\"|<>?\[\];'\,./`~]+)", body))
        else:
            body = MissingMappedField("")
            hashtags = MissingMappedField("")

        return MappedItem({
            "collected_from_url": normalize_url_encoding(post.get("__import_meta", {}).get("source_platform_url", "")),  # Zeeschuimer metadata
            "id": item_id,
            "thread_id": item_id,
            # use the resolved ID, so objects that only have `note_id` work too
            "url": f"https://www.xiaohongshu.com/explore/{item_id}{xsec_bit}",
            "title": item.get("display_title", ""),
            "body": body,
            "hashtags": hashtags,
            "timestamp": formatted_time,
            "author": item["user"]["nickname"],
            "author_avatar_url": item["user"]["avatar"],
            # NOTE(review): the other mappers emit "image_url" (singular) - confirm which key is intended
            "image_urls": ",".join(images) if isinstance(images, list) else images,
            "video_url": video_url,
            # only available when loading an individual post page, so skip
            # "tags": ",".join(t["name"] for t in item["tag_list"]),
            "likes": item["interact_info"]["liked_count"],
            # "collects": item["interact_info"]["collected_count"],
            # "comments": item["interact_info"]["comment_count"],
            # "shares": item["interact_info"]["share_count"],
            "unix_timestamp": unix_timestamp,
        })

    @staticmethod
    def map_item_from_json_embedded(item):
        """
        Map JSON object from an XHS HTML page

        JSON objects from the HTML are formatted slightly differently, mostly
        in that they use camelCase instead of underscores, but we can also
        make a few more assumptions about the data.

        :param dict item:  Captured object; the note data is read from
          `item["note"]`
        :return MappedItem:  Mapped item; optional fields that are absent are
          mapped to a `MissingMappedField`
        :raises KeyError:  If `id`, `note` or the author data is missing
        """
        note = item["note"]

        # only the first image is mapped; notes without one get a missing
        # field instead of failing the whole item
        image_list = note.get("imageList") or []
        if image_list and "urlDefault" in image_list[0]:
            image = image_list[0]["urlDefault"]
        else:
            image = MissingMappedField("")

        # permalinks need this token to work, else you get a 404 not found;
        # tolerate its absence like the API mapper does
        xsec_bit = f"?xsec_token={note['xsecToken']}" if note.get("xsecToken") else ""

        # `time` is divided by 1000 before use, i.e. treated as milliseconds
        timestamp = note.get("time")
        if timestamp:
            seconds = timestamp / 1000
            formatted_time = datetime.fromtimestamp(seconds).strftime("%Y-%m-%d %H:%M:%S")
            unix_timestamp = int(seconds)
        else:
            formatted_time = MissingMappedField("")
            unix_timestamp = MissingMappedField("")

        # the like count has been seen under several key spellings
        if "interactInfo" in note:
            likes = note["interactInfo"]["likedCount"]
        elif "interact_info" in note:
            likes = note["interact_info"]["liked_count"]
        elif "likes" in note:
            likes = note["likes"]
        else:
            likes = MissingMappedField("")

        if "desc" in note:
            body = note["desc"]
            hashtags = ",".join(re.findall(r"#([^\s!@#$%^&*()_+{}:\"|<>?\[\];'\,./`~]+)", body))
        else:
            body = MissingMappedField("")
            hashtags = MissingMappedField("")

        return MappedItem({
            "collected_from_url": normalize_url_encoding(item.get("__import_meta", {}).get("source_platform_url", "")),  # Zeeschuimer metadata
            "id": item["id"],
            "thread_id": item["id"],
            "url": f"https://www.xiaohongshu.com/explore/{item['id']}{xsec_bit}",
            "title": note.get("title", ""),
            "body": body,
            "hashtags": hashtags,
            "timestamp": formatted_time,
            "author": note["user"]["nickname"],
            "author_avatar_url": note["user"]["avatar"],
            # NOTE(review): the API mapper emits "image_urls" (plural) - confirm which key is intended
            "image_url": image,
            "video_url": MissingMappedField(""),
            # only available when loading an individual post page, so skip
            # "tags": ",".join(t["name"] for t in item["tag_list"]),
            "likes": likes,
            # "collects": item["interact_info"]["collected_count"],
            # "comments": item["interact_info"]["comment_count"],
            # "shares": item["interact_info"]["share_count"],
            "unix_timestamp": unix_timestamp,
        })

    @staticmethod
    def map_item_from_html(item):
        """
        Map pre-mapped item

        These have been mapped by Zeeschuimer from the page HTML and contain
        less data than JSON objects (but enough to be useful in some cases).

        :param dict item:  Pre-mapped object; `id`, `url`, `title`,
          `author_name`, `author_avatar_url`, `thumbnail_url` and `likes` are
          required
        :return MappedItem:  Mapped item; body, hashtags, timestamps and video
          URL are always a `MissingMappedField`
        :raises KeyError:  If one of the required keys is missing
        """
        return MappedItem({
            "collected_from_url": normalize_url_encoding(item.get("__import_meta", {}).get("source_platform_url", "")),  # Zeeschuimer metadata
            "id": item["id"],
            "thread_id": item["id"],
            # `url` is appended directly to the host name
            "url": f"https://www.xiaohongshu.com{item['url']}",
            "title": item["title"],
            "body": MissingMappedField(""),
            "hashtags": MissingMappedField(""),
            "timestamp": MissingMappedField(""),
            "author": item["author_name"],
            "author_avatar_url": item["author_avatar_url"],
            "image_url": item["thumbnail_url"],
            "video_url": MissingMappedField(""),
            # "tags": MissingMappedField(""),
            "likes": item["likes"],
            # "collects": MissingMappedField(""),
            # "comments": MissingMappedField(""),
            # "shares": MissingMappedField(""),
            "unix_timestamp": MissingMappedField(""),
        })
Import scraped RedNote/Xiaohongshu/XHS data
def get_items(self, query):
    """
    Refuse to run a search

    RedNote datasets cannot be collected by 4CAT itself, so this method
    never returns any items.

    :param query:  Search parameters (unused)
    :raises NotImplementedError:  Always
    """
    reason = "RedNote/Xiaohongshu datasets can only be created by importing data from elsewhere"
    raise NotImplementedError(reason)
Run custom search
Not available for RedNote
@staticmethod
def map_item(post):
    """
    Map XHS object to 4CAT item

    Picks the mapper that matches how the object was captured. A lot of
    data is missing from objects captured from HTML.

    :param dict post:  Captured object. Objects with `_zs-origin` set to
      `html` go to the HTML mapper, objects with a `note` key to the
      embedded-JSON mapper, everything else to the API mapper.
    :return MappedItem:  Result of the selected mapper
    """
    if post.get("_zs-origin") == "html":
        return SearchRedNote.map_item_from_html(post)

    if "note" in post:
        return SearchRedNote.map_item_from_json_embedded(post)

    return SearchRedNote.map_item_from_json_api_explore(post)
Map XHS object to 4CAT item
Depending on whether the object was captured from JSON or HTML, treat it differently. A lot of data is missing from HTML objects.
Parameters
- post:
Returns
@staticmethod
def map_item_from_json_api_explore(post):
    """
    Map API-sourced XHS object to 4CAT item

    Most straightforward - JSON objects from the XHS web API, which,
    however, do not always contain the same fields.

    :param dict post:  Object as captured from the API. Note data is read
      from `post["note_card"]`, unless `post["type"]` is `"video"`, in
      which case it is read from `post` itself.
    :return MappedItem:  Mapped item; optional fields that are absent are
      mapped to a `MissingMappedField`
    :raises KeyError:  If the object has neither an `id` nor a `note_id`,
      or lacks the author or interaction data
    """
    # an object without any identifier cannot be mapped; fail the same way
    # the lookup of `post["id"]` used to
    if "id" not in post and "note_id" not in post:
        raise KeyError("id")

    item = post if post.get("type") == "video" else post["note_card"]
    item_id = post.get("id", post.get("note_id"))

    # Images
    images = []
    if item.get("image_list"):
        for image in item["image_list"]:
            if image.get("url_default"):
                images.append(image["url_default"])
            elif image.get("info_list"):
                # prefer the variant whose image_scene is "WB_DFT", else
                # fall back to the first variant listed
                preferred = next(
                    (info for info in image["info_list"] if info.get("image_scene") == "WB_DFT"),
                    image["info_list"][0]
                )
                images.append(preferred["url"])
    elif item.get("cover"):
        images.append(item["cover"]["url_default"])
    else:
        # no image found
        images = MissingMappedField("")

    # permalinks need this token to work, else you get a 404 not found
    xsec_bit = f"?xsec_token={post['xsec_token']}" if post.get("xsec_token") else ""

    # only the first h264 stream is used; objects without one get a missing
    # field instead of failing the whole item
    media = (item.get("video") or {}).get("media") or {}
    h264_streams = (media.get("stream") or {}).get("h264") or []
    if h264_streams:
        video_url = h264_streams[0]["master_url"]
    else:
        video_url = MissingMappedField("")

    # `time` is divided by 1000 before use, i.e. treated as milliseconds
    timestamp = item.get("time")
    if timestamp:
        seconds = timestamp / 1000
        formatted_time = datetime.fromtimestamp(seconds).strftime("%Y-%m-%d %H:%M:%S")
        unix_timestamp = int(seconds)
    else:
        formatted_time = MissingMappedField("")
        unix_timestamp = MissingMappedField("")

    if "desc" in item:
        body = item["desc"]
        hashtags = ",".join(re.findall(r"#([^\s!@#$%^&*()_+{}:\"|<>?\[\];'\,./`~]+)", body))
    else:
        body = MissingMappedField("")
        hashtags = MissingMappedField("")

    return MappedItem({
        "collected_from_url": normalize_url_encoding(post.get("__import_meta", {}).get("source_platform_url", "")),  # Zeeschuimer metadata
        "id": item_id,
        "thread_id": item_id,
        # use the resolved ID, so objects that only have `note_id` work too
        "url": f"https://www.xiaohongshu.com/explore/{item_id}{xsec_bit}",
        "title": item.get("display_title", ""),
        "body": body,
        "hashtags": hashtags,
        "timestamp": formatted_time,
        "author": item["user"]["nickname"],
        "author_avatar_url": item["user"]["avatar"],
        # NOTE(review): the other mappers emit "image_url" (singular) - confirm which key is intended
        "image_urls": ",".join(images) if isinstance(images, list) else images,
        "video_url": video_url,
        # only available when loading an individual post page, so skip
        # "tags": ",".join(t["name"] for t in item["tag_list"]),
        "likes": item["interact_info"]["liked_count"],
        # "collects": item["interact_info"]["collected_count"],
        # "comments": item["interact_info"]["comment_count"],
        # "shares": item["interact_info"]["share_count"],
        "unix_timestamp": unix_timestamp,
    })
Map API-sourced XHS object to 4CAT item
Most straightforward - JSON objects from the XHS web API, which, however, do not always contain the same fields.
Parameters
- dict post:
Returns
@staticmethod
def map_item_from_json_embedded(item):
    """
    Map JSON object from an XHS HTML page

    JSON objects from the HTML are formatted slightly differently, mostly
    in that they use camelCase instead of underscores, but we can also
    make a few more assumptions about the data.

    :param dict item:  Captured object; the note data is read from
      `item["note"]`
    :return MappedItem:  Mapped item; optional fields that are absent are
      mapped to a `MissingMappedField`
    :raises KeyError:  If `id`, `note` or the author data is missing
    """
    note = item["note"]

    # only the first image is mapped; notes without one get a missing field
    # instead of failing the whole item
    image_list = note.get("imageList") or []
    if image_list and "urlDefault" in image_list[0]:
        image = image_list[0]["urlDefault"]
    else:
        image = MissingMappedField("")

    # permalinks need this token to work, else you get a 404 not found;
    # tolerate its absence like the API mapper does
    xsec_bit = f"?xsec_token={note['xsecToken']}" if note.get("xsecToken") else ""

    # `time` is divided by 1000 before use, i.e. treated as milliseconds
    timestamp = note.get("time")
    if timestamp:
        seconds = timestamp / 1000
        formatted_time = datetime.fromtimestamp(seconds).strftime("%Y-%m-%d %H:%M:%S")
        unix_timestamp = int(seconds)
    else:
        formatted_time = MissingMappedField("")
        unix_timestamp = MissingMappedField("")

    # the like count has been seen under several key spellings
    if "interactInfo" in note:
        likes = note["interactInfo"]["likedCount"]
    elif "interact_info" in note:
        likes = note["interact_info"]["liked_count"]
    elif "likes" in note:
        likes = note["likes"]
    else:
        likes = MissingMappedField("")

    if "desc" in note:
        body = note["desc"]
        hashtags = ",".join(re.findall(r"#([^\s!@#$%^&*()_+{}:\"|<>?\[\];'\,./`~]+)", body))
    else:
        body = MissingMappedField("")
        hashtags = MissingMappedField("")

    return MappedItem({
        "collected_from_url": normalize_url_encoding(item.get("__import_meta", {}).get("source_platform_url", "")),  # Zeeschuimer metadata
        "id": item["id"],
        "thread_id": item["id"],
        "url": f"https://www.xiaohongshu.com/explore/{item['id']}{xsec_bit}",
        "title": note.get("title", ""),
        "body": body,
        "hashtags": hashtags,
        "timestamp": formatted_time,
        "author": note["user"]["nickname"],
        "author_avatar_url": note["user"]["avatar"],
        # NOTE(review): the API mapper emits "image_urls" (plural) - confirm which key is intended
        "image_url": image,
        "video_url": MissingMappedField(""),
        # only available when loading an individual post page, so skip
        # "tags": ",".join(t["name"] for t in item["tag_list"]),
        "likes": likes,
        # "collects": item["interact_info"]["collected_count"],
        # "comments": item["interact_info"]["comment_count"],
        # "shares": item["interact_info"]["share_count"],
        "unix_timestamp": unix_timestamp,
    })
Map JSON object from an XHS HTML page
JSON objects from the HTML are formatted slightly differently, mostly in that they use camelCase instead of underscores, but we can also make a few more assumptions about the data.
Parameters
- dict item:
Returns
@staticmethod
def map_item_from_html(item):
    """
    Map pre-mapped item

    These have been mapped by Zeeschuimer from the page HTML and contain
    less data than JSON objects (but enough to be useful in some cases).

    Declared as a static method, like the other mappers, so that it can be
    called on the class as well as on an instance.

    :param dict item:  Pre-mapped object; `id`, `url`, `title`,
      `author_name`, `author_avatar_url`, `thumbnail_url` and `likes` are
      required
    :return MappedItem:  Mapped item; body, hashtags, timestamps and video
      URL are always a `MissingMappedField`
    :raises KeyError:  If one of the required keys is missing
    """
    return MappedItem({
        "collected_from_url": normalize_url_encoding(item.get("__import_meta", {}).get("source_platform_url", "")),  # Zeeschuimer metadata
        "id": item["id"],
        "thread_id": item["id"],
        # `url` is appended directly to the host name
        "url": f"https://www.xiaohongshu.com{item['url']}",
        "title": item["title"],
        "body": MissingMappedField(""),
        "hashtags": MissingMappedField(""),
        "timestamp": MissingMappedField(""),
        "author": item["author_name"],
        "author_avatar_url": item["author_avatar_url"],
        "image_url": item["thumbnail_url"],
        "video_url": MissingMappedField(""),
        # "tags": MissingMappedField(""),
        "likes": item["likes"],
        # "collects": MissingMappedField(""),
        # "comments": MissingMappedField(""),
        # "shares": MissingMappedField(""),
        "unix_timestamp": MissingMappedField(""),
    })
Map pre-mapped item
These have been mapped by Zeeschuimer from the page HTML and contain less data than JSON objects (but enough to be useful in some cases).
Parameters
- dict item:
Returns
Inherited Members
- backend.lib.worker.BasicWorker
- BasicWorker
- INTERRUPT_NONE
- INTERRUPT_RETRY
- INTERRUPT_CANCEL
- queue
- log
- manager
- interrupted
- modules
- init_time
- name
- run
- clean_up
- request_interrupt
- run_interruptable_process
- get_queue_id
- is_4cat_class
- backend.lib.search.Search
- max_workers
- prefix
- return_cols
- import_error_count
- import_warning_count
- process
- search
- import_from_file
- items_to_csv
- items_to_ndjson
- items_to_archive
- backend.lib.processor.BasicProcessor
- db
- job
- dataset
- owner
- source_dataset
- source_file
- config
- is_running_in_preset
- filepath
- for_cleanup
- work
- after_process
- clean_up_on_error
- abort
- iterate_proxied_requests
- push_proxied_request
- flush_proxied_requests
- unpack_archive_contents
- extract_archived_file_by_name
- write_csv_items_and_finish
- write_archive_and_finish
- create_standalone
- save_annotations
- map_item_method_available
- get_mapped_item
- is_filter
- get_options
- get_status
- is_top_dataset
- is_from_collector
- get_extension
- is_rankable
- exclude_followup_processors
- is_4cat_processor