datasources.douyin.search_douyin
Import scraped Douyin data
"""
Import scraped Douyin data
"""
import urllib
import json
import re
from datetime import datetime

from backend.lib.search import Search
from common.lib.item_mapping import MappedItem, MissingMappedField


class SearchDouyin(Search):
    """
    Import scraped Douyin data
    """
    type = "douyin-search"  # job ID
    category = "Search"  # category
    title = "Import scraped Douyin data"  # title displayed in UI
    description = "Import Douyin data collected with an external tool such as Zeeschuimer."  # description displayed in UI
    extension = "ndjson"  # extension of result file, used internally and in UI
    is_from_zeeschuimer = True

    # not available as a processor for existing datasets
    accepts = [None]
    references = [
        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)"
    ]

    def get_items(self, query):
        """
        Run custom search

        Not available for Douyin

        :param query:  Search query; not used
        :raises NotImplementedError:  Always
        """
        raise NotImplementedError("Douyin datasets can only be created by importing data from elsewhere")

    @staticmethod
    def map_item(item):
        """
        Map a raw Douyin item to a flat 4CAT item

        Two item layouts are handled: items flagged with
        `ZS_collected_from_embed` (camelCase keys) and all other items
        (snake_case keys). In both layouts an item is mapped as a "Stream"
        if it carries stream `rawdata`, and as a "Post" otherwise.

        :param dict item:  Raw item, including its `__import_meta` metadata
        :return MappedItem:  Mapped item with flat, serialisable fields
        :raises KeyError:  If a field that is read unconditionally (e.g.
          the item ID, description or author nickname) is missing
        """
        # `import urllib` alone does not guarantee that the `parse`
        # submodule is loaded, so import it explicitly where it is used
        import urllib.parse

        def as_dict(value):
            # absent values may be None or a placeholder string such as
            # "$undefined"; treat anything that is not a dict as empty
            return value if isinstance(value, dict) else {}

        def as_list(value):
            # same as as_dict(), for values that are expected to be lists
            return value if isinstance(value, list) else []

        metadata = as_dict(item.get("__import_meta"))
        subject = "Post"

        if item.get("ZS_collected_from_embed"):
            # HTML embedded posts are formatted differently than JSON posts
            stream_data = as_dict(item.get("cellRoom")).get("rawdata")
            if stream_data:
                # These appear to be streams
                subject = "Stream"
                # These may only have the timestamp of the request; only
                # compute that fallback when there is no creation time
                created = stream_data.get("createtime")
                if created is None:
                    created = item.get("requestTime") / 1000
                post_timestamp = datetime.fromtimestamp(created)
                stream_urls = as_dict(as_dict(stream_data.get("stream_url")).get("flv_pull_url"))
                video_url = stream_urls.get("FULL_HD1")
                video_description = stream_data.get("title")
                duration = "Unknown"
                prevent_download = None
                stats = as_dict(stream_data.get("stats"))

                # Author is stream owner
                author = stream_data.get("owner")
                author_sec_key = "sec_uid"
                avatar_thumb_key = "avatar_thumb"
                url_list_key = "url_list"
                is_fake_key = "is_ad_fake"  # have not seen...
            else:
                post_timestamp = datetime.fromtimestamp(item["createTime"])
                video = as_dict(item.get("video"))
                bit_rates = as_list(video.get("bitRateList"))
                if bit_rates:
                    # use the rendition with the highest bit rate
                    best = max(bit_rates, key=lambda d: d.get("bitRate"))
                    # NOTE(review): this prepends "https" without "://";
                    # confirm what prefix `playApi` values carry
                    video_url = "https" + best["playApi"]
                else:
                    # Image galleries do not have video data
                    video_url = ""
                video_description = item["desc"]
                duration = item.get("duration", video.get("duration", "Unknown"))
                prevent_download = "yes" if item["download"]["prevent"] else "no"
                stats = as_dict(item["stats"])

                # Author is, well, author
                author = item["authorInfo"]
                author_sec_key = "secUid"
                avatar_thumb_key = "avatarThumb"
                url_list_key = "urlList"
                is_fake_key = "isAdFake"

            # Embedded Keys
            aweme_id_key = "awemeId"
            group_id_key = "groupId"
            text_extra_key = "textExtra"
            hashtag_key = "hashtagName"
            mention_key = "secUid"
            author_id_key = "authorUserId"
            mix_info_key = "mixInfo"
            mix_id_key = "mixId"
            mix_name_key = "mixName"

            # Stats
            collect_count = stats.get("collectCount", MissingMappedField("Unknown"))
            comment_count = stats.get("commentCount", MissingMappedField("Unknown"))
            digg_count = stats.get("diggCount", MissingMappedField("Unknown"))
            download_count = stats.get("downloadCount", MissingMappedField("Unknown"))
            forward_count = stats.get("forwardCount", MissingMappedField("Unknown"))
            play_count = stats.get("playCount", MissingMappedField("Unknown"))
            share_count = stats.get("shareCount", MissingMappedField("Unknown"))

            # This is a guess, I have not encountered it
            video_tags = ",".join(
                [tag["tagName"] for tag in as_list(item.get("videoTag")) if "tagName" in tag])

            mix_current_episode = as_dict(item.get(mix_info_key)).get("currentEpisode", "N/A")

        else:
            stream_data = item.get("rawdata", as_dict(item.get("cell_room")).get("rawdata"))
            if stream_data:
                subject = "Stream"
                stream_data = json.loads(stream_data)
                # Some posts appear to have no timestamp! We substitute
                # collection time, but only when it is actually needed
                created = stream_data.get("create_time", item.get("create_time"))
                if created is None:
                    created = metadata.get("timestamp_collected") / 1000
                post_timestamp = datetime.fromtimestamp(created)
                stream_urls = as_dict(as_dict(stream_data.get("stream_url")).get("flv_pull_url"))
                video_url = stream_urls.get("FULL_HD1")
                video_description = stream_data.get("title")
                duration = "Unknown"

                # Author is stream owner
                author = stream_data.get("owner")
                stats = stream_data.get("stats")
            else:
                post_timestamp = datetime.fromtimestamp(item["create_time"])
                video = as_dict(item.get("video"))
                bit_rates = as_list(video.get("bit_rate"))
                if bit_rates:
                    # use the rendition with the highest bit rate
                    best = max(bit_rates, key=lambda d: d.get("bit_rate"))
                    video_url = (best["play_addr"].get("url_list") or [""])[-1]
                else:
                    # Image galleries do not have video data
                    video_url = ""
                video_description = item["desc"]
                duration = item.get("duration", video.get("duration", "Unknown"))

                # Author is, well, author
                author = item["author"]
                stats = item.get("statistics")

            # statistics may be absent; an empty dict keeps the lookups
            # below (including the stream stats) from failing
            stats = as_dict(stats)

            prevent_download = ("yes" if item["prevent_download"] else "no") if "prevent_download" in item else None

            # Keys
            aweme_id_key = "aweme_id"
            group_id_key = "group_id"
            text_extra_key = "text_extra"
            hashtag_key = "hashtag_name"
            mention_key = "sec_uid"
            author_id_key = "author_user_id"
            mix_info_key = "mix_info"
            mix_id_key = "mix_id"
            mix_name_key = "mix_name"

            author_sec_key = "sec_uid"
            avatar_thumb_key = "avatar_thumb"
            url_list_key = "url_list"
            is_fake_key = "is_ad_fake"

            # Stats
            collect_count = stats.get("collect_count") if stats else MissingMappedField("Unknown")
            comment_count = stats.get("comment_count") if stats else MissingMappedField("Unknown")
            digg_count = stats.get("digg_count") if stats else MissingMappedField("Unknown")
            download_count = stats.get("download_count") if stats else MissingMappedField("Unknown")
            forward_count = stats.get("forward_count") if stats else MissingMappedField("Unknown")
            play_count = stats.get("play_count") if stats else MissingMappedField("Unknown")
            share_count = stats.get("share_count") if stats else MissingMappedField("Unknown")

            # NOTE(review): tags are read from the item's "video_tag" list
            # for posts and streams alike; the stream payload's
            # "video_feed_tag" is not used
            video_tags = ",".join(
                [tag["tag_name"] for tag in as_list(item.get("video_tag")) if "tag_name" in tag])

            mix_current_episode = as_dict(as_dict(item.get(mix_info_key)).get("statis")).get("current_episode", "N/A")

        # Stream Stats
        count_total_streams_viewers = stats.get("total_user", "N/A")
        count_current_stream_viewers = SearchDouyin.get_chinese_number(
            stats.get("user_count_str")) if "user_count_str" in stats else "N/A"

        # Some videos are collected from "mixes"/"collections"; only the
        # first video is definitely displayed while others may or may not
        # be viewed
        displayed = not (item.get("ZS_collected_from_mix") and not item.get("ZS_first_mix_vid"))

        # Image galleries have been added to Douyin
        image_urls = []
        for img in as_list(item.get("images")):
            urls = as_dict(img).get("url_list") or as_dict(img).get("urlList")
            if urls:
                image_urls.append(urls[0])

        # Music
        music = as_dict(item.get("music"))
        music_author = music.get("author") if music else ""
        music_title = music.get("title") if music else ""
        music_url = as_dict(music.get("play_url")).get("uri") if music else ""

        # Collection
        mix_info = as_dict(item.get(mix_info_key))
        mix_current_episode = mix_current_episode if mix_current_episode != "$undefined" else "N/A"
        collection_id = mix_info.get(mix_id_key, "N/A")
        collection_id = collection_id if collection_id != "$undefined" else "N/A"
        collection_name = mix_info.get(mix_name_key, "N/A")
        collection_name = collection_name if collection_name != "$undefined" else "N/A"
        part_of_collection = "yes" if mix_id_key in mix_info and collection_id != "N/A" else "no"

        # hashtags and mentions are both stored in the "text extra" list
        text_extra = as_list(item.get(text_extra_key))

        return MappedItem({
            "id": item[aweme_id_key],
            "thread_id": item[group_id_key],
            "subject": subject,
            "body": video_description,
            "timestamp": post_timestamp.strftime("%Y-%m-%d %H:%M:%S"),
            "post_source_domain": urllib.parse.unquote(metadata.get("source_platform_url") or ""),
            # Adding this as different Douyin pages contain different data
            "post_url": f"https://www.douyin.com/video/{item[aweme_id_key]}" if subject == "Post" else f"https://live.douyin.com/{author.get('web_rid')}",
            "region": item.get("region", ""),
            "hashtags": ",".join([tag[hashtag_key] for tag in text_extra if hashtag_key in tag]),
            # Actual username does not appear in object, but the sec_uid
            # can be used to form a link to their profile
            "mentions": ",".join(
                [f"https://www.douyin.com/user/{tag[mention_key]}" for tag in text_extra if mention_key in tag]),
            "video_tags": video_tags,
            "prevent_download": prevent_download,
            "video_url": video_url,
            "video_duration": duration,
            "image_urls": ','.join(image_urls),
            "music_author": music_author,
            "music_title": music_title,
            "music_url": music_url,
            # Video stats
            "collect_count": collect_count,
            "comment_count": comment_count,
            "digg_count": digg_count,
            "download_count": download_count,
            "forward_count": forward_count,
            "play_count": play_count,
            "share_count": share_count,
            "count_total_streams_viewers": count_total_streams_viewers,
            "count_current_stream_viewers": count_current_stream_viewers,
            # Author data
            "author_user_id": item[author_id_key] if author_id_key in item else author.get("uid", author.get("id")),
            "author_nickname": author["nickname"],
            "author_profile_url": f"https://www.douyin.com/user/{author[author_sec_key]}",
            "author_thumbnail_url": (author[avatar_thumb_key].get(url_list_key) or [""])[0],
            "author_region": author.get("region"),
            "author_is_ad_fake": author.get(is_fake_key),
            # Collection/Mix
            "part_of_collection": part_of_collection,
            # other videos may have been viewed, but this is unknown to us
            "4CAT_first_video_displayed": "yes" if displayed else "no",
            "collection_id": collection_id,
            "collection_name": collection_name,
            "place_in_collection": mix_current_episode,
            "unix_timestamp": int(post_timestamp.timestamp()),
        })

    @staticmethod
    def get_chinese_number(num):
        """
        Convert a count that may use Chinese unit characters to a number

        :param num:  Count as a number or string, e.g. `1234`, `"1234"` or
          a string such as `"1.5"` followed by the character for 10,000
        :return:  `num` itself if it is an int or float; 0 if it is neither
          a number nor a string, or if it contains no digits; otherwise the
          parsed value, which is a float if the string has a unit character
          or a decimal point, and an int if not
        """
        if isinstance(num, bool) or not isinstance(num, (int, float, str)):
            return 0
        if not isinstance(num, str):
            return num

        digits = re.sub(r"[^0-9.]", "", num)
        if not digits.strip("."):
            # nothing numeric in the string
            return 0

        # U+4EBF is the character for 100,000,000 and U+4E07 the one for
        # 10,000; escapes are used so the check survives re-encoding
        for unit, multiplier in (("\u4ebf", 100000000), ("\u4e07", 10000)):
            if unit in num:
                return float(digits) * multiplier

        return float(digits) if "." in digits else int(digits)
class SearchDouyin(Search):
    """
    Import scraped Douyin data
    """
    type = "douyin-search"  # job ID
    category = "Search"  # category
    title = "Import scraped Douyin data"  # title displayed in UI
    description = "Import Douyin data collected with an external tool such as Zeeschuimer."  # description displayed in UI
    extension = "ndjson"  # extension of result file, used internally and in UI
    is_from_zeeschuimer = True

    # not available as a processor for existing datasets
    accepts = [None]
    references = [
        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)"
    ]

    def get_items(self, query):
        """
        Run custom search

        Not available for Douyin

        :param query:  Search query; not used
        :raises NotImplementedError:  Always
        """
        raise NotImplementedError("Douyin datasets can only be created by importing data from elsewhere")

    @staticmethod
    def map_item(item):
        """
        Map a raw Douyin item to a flat 4CAT item

        Two item layouts are handled: items flagged with
        `ZS_collected_from_embed` (camelCase keys) and all other items
        (snake_case keys). In both layouts an item is mapped as a "Stream"
        if it carries stream `rawdata`, and as a "Post" otherwise.

        :param dict item:  Raw item, including its `__import_meta` metadata
        :return MappedItem:  Mapped item with flat, serialisable fields
        :raises KeyError:  If a field that is read unconditionally (e.g.
          the item ID, description or author nickname) is missing
        """
        # `import urllib` alone does not guarantee that the `parse`
        # submodule is loaded, so import it explicitly where it is used
        import urllib.parse

        def as_dict(value):
            # absent values may be None or a placeholder string such as
            # "$undefined"; treat anything that is not a dict as empty
            return value if isinstance(value, dict) else {}

        def as_list(value):
            # same as as_dict(), for values that are expected to be lists
            return value if isinstance(value, list) else []

        metadata = as_dict(item.get("__import_meta"))
        subject = "Post"

        if item.get("ZS_collected_from_embed"):
            # HTML embedded posts are formatted differently than JSON posts
            stream_data = as_dict(item.get("cellRoom")).get("rawdata")
            if stream_data:
                # These appear to be streams
                subject = "Stream"
                # These may only have the timestamp of the request; only
                # compute that fallback when there is no creation time
                created = stream_data.get("createtime")
                if created is None:
                    created = item.get("requestTime") / 1000
                post_timestamp = datetime.fromtimestamp(created)
                stream_urls = as_dict(as_dict(stream_data.get("stream_url")).get("flv_pull_url"))
                video_url = stream_urls.get("FULL_HD1")
                video_description = stream_data.get("title")
                duration = "Unknown"
                prevent_download = None
                stats = as_dict(stream_data.get("stats"))

                # Author is stream owner
                author = stream_data.get("owner")
                author_sec_key = "sec_uid"
                avatar_thumb_key = "avatar_thumb"
                url_list_key = "url_list"
                is_fake_key = "is_ad_fake"  # have not seen...
            else:
                post_timestamp = datetime.fromtimestamp(item["createTime"])
                video = as_dict(item.get("video"))
                bit_rates = as_list(video.get("bitRateList"))
                if bit_rates:
                    # use the rendition with the highest bit rate
                    best = max(bit_rates, key=lambda d: d.get("bitRate"))
                    # NOTE(review): this prepends "https" without "://";
                    # confirm what prefix `playApi` values carry
                    video_url = "https" + best["playApi"]
                else:
                    # Image galleries do not have video data
                    video_url = ""
                video_description = item["desc"]
                duration = item.get("duration", video.get("duration", "Unknown"))
                prevent_download = "yes" if item["download"]["prevent"] else "no"
                stats = as_dict(item["stats"])

                # Author is, well, author
                author = item["authorInfo"]
                author_sec_key = "secUid"
                avatar_thumb_key = "avatarThumb"
                url_list_key = "urlList"
                is_fake_key = "isAdFake"

            # Embedded Keys
            aweme_id_key = "awemeId"
            group_id_key = "groupId"
            text_extra_key = "textExtra"
            hashtag_key = "hashtagName"
            mention_key = "secUid"
            author_id_key = "authorUserId"
            mix_info_key = "mixInfo"
            mix_id_key = "mixId"
            mix_name_key = "mixName"

            # Stats
            collect_count = stats.get("collectCount", MissingMappedField("Unknown"))
            comment_count = stats.get("commentCount", MissingMappedField("Unknown"))
            digg_count = stats.get("diggCount", MissingMappedField("Unknown"))
            download_count = stats.get("downloadCount", MissingMappedField("Unknown"))
            forward_count = stats.get("forwardCount", MissingMappedField("Unknown"))
            play_count = stats.get("playCount", MissingMappedField("Unknown"))
            share_count = stats.get("shareCount", MissingMappedField("Unknown"))

            # This is a guess, I have not encountered it
            video_tags = ",".join(
                [tag["tagName"] for tag in as_list(item.get("videoTag")) if "tagName" in tag])

            mix_current_episode = as_dict(item.get(mix_info_key)).get("currentEpisode", "N/A")

        else:
            stream_data = item.get("rawdata", as_dict(item.get("cell_room")).get("rawdata"))
            if stream_data:
                subject = "Stream"
                stream_data = json.loads(stream_data)
                # Some posts appear to have no timestamp! We substitute
                # collection time, but only when it is actually needed
                created = stream_data.get("create_time", item.get("create_time"))
                if created is None:
                    created = metadata.get("timestamp_collected") / 1000
                post_timestamp = datetime.fromtimestamp(created)
                stream_urls = as_dict(as_dict(stream_data.get("stream_url")).get("flv_pull_url"))
                video_url = stream_urls.get("FULL_HD1")
                video_description = stream_data.get("title")
                duration = "Unknown"

                # Author is stream owner
                author = stream_data.get("owner")
                stats = stream_data.get("stats")
            else:
                post_timestamp = datetime.fromtimestamp(item["create_time"])
                video = as_dict(item.get("video"))
                bit_rates = as_list(video.get("bit_rate"))
                if bit_rates:
                    # use the rendition with the highest bit rate
                    best = max(bit_rates, key=lambda d: d.get("bit_rate"))
                    video_url = (best["play_addr"].get("url_list") or [""])[-1]
                else:
                    # Image galleries do not have video data
                    video_url = ""
                video_description = item["desc"]
                duration = item.get("duration", video.get("duration", "Unknown"))

                # Author is, well, author
                author = item["author"]
                stats = item.get("statistics")

            # statistics may be absent; an empty dict keeps the lookups
            # below (including the stream stats) from failing
            stats = as_dict(stats)

            prevent_download = ("yes" if item["prevent_download"] else "no") if "prevent_download" in item else None

            # Keys
            aweme_id_key = "aweme_id"
            group_id_key = "group_id"
            text_extra_key = "text_extra"
            hashtag_key = "hashtag_name"
            mention_key = "sec_uid"
            author_id_key = "author_user_id"
            mix_info_key = "mix_info"
            mix_id_key = "mix_id"
            mix_name_key = "mix_name"

            author_sec_key = "sec_uid"
            avatar_thumb_key = "avatar_thumb"
            url_list_key = "url_list"
            is_fake_key = "is_ad_fake"

            # Stats
            collect_count = stats.get("collect_count") if stats else MissingMappedField("Unknown")
            comment_count = stats.get("comment_count") if stats else MissingMappedField("Unknown")
            digg_count = stats.get("digg_count") if stats else MissingMappedField("Unknown")
            download_count = stats.get("download_count") if stats else MissingMappedField("Unknown")
            forward_count = stats.get("forward_count") if stats else MissingMappedField("Unknown")
            play_count = stats.get("play_count") if stats else MissingMappedField("Unknown")
            share_count = stats.get("share_count") if stats else MissingMappedField("Unknown")

            # NOTE(review): tags are read from the item's "video_tag" list
            # for posts and streams alike; the stream payload's
            # "video_feed_tag" is not used
            video_tags = ",".join(
                [tag["tag_name"] for tag in as_list(item.get("video_tag")) if "tag_name" in tag])

            mix_current_episode = as_dict(as_dict(item.get(mix_info_key)).get("statis")).get("current_episode", "N/A")

        # Stream Stats
        count_total_streams_viewers = stats.get("total_user", "N/A")
        count_current_stream_viewers = SearchDouyin.get_chinese_number(
            stats.get("user_count_str")) if "user_count_str" in stats else "N/A"

        # Some videos are collected from "mixes"/"collections"; only the
        # first video is definitely displayed while others may or may not
        # be viewed
        displayed = not (item.get("ZS_collected_from_mix") and not item.get("ZS_first_mix_vid"))

        # Image galleries have been added to Douyin
        image_urls = []
        for img in as_list(item.get("images")):
            urls = as_dict(img).get("url_list") or as_dict(img).get("urlList")
            if urls:
                image_urls.append(urls[0])

        # Music
        music = as_dict(item.get("music"))
        music_author = music.get("author") if music else ""
        music_title = music.get("title") if music else ""
        music_url = as_dict(music.get("play_url")).get("uri") if music else ""

        # Collection
        mix_info = as_dict(item.get(mix_info_key))
        mix_current_episode = mix_current_episode if mix_current_episode != "$undefined" else "N/A"
        collection_id = mix_info.get(mix_id_key, "N/A")
        collection_id = collection_id if collection_id != "$undefined" else "N/A"
        collection_name = mix_info.get(mix_name_key, "N/A")
        collection_name = collection_name if collection_name != "$undefined" else "N/A"
        part_of_collection = "yes" if mix_id_key in mix_info and collection_id != "N/A" else "no"

        # hashtags and mentions are both stored in the "text extra" list
        text_extra = as_list(item.get(text_extra_key))

        return MappedItem({
            "id": item[aweme_id_key],
            "thread_id": item[group_id_key],
            "subject": subject,
            "body": video_description,
            "timestamp": post_timestamp.strftime("%Y-%m-%d %H:%M:%S"),
            "post_source_domain": urllib.parse.unquote(metadata.get("source_platform_url") or ""),
            # Adding this as different Douyin pages contain different data
            "post_url": f"https://www.douyin.com/video/{item[aweme_id_key]}" if subject == "Post" else f"https://live.douyin.com/{author.get('web_rid')}",
            "region": item.get("region", ""),
            "hashtags": ",".join([tag[hashtag_key] for tag in text_extra if hashtag_key in tag]),
            # Actual username does not appear in object, but the sec_uid
            # can be used to form a link to their profile
            "mentions": ",".join(
                [f"https://www.douyin.com/user/{tag[mention_key]}" for tag in text_extra if mention_key in tag]),
            "video_tags": video_tags,
            "prevent_download": prevent_download,
            "video_url": video_url,
            "video_duration": duration,
            "image_urls": ','.join(image_urls),
            "music_author": music_author,
            "music_title": music_title,
            "music_url": music_url,
            # Video stats
            "collect_count": collect_count,
            "comment_count": comment_count,
            "digg_count": digg_count,
            "download_count": download_count,
            "forward_count": forward_count,
            "play_count": play_count,
            "share_count": share_count,
            "count_total_streams_viewers": count_total_streams_viewers,
            "count_current_stream_viewers": count_current_stream_viewers,
            # Author data
            "author_user_id": item[author_id_key] if author_id_key in item else author.get("uid", author.get("id")),
            "author_nickname": author["nickname"],
            "author_profile_url": f"https://www.douyin.com/user/{author[author_sec_key]}",
            "author_thumbnail_url": (author[avatar_thumb_key].get(url_list_key) or [""])[0],
            "author_region": author.get("region"),
            "author_is_ad_fake": author.get(is_fake_key),
            # Collection/Mix
            "part_of_collection": part_of_collection,
            # other videos may have been viewed, but this is unknown to us
            "4CAT_first_video_displayed": "yes" if displayed else "no",
            "collection_id": collection_id,
            "collection_name": collection_name,
            "place_in_collection": mix_current_episode,
            "unix_timestamp": int(post_timestamp.timestamp()),
        })

    @staticmethod
    def get_chinese_number(num):
        """
        Convert a count that may use Chinese unit characters to a number

        :param num:  Count as a number or string, e.g. `1234`, `"1234"` or
          a string such as `"1.5"` followed by the character for 10,000
        :return:  `num` itself if it is an int or float; 0 if it is neither
          a number nor a string, or if it contains no digits; otherwise the
          parsed value, which is a float if the string has a unit character
          or a decimal point, and an int if not
        """
        if isinstance(num, bool) or not isinstance(num, (int, float, str)):
            return 0
        if not isinstance(num, str):
            return num

        digits = re.sub(r"[^0-9.]", "", num)
        if not digits.strip("."):
            # nothing numeric in the string
            return 0

        # U+4EBF is the character for 100,000,000 and U+4E07 the one for
        # 10,000; escapes are used so the check survives re-encoding
        for unit, multiplier in (("\u4ebf", 100000000), ("\u4e07", 10000)):
            if unit in num:
                return float(digits) * multiplier

        return float(digits) if "." in digits else int(digits)
Import scraped Douyin data
references =
['[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)', '[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)']
def
get_items(self, query):
def get_items(self, query):
    """
    Run custom search

    Searching is not available for Douyin; datasets for this data source
    are created by importing data, so this always raises.

    :param query:  Search query; not used
    :raises NotImplementedError:  Always
    """
    reason = "Douyin datasets can only be created by importing data from elsewhere"
    raise NotImplementedError(reason)
Run custom search
Not available for Douyin
@staticmethod
def
map_item(item):
@staticmethod
def map_item(item):
    """
    Map a raw Douyin item to a flat 4CAT item

    Two item layouts are handled: items flagged with
    `ZS_collected_from_embed` (camelCase keys) and all other items
    (snake_case keys). In both layouts an item is mapped as a "Stream" if
    it carries stream `rawdata`, and as a "Post" otherwise.

    :param dict item:  Raw item, including its `__import_meta` metadata
    :return MappedItem:  Mapped item with flat, serialisable fields
    :raises KeyError:  If a field that is read unconditionally (e.g. the
      item ID, description or author nickname) is missing
    """
    # `import urllib` alone does not guarantee that the `parse` submodule
    # is loaded, so import it explicitly where it is used
    import urllib.parse

    def as_dict(value):
        # absent values may be None or a placeholder string such as
        # "$undefined"; treat anything that is not a dict as empty
        return value if isinstance(value, dict) else {}

    def as_list(value):
        # same as as_dict(), for values that are expected to be lists
        return value if isinstance(value, list) else []

    metadata = as_dict(item.get("__import_meta"))
    subject = "Post"

    if item.get("ZS_collected_from_embed"):
        # HTML embedded posts are formatted differently than JSON posts
        stream_data = as_dict(item.get("cellRoom")).get("rawdata")
        if stream_data:
            # These appear to be streams
            subject = "Stream"
            # These may only have the timestamp of the request; only
            # compute that fallback when there is no creation time
            created = stream_data.get("createtime")
            if created is None:
                created = item.get("requestTime") / 1000
            post_timestamp = datetime.fromtimestamp(created)
            stream_urls = as_dict(as_dict(stream_data.get("stream_url")).get("flv_pull_url"))
            video_url = stream_urls.get("FULL_HD1")
            video_description = stream_data.get("title")
            duration = "Unknown"
            prevent_download = None
            stats = as_dict(stream_data.get("stats"))

            # Author is stream owner
            author = stream_data.get("owner")
            author_sec_key = "sec_uid"
            avatar_thumb_key = "avatar_thumb"
            url_list_key = "url_list"
            is_fake_key = "is_ad_fake"  # have not seen...
        else:
            post_timestamp = datetime.fromtimestamp(item["createTime"])
            video = as_dict(item.get("video"))
            bit_rates = as_list(video.get("bitRateList"))
            if bit_rates:
                # use the rendition with the highest bit rate
                best = max(bit_rates, key=lambda d: d.get("bitRate"))
                # NOTE(review): this prepends "https" without "://";
                # confirm what prefix `playApi` values carry
                video_url = "https" + best["playApi"]
            else:
                # Image galleries do not have video data
                video_url = ""
            video_description = item["desc"]
            duration = item.get("duration", video.get("duration", "Unknown"))
            prevent_download = "yes" if item["download"]["prevent"] else "no"
            stats = as_dict(item["stats"])

            # Author is, well, author
            author = item["authorInfo"]
            author_sec_key = "secUid"
            avatar_thumb_key = "avatarThumb"
            url_list_key = "urlList"
            is_fake_key = "isAdFake"

        # Embedded Keys
        aweme_id_key = "awemeId"
        group_id_key = "groupId"
        text_extra_key = "textExtra"
        hashtag_key = "hashtagName"
        mention_key = "secUid"
        author_id_key = "authorUserId"
        mix_info_key = "mixInfo"
        mix_id_key = "mixId"
        mix_name_key = "mixName"

        # Stats
        collect_count = stats.get("collectCount", MissingMappedField("Unknown"))
        comment_count = stats.get("commentCount", MissingMappedField("Unknown"))
        digg_count = stats.get("diggCount", MissingMappedField("Unknown"))
        download_count = stats.get("downloadCount", MissingMappedField("Unknown"))
        forward_count = stats.get("forwardCount", MissingMappedField("Unknown"))
        play_count = stats.get("playCount", MissingMappedField("Unknown"))
        share_count = stats.get("shareCount", MissingMappedField("Unknown"))

        # This is a guess, I have not encountered it
        video_tags = ",".join(
            [tag["tagName"] for tag in as_list(item.get("videoTag")) if "tagName" in tag])

        mix_current_episode = as_dict(item.get(mix_info_key)).get("currentEpisode", "N/A")

    else:
        stream_data = item.get("rawdata", as_dict(item.get("cell_room")).get("rawdata"))
        if stream_data:
            subject = "Stream"
            stream_data = json.loads(stream_data)
            # Some posts appear to have no timestamp! We substitute
            # collection time, but only when it is actually needed
            created = stream_data.get("create_time", item.get("create_time"))
            if created is None:
                created = metadata.get("timestamp_collected") / 1000
            post_timestamp = datetime.fromtimestamp(created)
            stream_urls = as_dict(as_dict(stream_data.get("stream_url")).get("flv_pull_url"))
            video_url = stream_urls.get("FULL_HD1")
            video_description = stream_data.get("title")
            duration = "Unknown"

            # Author is stream owner
            author = stream_data.get("owner")
            stats = stream_data.get("stats")
        else:
            post_timestamp = datetime.fromtimestamp(item["create_time"])
            video = as_dict(item.get("video"))
            bit_rates = as_list(video.get("bit_rate"))
            if bit_rates:
                # use the rendition with the highest bit rate
                best = max(bit_rates, key=lambda d: d.get("bit_rate"))
                video_url = (best["play_addr"].get("url_list") or [""])[-1]
            else:
                # Image galleries do not have video data
                video_url = ""
            video_description = item["desc"]
            duration = item.get("duration", video.get("duration", "Unknown"))

            # Author is, well, author
            author = item["author"]
            stats = item.get("statistics")

        # statistics may be absent; an empty dict keeps the lookups below
        # (including the stream stats) from failing
        stats = as_dict(stats)

        prevent_download = ("yes" if item["prevent_download"] else "no") if "prevent_download" in item else None

        # Keys
        aweme_id_key = "aweme_id"
        group_id_key = "group_id"
        text_extra_key = "text_extra"
        hashtag_key = "hashtag_name"
        mention_key = "sec_uid"
        author_id_key = "author_user_id"
        mix_info_key = "mix_info"
        mix_id_key = "mix_id"
        mix_name_key = "mix_name"

        author_sec_key = "sec_uid"
        avatar_thumb_key = "avatar_thumb"
        url_list_key = "url_list"
        is_fake_key = "is_ad_fake"

        # Stats
        collect_count = stats.get("collect_count") if stats else MissingMappedField("Unknown")
        comment_count = stats.get("comment_count") if stats else MissingMappedField("Unknown")
        digg_count = stats.get("digg_count") if stats else MissingMappedField("Unknown")
        download_count = stats.get("download_count") if stats else MissingMappedField("Unknown")
        forward_count = stats.get("forward_count") if stats else MissingMappedField("Unknown")
        play_count = stats.get("play_count") if stats else MissingMappedField("Unknown")
        share_count = stats.get("share_count") if stats else MissingMappedField("Unknown")

        # NOTE(review): tags are read from the item's "video_tag" list for
        # posts and streams alike; the stream payload's "video_feed_tag"
        # is not used
        video_tags = ",".join(
            [tag["tag_name"] for tag in as_list(item.get("video_tag")) if "tag_name" in tag])

        mix_current_episode = as_dict(as_dict(item.get(mix_info_key)).get("statis")).get("current_episode", "N/A")

    # Stream Stats
    count_total_streams_viewers = stats.get("total_user", "N/A")
    count_current_stream_viewers = SearchDouyin.get_chinese_number(
        stats.get("user_count_str")) if "user_count_str" in stats else "N/A"

    # Some videos are collected from "mixes"/"collections"; only the first
    # video is definitely displayed while others may or may not be viewed
    displayed = not (item.get("ZS_collected_from_mix") and not item.get("ZS_first_mix_vid"))

    # Image galleries have been added to Douyin
    image_urls = []
    for img in as_list(item.get("images")):
        urls = as_dict(img).get("url_list") or as_dict(img).get("urlList")
        if urls:
            image_urls.append(urls[0])

    # Music
    music = as_dict(item.get("music"))
    music_author = music.get("author") if music else ""
    music_title = music.get("title") if music else ""
    music_url = as_dict(music.get("play_url")).get("uri") if music else ""

    # Collection
    mix_info = as_dict(item.get(mix_info_key))
    mix_current_episode = mix_current_episode if mix_current_episode != "$undefined" else "N/A"
    collection_id = mix_info.get(mix_id_key, "N/A")
    collection_id = collection_id if collection_id != "$undefined" else "N/A"
    collection_name = mix_info.get(mix_name_key, "N/A")
    collection_name = collection_name if collection_name != "$undefined" else "N/A"
    part_of_collection = "yes" if mix_id_key in mix_info and collection_id != "N/A" else "no"

    # hashtags and mentions are both stored in the "text extra" list
    text_extra = as_list(item.get(text_extra_key))

    return MappedItem({
        "id": item[aweme_id_key],
        "thread_id": item[group_id_key],
        "subject": subject,
        "body": video_description,
        "timestamp": post_timestamp.strftime("%Y-%m-%d %H:%M:%S"),
        "post_source_domain": urllib.parse.unquote(metadata.get("source_platform_url") or ""),
        # Adding this as different Douyin pages contain different data
        "post_url": f"https://www.douyin.com/video/{item[aweme_id_key]}" if subject == "Post" else f"https://live.douyin.com/{author.get('web_rid')}",
        "region": item.get("region", ""),
        "hashtags": ",".join([tag[hashtag_key] for tag in text_extra if hashtag_key in tag]),
        # Actual username does not appear in object, but the sec_uid can
        # be used to form a link to their profile
        "mentions": ",".join(
            [f"https://www.douyin.com/user/{tag[mention_key]}" for tag in text_extra if mention_key in tag]),
        "video_tags": video_tags,
        "prevent_download": prevent_download,
        "video_url": video_url,
        "video_duration": duration,
        "image_urls": ','.join(image_urls),
        "music_author": music_author,
        "music_title": music_title,
        "music_url": music_url,
        # Video stats
        "collect_count": collect_count,
        "comment_count": comment_count,
        "digg_count": digg_count,
        "download_count": download_count,
        "forward_count": forward_count,
        "play_count": play_count,
        "share_count": share_count,
        "count_total_streams_viewers": count_total_streams_viewers,
        "count_current_stream_viewers": count_current_stream_viewers,
        # Author data
        "author_user_id": item[author_id_key] if author_id_key in item else author.get("uid", author.get("id")),
        "author_nickname": author["nickname"],
        "author_profile_url": f"https://www.douyin.com/user/{author[author_sec_key]}",
        "author_thumbnail_url": (author[avatar_thumb_key].get(url_list_key) or [""])[0],
        "author_region": author.get("region"),
        "author_is_ad_fake": author.get(is_fake_key),
        # Collection/Mix
        "part_of_collection": part_of_collection,
        # other videos may have been viewed, but this is unknown to us
        "4CAT_first_video_displayed": "yes" if displayed else "no",
        "collection_id": collection_id,
        "collection_name": collection_name,
        "place_in_collection": mix_current_episode,
        "unix_timestamp": int(post_timestamp.timestamp()),
    })
Inherited Members
- backend.lib.worker.BasicWorker
- BasicWorker
- INTERRUPT_NONE
- INTERRUPT_RETRY
- INTERRUPT_CANCEL
- queue
- log
- manager
- interrupted
- modules
- init_time
- name
- run
- clean_up
- request_interrupt
- is_4cat_class
- backend.lib.search.Search
- max_workers
- prefix
- return_cols
- import_error_count
- import_warning_count
- process
- search
- import_from_file
- items_to_csv
- items_to_ndjson
- items_to_archive
- backend.lib.processor.BasicProcessor
- db
- job
- dataset
- owner
- source_dataset
- source_file
- config
- is_running_in_preset
- filepath
- work
- after_process
- remove_files
- abort
- add_field_to_parent
- iterate_archive_contents
- unpack_archive_contents
- extract_archived_file_by_name
- write_csv_items_and_finish
- write_archive_and_finish
- create_standalone
- map_item_method_available
- get_mapped_item
- is_filter
- get_options
- get_status
- is_top_dataset
- is_from_collector
- get_extension
- is_rankable
- exclude_followup_processors
- is_4cat_processor