Edit on GitHub

datasources.douyin.search_douyin

Import scraped Douyin data

  1"""
  2Import scraped Douyin data
  3"""
  4import urllib
  5import json
  6import re
  7from datetime import datetime
  8
  9from backend.lib.search import Search
 10from common.lib.item_mapping import MappedItem, MissingMappedField
 11
 12class SearchDouyin(Search):
 13    """
 14    Import scraped Douyin data
 15    """
 16    type = "douyin-search"  # job ID
 17    category = "Search"  # category
 18    title = "Import scraped Douyin data"  # title displayed in UI
 19    description = "Import Douyin data collected with an external tool such as Zeeschuimer."  # description displayed in UI
 20    extension = "ndjson"  # extension of result file, used internally and in UI
 21    is_from_zeeschuimer = True
 22
 23    # not available as a processor for existing datasets
 24    accepts = [None]
 25    references = [
 26        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
 27        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)"
 28    ]
 29    
 30    def get_items(self, query):
 31        """
 32        Run custom search
 33
 34        Not available for Douyin
 35        """
 36        raise NotImplementedError("Douyin datasets can only be created by importing data from elsewhere")
 37
 38    @staticmethod
 39    def map_item(item):
 40        """
 41        """
 42        metadata = item.get("__import_meta")
 43        subject = "Post"
 44        if "ZS_collected_from_embed" in item and item["ZS_collected_from_embed"]:
 45            # HTML embedded posts formated differently than JSON posts
 46
 47            stream_data = item.get("cellRoom", {}).get("rawdata") if item.get("cellRoom") != "$undefined" else {}
 48            if stream_data:
 49                # These appear to be streams
 50                subject = "Stream"
 51                post_timestamp = datetime.fromtimestamp(stream_data.get("createtime", item.get(
 52                    "requestTime") / 1000))  # These may only have the timestamp of the request
 53                video_url = stream_data.get("stream_url").get("flv_pull_url", {}).get("FULL_HD1")
 54                video_description = stream_data.get("title")
 55                duration = "Unknown"
 56                prevent_download = None
 57                stats = stream_data.get("stats")
 58
 59                # Author is stream owner
 60                author = stream_data.get("owner")
 61                author_sec_key = "sec_uid"
 62                avatar_thumb_key = "avatar_thumb"
 63                url_list_key = "url_list"
 64                is_fake_key = "is_ad_fake"  # have not seen...
 65            else:
 66                post_timestamp = datetime.fromtimestamp(item["createTime"])
 67                videos_list = item.get("video").get("bitRateList")
 68                if not videos_list:
 69                    # Image galleries do not have video data
 70                    video_url = ""
 71                else:
 72                    videos = sorted([vid for vid in item.get("video").get("bitRateList")], key=lambda d: d.get("bitRate"),
 73                                reverse=True)
 74                    video_url = "https" + videos[0]["playApi"]
 75                video_description = item["desc"]
 76                duration = item.get("duration", item.get("video", {}).get("duration", "Unknown"))
 77                prevent_download = "yes" if item["download"]["prevent"] else "no"
 78                stats = item["stats"]
 79
 80                # Author is, well, author
 81                author = item["authorInfo"]
 82                author_sec_key = "secUid"
 83                avatar_thumb_key = "avatarThumb"
 84                url_list_key = "urlList"
 85                is_fake_key = "isAdFake"
 86
 87            # Embedded Keys
 88            aweme_id_key = "awemeId"
 89            group_id_key = "groupId"
 90            text_extra_key = "textExtra"
 91            hashtag_key = "hashtagName"
 92            mention_key = "secUid"
 93            author_id_key = "authorUserId"
 94            mix_info_key = "mixInfo"
 95            mix_id_key = "mixId"
 96            mix_name_key = "mixName"
 97
 98            # Stats
 99            collect_count = stats.get("collectCount", MissingMappedField("Unknown"))
100            comment_count = stats.get("commentCount", MissingMappedField("Unknown"))
101            digg_count = stats.get("diggCount", MissingMappedField("Unknown"))
102            download_count = stats.get("downloadCount", MissingMappedField("Unknown"))
103            forward_count = stats.get("forwardCount", MissingMappedField("Unknown"))
104            play_count = stats.get("playCount", MissingMappedField("Unknown"))
105            share_count = stats.get("shareCount", MissingMappedField("Unknown"))
106            # live_watch_count = stats.get("liveWatchCount", MissingMappedField("Unknown"))
107
108            # This is a guess, I have not encountered it
109            video_tags = ",".join([tag["tagName"] for tag in item.get("videoTag", []) if "tagName" in tag])
110
111            mix_current_episode = item.get(mix_info_key, {}).get("currentEpisode", "N/A")
112
113        else:
114            stream_data = item.get("rawdata", item.get("cell_room", {}).get("rawdata"))
115            if stream_data:
116                subject = "Stream"
117                stream_data = json.loads(stream_data)
118                post_timestamp = datetime.fromtimestamp(
119                    stream_data.get("create_time", item.get("create_time", metadata.get(
120                        "timestamp_collected") / 1000)))  # Some posts appear to have no timestamp! We substitute collection time
121                video_url = stream_data.get("stream_url").get("flv_pull_url", {}).get("FULL_HD1")
122                video_description = stream_data.get("title")
123                duration = "Unknown"
124
125                # Author is stream owner
126                author = stream_data.get("owner")
127                video_tags = stream_data.get("video_feed_tag")
128                stats = stream_data.get("stats")
129
130            else:
131                post_timestamp = datetime.fromtimestamp(item["create_time"])
132                videos_list = item.get("video").get("bit_rate")
133                if not videos_list:
134                    # Image galleries do not have video data
135                    video_url = ""
136                else:
137                    videos = sorted([vid for vid in item["video"]["bit_rate"]], key=lambda d: d.get("bit_rate"),
138                                reverse=True)
139                    video_url = videos[0]["play_addr"].get("url_list", [''])[-1] if len(videos) > 0 else ""
140                video_description = item["desc"]
141                duration = item.get("duration", item.get("video", {}).get("duration", "Unknown"))
142
143                # Author is, well, author
144                author = item["author"]
145                stats = item.get("statistics")
146
147            prevent_download = ("yes" if item["prevent_download"] else "no") if "prevent_download" in item else None
148
149            # Keys
150            aweme_id_key = "aweme_id"
151            group_id_key = "group_id"
152            text_extra_key = "text_extra"
153            hashtag_key = "hashtag_name"
154            mention_key = "sec_uid"
155            author_id_key = "author_user_id"
156            mix_info_key = "mix_info"
157            mix_id_key = "mix_id"
158            mix_name_key = "mix_name"
159
160            author_sec_key = "sec_uid"
161            avatar_thumb_key = "avatar_thumb"
162            url_list_key = "url_list"
163            is_fake_key = "is_ad_fake"
164
165            # Stats
166            collect_count = stats.get("collect_count") if stats else MissingMappedField("Unknown")
167            comment_count = stats.get("comment_count") if stats else MissingMappedField("Unknown")
168            digg_count = stats.get("digg_count") if stats else MissingMappedField("Unknown")
169            download_count = stats.get("download_count") if stats else MissingMappedField("Unknown")
170            forward_count = stats.get("forward_count") if stats else MissingMappedField("Unknown")
171            play_count = stats.get("play_count") if stats else MissingMappedField("Unknown")
172            share_count = stats.get("share_count") if stats else MissingMappedField("Unknown")
173            # live_watch_count = stats.get("live_watch_count") if stats else MissingMappedField("Unknown")
174
175            video_tags = ",".join(
176                [tag["tag_name"] for tag in (item["video_tag"] if item["video_tag"] is not None else []) if
177                 "tag_name" in tag])
178
179            mix_current_episode = item.get(mix_info_key).get("statis", {}).get("current_episode", "N/A") if item.get(
180                mix_info_key) else "N/A"
181
182        # Stream Stats
183        count_total_streams_viewers = stats.get("total_user", "N/A")
184        count_current_stream_viewers = SearchDouyin.get_chinese_number(stats.get("user_count_str")) if "user_count_str" in stats else "N/A"
185
186        # Some videos are collected from "mixes"/"collections"; only the first video is definitely displayed while others may or may not be viewed
187        displayed = True
188        if item.get("ZS_collected_from_mix") and not item.get("ZS_first_mix_vid"):
189            displayed = False
190
191        # Image galleries have been added to Douyin
192        image_urls = []
193        if item.get("images"):
194            for img in item["images"]:
195                if "url_list" in img:
196                    image_urls.append(img["url_list"][0])
197                elif "urlList" in img:
198                    image_urls.append(img["urlList"][0])
199
200        # Music
201        music_author = item.get('music').get('author') if item.get('music') and item.get("music") != "$undefined" else ""
202        music_title = item.get('music').get('title') if item.get('music') and item.get("music") != "$undefined" else ""
203        music_url = item.get('music').get('play_url', {}).get('uri') if item.get('music') and item.get("music") != "$undefined" else ""
204
205        # Collection
206        mix_current_episode = mix_current_episode if mix_current_episode != "$undefined" else "N/A"
207        collection_id = item.get(mix_info_key).get(mix_id_key, "N/A") if item.get(mix_info_key) else "N/A"
208        collection_id = collection_id if collection_id != "$undefined" else "N/A"
209        collection_name = item.get(mix_info_key).get(mix_name_key, "N/A") if item.get(mix_info_key) else "N/A"
210        collection_name = collection_name if collection_name != "$undefined" else "N/A"
211        part_of_collection = "yes" if  item.get(mix_info_key) and mix_id_key in item[
212            mix_info_key] and collection_id != "N/A" else "no"
213
214        return MappedItem({
215            "id": item[aweme_id_key],
216            "thread_id": item[group_id_key],
217            "subject": subject,
218            "body": video_description,
219            "timestamp": post_timestamp.strftime("%Y-%m-%d %H:%M:%S"),
220            "post_source_domain": urllib.parse.unquote(metadata.get("source_platform_url")),
221            # Adding this as different Douyin pages contain different data
222            "post_url": f"https://www.douyin.com/video/{item[aweme_id_key]}" if subject == "Post" else f"https://live.douyin.com/{author.get('web_rid')}",
223            "region": item.get("region", ""),
224            "hashtags": ",".join(
225                [tag[hashtag_key] for tag in (item[text_extra_key] if item[text_extra_key] is not None else []) if
226                 tag.get(hashtag_key)]),
227            "mentions": ",".join([f"https://www.douyin.com/user/{tag[mention_key]}" for tag in
228                                  (item[text_extra_key] if item[text_extra_key] is not None else []) if
229                                  tag.get(mention_key)]),
230            # Actual username does not appear in object, but the sec_uid can be used to form a link to their profile
231            "video_tags": video_tags,
232            "prevent_download": prevent_download,
233            "video_url": video_url,
234            "video_duration": duration,
235            "image_urls": ','.join(image_urls),
236            "music_author": music_author,
237            "music_title": music_title,
238            "music_url": music_url,
239            # Video stats
240            "collect_count": collect_count,
241            "comment_count": comment_count,
242            "digg_count": digg_count,
243            "download_count": download_count,
244            "forward_count": forward_count,
245            "play_count": play_count,
246            "share_count": share_count,
247            "count_total_streams_viewers": count_total_streams_viewers,
248            "count_current_stream_viewers": count_current_stream_viewers,
249            # Author data
250            "author_user_id": item[author_id_key] if author_id_key in item else author.get("uid", author.get("id")),
251            "author_nickname": author["nickname"],
252            "author_profile_url": f"https://www.douyin.com/user/{author[author_sec_key]}",
253            "author_thumbnail_url": author[avatar_thumb_key].get(url_list_key, [''])[0],
254            "author_region": author.get("region"),
255            "author_is_ad_fake": author.get(is_fake_key),
256            # Collection/Mix
257            "part_of_collection": part_of_collection,
258            "4CAT_first_video_displayed": "yes" if displayed else "no",
259            # other videos may have been viewed, but this is unknown to us
260            "collection_id": collection_id,
261            "collection_name": collection_name,
262            "place_in_collection": mix_current_episode,
263            "unix_timestamp": int(post_timestamp.timestamp()),
264        })
265
266    @staticmethod
267    def get_chinese_number(num):
268        if type(num) in (float, int):
269            return num
270        elif type(num) is not str:
271            return 0
272
273        if "δΈ‡" in num:
274            return float(re.sub(r"[^0-9.]", "", num)) * 10000
275        else:
276            return int(re.sub(r"[^0-9.]", "", num))
class SearchDouyin(backend.lib.search.Search):
 13class SearchDouyin(Search):
 14    """
 15    Import scraped Douyin data
 16    """
 17    type = "douyin-search"  # job ID
 18    category = "Search"  # category
 19    title = "Import scraped Douyin data"  # title displayed in UI
 20    description = "Import Douyin data collected with an external tool such as Zeeschuimer."  # description displayed in UI
 21    extension = "ndjson"  # extension of result file, used internally and in UI
 22    is_from_zeeschuimer = True
 23
 24    # not available as a processor for existing datasets
 25    accepts = [None]
 26    references = [
 27        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
 28        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)"
 29    ]
 30    
 31    def get_items(self, query):
 32        """
 33        Run custom search
 34
 35        Not available for Douyin
 36        """
 37        raise NotImplementedError("Douyin datasets can only be created by importing data from elsewhere")
 38
 39    @staticmethod
 40    def map_item(item):
 41        """
 42        """
 43        metadata = item.get("__import_meta")
 44        subject = "Post"
 45        if "ZS_collected_from_embed" in item and item["ZS_collected_from_embed"]:
 46            # HTML embedded posts formated differently than JSON posts
 47
 48            stream_data = item.get("cellRoom", {}).get("rawdata") if item.get("cellRoom") != "$undefined" else {}
 49            if stream_data:
 50                # These appear to be streams
 51                subject = "Stream"
 52                post_timestamp = datetime.fromtimestamp(stream_data.get("createtime", item.get(
 53                    "requestTime") / 1000))  # These may only have the timestamp of the request
 54                video_url = stream_data.get("stream_url").get("flv_pull_url", {}).get("FULL_HD1")
 55                video_description = stream_data.get("title")
 56                duration = "Unknown"
 57                prevent_download = None
 58                stats = stream_data.get("stats")
 59
 60                # Author is stream owner
 61                author = stream_data.get("owner")
 62                author_sec_key = "sec_uid"
 63                avatar_thumb_key = "avatar_thumb"
 64                url_list_key = "url_list"
 65                is_fake_key = "is_ad_fake"  # have not seen...
 66            else:
 67                post_timestamp = datetime.fromtimestamp(item["createTime"])
 68                videos_list = item.get("video").get("bitRateList")
 69                if not videos_list:
 70                    # Image galleries do not have video data
 71                    video_url = ""
 72                else:
 73                    videos = sorted([vid for vid in item.get("video").get("bitRateList")], key=lambda d: d.get("bitRate"),
 74                                reverse=True)
 75                    video_url = "https" + videos[0]["playApi"]
 76                video_description = item["desc"]
 77                duration = item.get("duration", item.get("video", {}).get("duration", "Unknown"))
 78                prevent_download = "yes" if item["download"]["prevent"] else "no"
 79                stats = item["stats"]
 80
 81                # Author is, well, author
 82                author = item["authorInfo"]
 83                author_sec_key = "secUid"
 84                avatar_thumb_key = "avatarThumb"
 85                url_list_key = "urlList"
 86                is_fake_key = "isAdFake"
 87
 88            # Embedded Keys
 89            aweme_id_key = "awemeId"
 90            group_id_key = "groupId"
 91            text_extra_key = "textExtra"
 92            hashtag_key = "hashtagName"
 93            mention_key = "secUid"
 94            author_id_key = "authorUserId"
 95            mix_info_key = "mixInfo"
 96            mix_id_key = "mixId"
 97            mix_name_key = "mixName"
 98
 99            # Stats
100            collect_count = stats.get("collectCount", MissingMappedField("Unknown"))
101            comment_count = stats.get("commentCount", MissingMappedField("Unknown"))
102            digg_count = stats.get("diggCount", MissingMappedField("Unknown"))
103            download_count = stats.get("downloadCount", MissingMappedField("Unknown"))
104            forward_count = stats.get("forwardCount", MissingMappedField("Unknown"))
105            play_count = stats.get("playCount", MissingMappedField("Unknown"))
106            share_count = stats.get("shareCount", MissingMappedField("Unknown"))
107            # live_watch_count = stats.get("liveWatchCount", MissingMappedField("Unknown"))
108
109            # This is a guess, I have not encountered it
110            video_tags = ",".join([tag["tagName"] for tag in item.get("videoTag", []) if "tagName" in tag])
111
112            mix_current_episode = item.get(mix_info_key, {}).get("currentEpisode", "N/A")
113
114        else:
115            stream_data = item.get("rawdata", item.get("cell_room", {}).get("rawdata"))
116            if stream_data:
117                subject = "Stream"
118                stream_data = json.loads(stream_data)
119                post_timestamp = datetime.fromtimestamp(
120                    stream_data.get("create_time", item.get("create_time", metadata.get(
121                        "timestamp_collected") / 1000)))  # Some posts appear to have no timestamp! We substitute collection time
122                video_url = stream_data.get("stream_url").get("flv_pull_url", {}).get("FULL_HD1")
123                video_description = stream_data.get("title")
124                duration = "Unknown"
125
126                # Author is stream owner
127                author = stream_data.get("owner")
128                video_tags = stream_data.get("video_feed_tag")
129                stats = stream_data.get("stats")
130
131            else:
132                post_timestamp = datetime.fromtimestamp(item["create_time"])
133                videos_list = item.get("video").get("bit_rate")
134                if not videos_list:
135                    # Image galleries do not have video data
136                    video_url = ""
137                else:
138                    videos = sorted([vid for vid in item["video"]["bit_rate"]], key=lambda d: d.get("bit_rate"),
139                                reverse=True)
140                    video_url = videos[0]["play_addr"].get("url_list", [''])[-1] if len(videos) > 0 else ""
141                video_description = item["desc"]
142                duration = item.get("duration", item.get("video", {}).get("duration", "Unknown"))
143
144                # Author is, well, author
145                author = item["author"]
146                stats = item.get("statistics")
147
148            prevent_download = ("yes" if item["prevent_download"] else "no") if "prevent_download" in item else None
149
150            # Keys
151            aweme_id_key = "aweme_id"
152            group_id_key = "group_id"
153            text_extra_key = "text_extra"
154            hashtag_key = "hashtag_name"
155            mention_key = "sec_uid"
156            author_id_key = "author_user_id"
157            mix_info_key = "mix_info"
158            mix_id_key = "mix_id"
159            mix_name_key = "mix_name"
160
161            author_sec_key = "sec_uid"
162            avatar_thumb_key = "avatar_thumb"
163            url_list_key = "url_list"
164            is_fake_key = "is_ad_fake"
165
166            # Stats
167            collect_count = stats.get("collect_count") if stats else MissingMappedField("Unknown")
168            comment_count = stats.get("comment_count") if stats else MissingMappedField("Unknown")
169            digg_count = stats.get("digg_count") if stats else MissingMappedField("Unknown")
170            download_count = stats.get("download_count") if stats else MissingMappedField("Unknown")
171            forward_count = stats.get("forward_count") if stats else MissingMappedField("Unknown")
172            play_count = stats.get("play_count") if stats else MissingMappedField("Unknown")
173            share_count = stats.get("share_count") if stats else MissingMappedField("Unknown")
174            # live_watch_count = stats.get("live_watch_count") if stats else MissingMappedField("Unknown")
175
176            video_tags = ",".join(
177                [tag["tag_name"] for tag in (item["video_tag"] if item["video_tag"] is not None else []) if
178                 "tag_name" in tag])
179
180            mix_current_episode = item.get(mix_info_key).get("statis", {}).get("current_episode", "N/A") if item.get(
181                mix_info_key) else "N/A"
182
183        # Stream Stats
184        count_total_streams_viewers = stats.get("total_user", "N/A")
185        count_current_stream_viewers = SearchDouyin.get_chinese_number(stats.get("user_count_str")) if "user_count_str" in stats else "N/A"
186
187        # Some videos are collected from "mixes"/"collections"; only the first video is definitely displayed while others may or may not be viewed
188        displayed = True
189        if item.get("ZS_collected_from_mix") and not item.get("ZS_first_mix_vid"):
190            displayed = False
191
192        # Image galleries have been added to Douyin
193        image_urls = []
194        if item.get("images"):
195            for img in item["images"]:
196                if "url_list" in img:
197                    image_urls.append(img["url_list"][0])
198                elif "urlList" in img:
199                    image_urls.append(img["urlList"][0])
200
201        # Music
202        music_author = item.get('music').get('author') if item.get('music') and item.get("music") != "$undefined" else ""
203        music_title = item.get('music').get('title') if item.get('music') and item.get("music") != "$undefined" else ""
204        music_url = item.get('music').get('play_url', {}).get('uri') if item.get('music') and item.get("music") != "$undefined" else ""
205
206        # Collection
207        mix_current_episode = mix_current_episode if mix_current_episode != "$undefined" else "N/A"
208        collection_id = item.get(mix_info_key).get(mix_id_key, "N/A") if item.get(mix_info_key) else "N/A"
209        collection_id = collection_id if collection_id != "$undefined" else "N/A"
210        collection_name = item.get(mix_info_key).get(mix_name_key, "N/A") if item.get(mix_info_key) else "N/A"
211        collection_name = collection_name if collection_name != "$undefined" else "N/A"
212        part_of_collection = "yes" if  item.get(mix_info_key) and mix_id_key in item[
213            mix_info_key] and collection_id != "N/A" else "no"
214
215        return MappedItem({
216            "id": item[aweme_id_key],
217            "thread_id": item[group_id_key],
218            "subject": subject,
219            "body": video_description,
220            "timestamp": post_timestamp.strftime("%Y-%m-%d %H:%M:%S"),
221            "post_source_domain": urllib.parse.unquote(metadata.get("source_platform_url")),
222            # Adding this as different Douyin pages contain different data
223            "post_url": f"https://www.douyin.com/video/{item[aweme_id_key]}" if subject == "Post" else f"https://live.douyin.com/{author.get('web_rid')}",
224            "region": item.get("region", ""),
225            "hashtags": ",".join(
226                [tag[hashtag_key] for tag in (item[text_extra_key] if item[text_extra_key] is not None else []) if
227                 tag.get(hashtag_key)]),
228            "mentions": ",".join([f"https://www.douyin.com/user/{tag[mention_key]}" for tag in
229                                  (item[text_extra_key] if item[text_extra_key] is not None else []) if
230                                  tag.get(mention_key)]),
231            # Actual username does not appear in object, but the sec_uid can be used to form a link to their profile
232            "video_tags": video_tags,
233            "prevent_download": prevent_download,
234            "video_url": video_url,
235            "video_duration": duration,
236            "image_urls": ','.join(image_urls),
237            "music_author": music_author,
238            "music_title": music_title,
239            "music_url": music_url,
240            # Video stats
241            "collect_count": collect_count,
242            "comment_count": comment_count,
243            "digg_count": digg_count,
244            "download_count": download_count,
245            "forward_count": forward_count,
246            "play_count": play_count,
247            "share_count": share_count,
248            "count_total_streams_viewers": count_total_streams_viewers,
249            "count_current_stream_viewers": count_current_stream_viewers,
250            # Author data
251            "author_user_id": item[author_id_key] if author_id_key in item else author.get("uid", author.get("id")),
252            "author_nickname": author["nickname"],
253            "author_profile_url": f"https://www.douyin.com/user/{author[author_sec_key]}",
254            "author_thumbnail_url": author[avatar_thumb_key].get(url_list_key, [''])[0],
255            "author_region": author.get("region"),
256            "author_is_ad_fake": author.get(is_fake_key),
257            # Collection/Mix
258            "part_of_collection": part_of_collection,
259            "4CAT_first_video_displayed": "yes" if displayed else "no",
260            # other videos may have been viewed, but this is unknown to us
261            "collection_id": collection_id,
262            "collection_name": collection_name,
263            "place_in_collection": mix_current_episode,
264            "unix_timestamp": int(post_timestamp.timestamp()),
265        })
266
267    @staticmethod
268    def get_chinese_number(num):
269        if type(num) in (float, int):
270            return num
271        elif type(num) is not str:
272            return 0
273
274        if "δΈ‡" in num:
275            return float(re.sub(r"[^0-9.]", "", num)) * 10000
276        else:
277            return int(re.sub(r"[^0-9.]", "", num))

Import scraped Douyin data

type = 'douyin-search'
category = 'Search'
title = 'Import scraped Douyin data'
description = 'Import Douyin data collected with an external tool such as Zeeschuimer.'
extension = 'ndjson'
is_from_zeeschuimer = True
accepts = [None]
references = ['[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)', '[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)']
def get_items(self, query):
31    def get_items(self, query):
32        """
33        Run custom search
34
35        Not available for Douyin
36        """
37        raise NotImplementedError("Douyin datasets can only be created by importing data from elsewhere")

Run custom search

Not available for Douyin

@staticmethod
def map_item(item):
 39    @staticmethod
 40    def map_item(item):
 41        """
 42        """
 43        metadata = item.get("__import_meta")
 44        subject = "Post"
 45        if "ZS_collected_from_embed" in item and item["ZS_collected_from_embed"]:
 46            # HTML embedded posts formated differently than JSON posts
 47
 48            stream_data = item.get("cellRoom", {}).get("rawdata") if item.get("cellRoom") != "$undefined" else {}
 49            if stream_data:
 50                # These appear to be streams
 51                subject = "Stream"
 52                post_timestamp = datetime.fromtimestamp(stream_data.get("createtime", item.get(
 53                    "requestTime") / 1000))  # These may only have the timestamp of the request
 54                video_url = stream_data.get("stream_url").get("flv_pull_url", {}).get("FULL_HD1")
 55                video_description = stream_data.get("title")
 56                duration = "Unknown"
 57                prevent_download = None
 58                stats = stream_data.get("stats")
 59
 60                # Author is stream owner
 61                author = stream_data.get("owner")
 62                author_sec_key = "sec_uid"
 63                avatar_thumb_key = "avatar_thumb"
 64                url_list_key = "url_list"
 65                is_fake_key = "is_ad_fake"  # have not seen...
 66            else:
 67                post_timestamp = datetime.fromtimestamp(item["createTime"])
 68                videos_list = item.get("video").get("bitRateList")
 69                if not videos_list:
 70                    # Image galleries do not have video data
 71                    video_url = ""
 72                else:
 73                    videos = sorted([vid for vid in item.get("video").get("bitRateList")], key=lambda d: d.get("bitRate"),
 74                                reverse=True)
 75                    video_url = "https" + videos[0]["playApi"]
 76                video_description = item["desc"]
 77                duration = item.get("duration", item.get("video", {}).get("duration", "Unknown"))
 78                prevent_download = "yes" if item["download"]["prevent"] else "no"
 79                stats = item["stats"]
 80
 81                # Author is, well, author
 82                author = item["authorInfo"]
 83                author_sec_key = "secUid"
 84                avatar_thumb_key = "avatarThumb"
 85                url_list_key = "urlList"
 86                is_fake_key = "isAdFake"
 87
 88            # Embedded Keys
 89            aweme_id_key = "awemeId"
 90            group_id_key = "groupId"
 91            text_extra_key = "textExtra"
 92            hashtag_key = "hashtagName"
 93            mention_key = "secUid"
 94            author_id_key = "authorUserId"
 95            mix_info_key = "mixInfo"
 96            mix_id_key = "mixId"
 97            mix_name_key = "mixName"
 98
 99            # Stats
100            collect_count = stats.get("collectCount", MissingMappedField("Unknown"))
101            comment_count = stats.get("commentCount", MissingMappedField("Unknown"))
102            digg_count = stats.get("diggCount", MissingMappedField("Unknown"))
103            download_count = stats.get("downloadCount", MissingMappedField("Unknown"))
104            forward_count = stats.get("forwardCount", MissingMappedField("Unknown"))
105            play_count = stats.get("playCount", MissingMappedField("Unknown"))
106            share_count = stats.get("shareCount", MissingMappedField("Unknown"))
107            # live_watch_count = stats.get("liveWatchCount", MissingMappedField("Unknown"))
108
109            # This is a guess, I have not encountered it
110            video_tags = ",".join([tag["tagName"] for tag in item.get("videoTag", []) if "tagName" in tag])
111
112            mix_current_episode = item.get(mix_info_key, {}).get("currentEpisode", "N/A")
113
114        else:
115            stream_data = item.get("rawdata", item.get("cell_room", {}).get("rawdata"))
116            if stream_data:
117                subject = "Stream"
118                stream_data = json.loads(stream_data)
119                post_timestamp = datetime.fromtimestamp(
120                    stream_data.get("create_time", item.get("create_time", metadata.get(
121                        "timestamp_collected") / 1000)))  # Some posts appear to have no timestamp! We substitute collection time
122                video_url = stream_data.get("stream_url").get("flv_pull_url", {}).get("FULL_HD1")
123                video_description = stream_data.get("title")
124                duration = "Unknown"
125
126                # Author is stream owner
127                author = stream_data.get("owner")
128                video_tags = stream_data.get("video_feed_tag")
129                stats = stream_data.get("stats")
130
131            else:
132                post_timestamp = datetime.fromtimestamp(item["create_time"])
133                videos_list = item.get("video").get("bit_rate")
134                if not videos_list:
135                    # Image galleries do not have video data
136                    video_url = ""
137                else:
138                    videos = sorted([vid for vid in item["video"]["bit_rate"]], key=lambda d: d.get("bit_rate"),
139                                reverse=True)
140                    video_url = videos[0]["play_addr"].get("url_list", [''])[-1] if len(videos) > 0 else ""
141                video_description = item["desc"]
142                duration = item.get("duration", item.get("video", {}).get("duration", "Unknown"))
143
144                # Author is, well, author
145                author = item["author"]
146                stats = item.get("statistics")
147
148            prevent_download = ("yes" if item["prevent_download"] else "no") if "prevent_download" in item else None
149
150            # Keys
151            aweme_id_key = "aweme_id"
152            group_id_key = "group_id"
153            text_extra_key = "text_extra"
154            hashtag_key = "hashtag_name"
155            mention_key = "sec_uid"
156            author_id_key = "author_user_id"
157            mix_info_key = "mix_info"
158            mix_id_key = "mix_id"
159            mix_name_key = "mix_name"
160
161            author_sec_key = "sec_uid"
162            avatar_thumb_key = "avatar_thumb"
163            url_list_key = "url_list"
164            is_fake_key = "is_ad_fake"
165
166            # Stats
167            collect_count = stats.get("collect_count") if stats else MissingMappedField("Unknown")
168            comment_count = stats.get("comment_count") if stats else MissingMappedField("Unknown")
169            digg_count = stats.get("digg_count") if stats else MissingMappedField("Unknown")
170            download_count = stats.get("download_count") if stats else MissingMappedField("Unknown")
171            forward_count = stats.get("forward_count") if stats else MissingMappedField("Unknown")
172            play_count = stats.get("play_count") if stats else MissingMappedField("Unknown")
173            share_count = stats.get("share_count") if stats else MissingMappedField("Unknown")
174            # live_watch_count = stats.get("live_watch_count") if stats else MissingMappedField("Unknown")
175
176            video_tags = ",".join(
177                [tag["tag_name"] for tag in (item["video_tag"] if item["video_tag"] is not None else []) if
178                 "tag_name" in tag])
179
180            mix_current_episode = item.get(mix_info_key).get("statis", {}).get("current_episode", "N/A") if item.get(
181                mix_info_key) else "N/A"
182
183        # Stream Stats
184        count_total_streams_viewers = stats.get("total_user", "N/A")
185        count_current_stream_viewers = SearchDouyin.get_chinese_number(stats.get("user_count_str")) if "user_count_str" in stats else "N/A"
186
187        # Some videos are collected from "mixes"/"collections"; only the first video is definitely displayed while others may or may not be viewed
188        displayed = True
189        if item.get("ZS_collected_from_mix") and not item.get("ZS_first_mix_vid"):
190            displayed = False
191
192        # Image galleries have been added to Douyin
193        image_urls = []
194        if item.get("images"):
195            for img in item["images"]:
196                if "url_list" in img:
197                    image_urls.append(img["url_list"][0])
198                elif "urlList" in img:
199                    image_urls.append(img["urlList"][0])
200
201        # Music
202        music_author = item.get('music').get('author') if item.get('music') and item.get("music") != "$undefined" else ""
203        music_title = item.get('music').get('title') if item.get('music') and item.get("music") != "$undefined" else ""
204        music_url = item.get('music').get('play_url', {}).get('uri') if item.get('music') and item.get("music") != "$undefined" else ""
205
206        # Collection
207        mix_current_episode = mix_current_episode if mix_current_episode != "$undefined" else "N/A"
208        collection_id = item.get(mix_info_key).get(mix_id_key, "N/A") if item.get(mix_info_key) else "N/A"
209        collection_id = collection_id if collection_id != "$undefined" else "N/A"
210        collection_name = item.get(mix_info_key).get(mix_name_key, "N/A") if item.get(mix_info_key) else "N/A"
211        collection_name = collection_name if collection_name != "$undefined" else "N/A"
212        part_of_collection = "yes" if  item.get(mix_info_key) and mix_id_key in item[
213            mix_info_key] and collection_id != "N/A" else "no"
214
215        return MappedItem({
216            "id": item[aweme_id_key],
217            "thread_id": item[group_id_key],
218            "subject": subject,
219            "body": video_description,
220            "timestamp": post_timestamp.strftime("%Y-%m-%d %H:%M:%S"),
221            "post_source_domain": urllib.parse.unquote(metadata.get("source_platform_url")),
222            # Adding this as different Douyin pages contain different data
223            "post_url": f"https://www.douyin.com/video/{item[aweme_id_key]}" if subject == "Post" else f"https://live.douyin.com/{author.get('web_rid')}",
224            "region": item.get("region", ""),
225            "hashtags": ",".join(
226                [tag[hashtag_key] for tag in (item[text_extra_key] if item[text_extra_key] is not None else []) if
227                 tag.get(hashtag_key)]),
228            "mentions": ",".join([f"https://www.douyin.com/user/{tag[mention_key]}" for tag in
229                                  (item[text_extra_key] if item[text_extra_key] is not None else []) if
230                                  tag.get(mention_key)]),
231            # Actual username does not appear in object, but the sec_uid can be used to form a link to their profile
232            "video_tags": video_tags,
233            "prevent_download": prevent_download,
234            "video_url": video_url,
235            "video_duration": duration,
236            "image_urls": ','.join(image_urls),
237            "music_author": music_author,
238            "music_title": music_title,
239            "music_url": music_url,
240            # Video stats
241            "collect_count": collect_count,
242            "comment_count": comment_count,
243            "digg_count": digg_count,
244            "download_count": download_count,
245            "forward_count": forward_count,
246            "play_count": play_count,
247            "share_count": share_count,
248            "count_total_streams_viewers": count_total_streams_viewers,
249            "count_current_stream_viewers": count_current_stream_viewers,
250            # Author data
251            "author_user_id": item[author_id_key] if author_id_key in item else author.get("uid", author.get("id")),
252            "author_nickname": author["nickname"],
253            "author_profile_url": f"https://www.douyin.com/user/{author[author_sec_key]}",
254            "author_thumbnail_url": author[avatar_thumb_key].get(url_list_key, [''])[0],
255            "author_region": author.get("region"),
256            "author_is_ad_fake": author.get(is_fake_key),
257            # Collection/Mix
258            "part_of_collection": part_of_collection,
259            "4CAT_first_video_displayed": "yes" if displayed else "no",
260            # other videos may have been viewed, but this is unknown to us
261            "collection_id": collection_id,
262            "collection_name": collection_name,
263            "place_in_collection": mix_current_episode,
264            "unix_timestamp": int(post_timestamp.timestamp()),
265        })
@staticmethod
def get_chinese_number(num):
267    @staticmethod
268    def get_chinese_number(num):
269        if type(num) in (float, int):
270            return num
271        elif type(num) is not str:
272            return 0
273
274        if "δΈ‡" in num:
275            return float(re.sub(r"[^0-9.]", "", num)) * 10000
276        else:
277            return int(re.sub(r"[^0-9.]", "", num))