Edit on GitHub

datasources.douyin.search_douyin

Import scraped Douyin data

View Source

  1"""
  2Import scraped Douyin data
  3"""
  4import urllib
  5import json
  6import re
  7from datetime import datetime
  8
  9from backend.lib.search import Search
 10from common.lib.item_mapping import MappedItem, MissingMappedField
 11
 12class SearchDouyin(Search):
 13    """
 14    Import scraped Douyin data
 15    """
 16    type = "douyin-search"  # job ID
 17    category = "Search"  # category
 18    title = "Import scraped Douyin data"  # title displayed in UI
 19    description = "Import Douyin data collected with an external tool such as Zeeschuimer."  # description displayed in UI
 20    extension = "ndjson"  # extension of result file, used internally and in UI
 21    is_from_zeeschuimer = True
 22
 23    # not available as a processor for existing datasets
 24    accepts = [None]
 25    references = [
 26        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
 27        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)"
 28    ]
 29
 30    def get_items(self, query):
 31        """
 32        Run custom search
 33
 34        Not available for Douyin
 35        """
 36        raise NotImplementedError("Douyin datasets can only be created by importing data from elsewhere")
 37
 38    @staticmethod
 39    def map_item(item):
 40        """
 41        """
 42        metadata = item.get("__import_meta")
 43        subject = "Post"
 44        if "ZS_collected_from_embed" in item and item["ZS_collected_from_embed"]:
 45            # HTML embedded posts are formatted differently than JSON posts
 46
 47            stream_data = item.get("cellRoom", {}).get("rawdata") if item.get("cellRoom") != "$undefined" else {}
 48            if stream_data:
 49                # These appear to be streams
 50                subject = "Stream"
 51                post_timestamp = datetime.fromtimestamp(stream_data.get("createtime", item.get(
 52                    "requestTime") / 1000))  # These may only have the timestamp of the request
 53                video_url = stream_data.get("stream_url").get("flv_pull_url", {}).get("FULL_HD1")
 54                video_description = stream_data.get("title")
 55                duration = "Unknown"
 56                prevent_download = None
 57                stats = stream_data.get("stats")
 58
 59                # Author is stream owner
 60                author = stream_data.get("owner")
 61                author_sec_key = "sec_uid"
 62                avatar_thumb_key = "avatar_thumb"
 63                url_list_key = "url_list"
 64                is_fake_key = "is_ad_fake"  # have not seen...
 65            else:
 66                post_timestamp = datetime.fromtimestamp(item["createTime"])
 67                videos_list = item.get("video").get("bitRateList")
 68                if not videos_list:
 69                    # Image galleries do not have video data
 70                    video_url = ""
 71                else:
 72                    videos = sorted([vid for vid in item.get("video").get("bitRateList")], key=lambda d: d.get("bitRate"),
 73                                reverse=True)
 74                    video_url = "https" + videos[0]["playApi"]
 75                video_description = item["desc"]
 76                duration = item.get("duration", item.get("video", {}).get("duration", "Unknown"))
 77                prevent_download = "yes" if item["download"]["prevent"] else "no"
 78                stats = item["stats"]
 79
 80                # Author is, well, author
 81                author = item["authorInfo"]
 82                author_sec_key = "secUid"
 83                avatar_thumb_key = "avatarThumb"
 84                url_list_key = "urlList"
 85                is_fake_key = "isAdFake"
 86
 87            # Embedded Keys
 88            aweme_id_key = "awemeId"
 89            group_id_key = "groupId"
 90            text_extra_key = "textExtra"
 91            hashtag_key = "hashtagName"
 92            mention_key = "secUid"
 93            author_id_key = "authorUserId"
 94            mix_info_key = "mixInfo"
 95            mix_id_key = "mixId"
 96            mix_name_key = "mixName"
 97
 98            # Stats
 99            collect_count = stats.get("collectCount", MissingMappedField("Unknown"))
100            comment_count = stats.get("commentCount", MissingMappedField("Unknown"))
101            digg_count = stats.get("diggCount", MissingMappedField("Unknown"))
102            download_count = stats.get("downloadCount", MissingMappedField("Unknown"))
103            forward_count = stats.get("forwardCount", MissingMappedField("Unknown"))
104            play_count = stats.get("playCount", MissingMappedField("Unknown"))
105            share_count = stats.get("shareCount", MissingMappedField("Unknown"))
106            live_watch_count = stats.get("liveWatchCount", MissingMappedField("Unknown"))
107
108            # This is a guess, I have not encountered it
109            video_tags = ",".join([tag["tagName"] for tag in item.get("videoTag", []) if "tagName" in tag])
110
111            mix_current_episode = item.get(mix_info_key, {}).get("currentEpisode", "N/A")
112
113        else:
114            stream_data = item.get("rawdata", item.get("cell_room", {}).get("rawdata"))
115            if stream_data:
116                subject = "Stream"
117                stream_data = json.loads(stream_data)
118                post_timestamp = datetime.fromtimestamp(
119                    stream_data.get("create_time", item.get("create_time", metadata.get(
120                        "timestamp_collected") / 1000)))  # Some posts appear to have no timestamp! We substitute collection time
121                video_url = stream_data.get("stream_url").get("flv_pull_url", {}).get("FULL_HD1")
122                video_description = stream_data.get("title")
123                duration = "Unknown"
124
125                # Author is stream owner
126                author = stream_data.get("owner")
127                video_tags = stream_data.get("video_feed_tag")
128                stats = stream_data.get("stats")
129
130            else:
131                post_timestamp = datetime.fromtimestamp(item["create_time"])
132                videos_list = item.get("video").get("bit_rate")
133                if not videos_list:
134                    # Image galleries do not have video data
135                    video_url = ""
136                else:
137                    videos = sorted([vid for vid in item["video"]["bit_rate"]], key=lambda d: d.get("bit_rate"),
138                                reverse=True)
139                    video_url = videos[0]["play_addr"].get("url_list", [''])[-1] if len(videos) > 0 else ""
140                video_description = item["desc"]
141                duration = item.get("duration", item.get("video", {}).get("duration", "Unknown"))
142
143                # Author is, well, author
144                author = item["author"]
145                stats = item.get("statistics")
146
147            prevent_download = ("yes" if item["prevent_download"] else "no") if "prevent_download" in item else None
148
149            # Keys
150            aweme_id_key = "aweme_id"
151            group_id_key = "group_id"
152            text_extra_key = "text_extra"
153            hashtag_key = "hashtag_name"
154            mention_key = "sec_uid"
155            author_id_key = "author_user_id"
156            mix_info_key = "mix_info"
157            mix_id_key = "mix_id"
158            mix_name_key = "mix_name"
159
160            author_sec_key = "sec_uid"
161            avatar_thumb_key = "avatar_thumb"
162            url_list_key = "url_list"
163            is_fake_key = "is_ad_fake"
164
165            # Stats
166            collect_count = stats.get("collect_count") if stats else MissingMappedField("Unknown")
167            comment_count = stats.get("comment_count") if stats else MissingMappedField("Unknown")
168            digg_count = stats.get("digg_count") if stats else MissingMappedField("Unknown")
169            download_count = stats.get("download_count") if stats else MissingMappedField("Unknown")
170            forward_count = stats.get("forward_count") if stats else MissingMappedField("Unknown")
171            play_count = stats.get("play_count") if stats else MissingMappedField("Unknown")
172            share_count = stats.get("share_count") if stats else MissingMappedField("Unknown")
173            live_watch_count = stats.get("live_watch_count") if stats else MissingMappedField("Unknown")
174
175            video_tags = ",".join(
176                [tag["tag_name"] for tag in (item["video_tag"] if item["video_tag"] is not None else []) if
177                 "tag_name" in tag])
178
179            mix_current_episode = item.get(mix_info_key, {}).get("statis", {}).get("current_episode", "N/A")
180
181        # Stream Stats
182        count_total_streams_viewers = stats.get("total_user", "N/A")
183        count_current_stream_viewers = SearchDouyin.get_chinese_number(stats.get("user_count_str")) if "user_count_str" in stats else "N/A"
184
185        # Some videos are collected from "mixes"/"collections"; only the first video is definitely displayed while others may or may not be viewed
186        displayed = True
187        if item.get("ZS_collected_from_mix") and not item.get("ZS_first_mix_vid"):
188            displayed = False
189
190        # Image galleries have been added to Douyin
191        image_urls = []
192        if item.get("images"):
193            for img in item["images"]:
194                if "url_list" in img:
195                    image_urls.append(img["url_list"][0])
196                elif "urlList" in img:
197                    image_urls.append(img["urlList"][0])
198
199        # Music
200        music_author = item.get('music').get('author') if item.get('music') and item.get("music") != "$undefined" else ""
201        music_title = item.get('music').get('title') if item.get('music') and item.get("music") != "$undefined" else ""
202        music_url = item.get('music').get('play_url', {}).get('uri') if item.get('music') and item.get("music") != "$undefined" else ""
203
204        # Collection
205        mix_current_episode = mix_current_episode if mix_current_episode != "$undefined" else "N/A"
206        collection_id = item.get(mix_info_key, {}).get(mix_id_key, "N/A")
207        collection_id = collection_id if collection_id != "$undefined" else "N/A"
208        collection_name = item.get(mix_info_key, {}).get(mix_name_key, "N/A")
209        collection_name = collection_name if collection_name != "$undefined" else "N/A"
210        part_of_collection = "yes" if mix_info_key in item and mix_id_key in item[
211            mix_info_key] and collection_id != "N/A" else "no"
212
213        return MappedItem({
214            "id": item[aweme_id_key],
215            "thread_id": item[group_id_key],
216            "subject": subject,
217            "body": video_description,
218            "timestamp": post_timestamp.strftime("%Y-%m-%d %H:%M:%S"),
219            "post_source_domain": urllib.parse.unquote(metadata.get("source_platform_url")),
220            # Adding this as different Douyin pages contain different data
221            "post_url": f"https://www.douyin.com/video/{item[aweme_id_key]}" if subject == "Post" else f"https://live.douyin.com/{author.get('web_rid')}",
222            "region": item.get("region", ""),
223            "hashtags": ",".join(
224                [tag[hashtag_key] for tag in (item[text_extra_key] if item[text_extra_key] is not None else []) if
225                 hashtag_key in tag]),
226            "mentions": ",".join([f"https://www.douyin.com/user/{tag[mention_key]}" for tag in
227                                  (item[text_extra_key] if item[text_extra_key] is not None else []) if
228                                  mention_key in tag]),
229            # Actual username does not appear in object, but the sec_uid can be used to form a link to their profile
230            "video_tags": video_tags,
231            "prevent_download": prevent_download,
232            "video_url": video_url,
233            "video_duration": duration,
234            "image_urls": ','.join(image_urls),
235            "music_author": music_author,
236            "music_title": music_title,
237            "music_url": music_url,
238            # Video stats
239            "collect_count": collect_count,
240            "comment_count": comment_count,
241            "digg_count": digg_count,
242            "download_count": download_count,
243            "forward_count": forward_count,
244            "play_count": play_count,
245            "share_count": share_count,
246            "count_total_streams_viewers": count_total_streams_viewers,
247            "count_current_stream_viewers": count_current_stream_viewers,
248            # Author data
249            "author_user_id": item[author_id_key] if author_id_key in item else author.get("uid", author.get("id")),
250            "author_nickname": author["nickname"],
251            "author_profile_url": f"https://www.douyin.com/user/{author[author_sec_key]}",
252            "author_thumbnail_url": author[avatar_thumb_key].get(url_list_key, [''])[0],
253            "author_region": author.get("region"),
254            "author_is_ad_fake": author.get(is_fake_key),
255            # Collection/Mix
256            "part_of_collection": part_of_collection,
257            "4CAT_first_video_displayed": "yes" if displayed else "no",
258            # other videos may have been viewed, but this is unknown to us
259            "collection_id": collection_id,
260            "collection_name": collection_name,
261            "place_in_collection": mix_current_episode,
262            "unix_timestamp": int(post_timestamp.timestamp()),
263        })
264
265    @staticmethod
266    def get_chinese_number(num):
267        if type(num) in (float, int):
268            return num
269        elif type(num) is not str:
270            return 0
271
272        if "万" in num:
273            return float(re.sub(r"[^0-9.]", "", num)) * 10000
274        else:
275            return int(re.sub(r"[^0-9.]", "", num))

class SearchDouyin(backend.lib.search.Search): View Source

 13class SearchDouyin(Search):
 14    """
 15    Import scraped Douyin data
 16    """
 17    type = "douyin-search"  # job ID
 18    category = "Search"  # category
 19    title = "Import scraped Douyin data"  # title displayed in UI
 20    description = "Import Douyin data collected with an external tool such as Zeeschuimer."  # description displayed in UI
 21    extension = "ndjson"  # extension of result file, used internally and in UI
 22    is_from_zeeschuimer = True
 23
 24    # not available as a processor for existing datasets
 25    accepts = [None]
 26    references = [
 27        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
 28        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)"
 29    ]
 30
 31    def get_items(self, query):
 32        """
 33        Run custom search
 34
 35        Not available for Douyin
 36        """
 37        raise NotImplementedError("Douyin datasets can only be created by importing data from elsewhere")
 38
 39    @staticmethod
 40    def map_item(item):
 41        """
 42        """
 43        metadata = item.get("__import_meta")
 44        subject = "Post"
 45        if "ZS_collected_from_embed" in item and item["ZS_collected_from_embed"]:
 46            # HTML embedded posts are formatted differently than JSON posts
 47
 48            stream_data = item.get("cellRoom", {}).get("rawdata") if item.get("cellRoom") != "$undefined" else {}
 49            if stream_data:
 50                # These appear to be streams
 51                subject = "Stream"
 52                post_timestamp = datetime.fromtimestamp(stream_data.get("createtime", item.get(
 53                    "requestTime") / 1000))  # These may only have the timestamp of the request
 54                video_url = stream_data.get("stream_url").get("flv_pull_url", {}).get("FULL_HD1")
 55                video_description = stream_data.get("title")
 56                duration = "Unknown"
 57                prevent_download = None
 58                stats = stream_data.get("stats")
 59
 60                # Author is stream owner
 61                author = stream_data.get("owner")
 62                author_sec_key = "sec_uid"
 63                avatar_thumb_key = "avatar_thumb"
 64                url_list_key = "url_list"
 65                is_fake_key = "is_ad_fake"  # have not seen...
 66            else:
 67                post_timestamp = datetime.fromtimestamp(item["createTime"])
 68                videos_list = item.get("video").get("bitRateList")
 69                if not videos_list:
 70                    # Image galleries do not have video data
 71                    video_url = ""
 72                else:
 73                    videos = sorted([vid for vid in item.get("video").get("bitRateList")], key=lambda d: d.get("bitRate"),
 74                                reverse=True)
 75                    video_url = "https" + videos[0]["playApi"]
 76                video_description = item["desc"]
 77                duration = item.get("duration", item.get("video", {}).get("duration", "Unknown"))
 78                prevent_download = "yes" if item["download"]["prevent"] else "no"
 79                stats = item["stats"]
 80
 81                # Author is, well, author
 82                author = item["authorInfo"]
 83                author_sec_key = "secUid"
 84                avatar_thumb_key = "avatarThumb"
 85                url_list_key = "urlList"
 86                is_fake_key = "isAdFake"
 87
 88            # Embedded Keys
 89            aweme_id_key = "awemeId"
 90            group_id_key = "groupId"
 91            text_extra_key = "textExtra"
 92            hashtag_key = "hashtagName"
 93            mention_key = "secUid"
 94            author_id_key = "authorUserId"
 95            mix_info_key = "mixInfo"
 96            mix_id_key = "mixId"
 97            mix_name_key = "mixName"
 98
 99            # Stats
100            collect_count = stats.get("collectCount", MissingMappedField("Unknown"))
101            comment_count = stats.get("commentCount", MissingMappedField("Unknown"))
102            digg_count = stats.get("diggCount", MissingMappedField("Unknown"))
103            download_count = stats.get("downloadCount", MissingMappedField("Unknown"))
104            forward_count = stats.get("forwardCount", MissingMappedField("Unknown"))
105            play_count = stats.get("playCount", MissingMappedField("Unknown"))
106            share_count = stats.get("shareCount", MissingMappedField("Unknown"))
107            live_watch_count = stats.get("liveWatchCount", MissingMappedField("Unknown"))
108
109            # This is a guess, I have not encountered it
110            video_tags = ",".join([tag["tagName"] for tag in item.get("videoTag", []) if "tagName" in tag])
111
112            mix_current_episode = item.get(mix_info_key, {}).get("currentEpisode", "N/A")
113
114        else:
115            stream_data = item.get("rawdata", item.get("cell_room", {}).get("rawdata"))
116            if stream_data:
117                subject = "Stream"
118                stream_data = json.loads(stream_data)
119                post_timestamp = datetime.fromtimestamp(
120                    stream_data.get("create_time", item.get("create_time", metadata.get(
121                        "timestamp_collected") / 1000)))  # Some posts appear to have no timestamp! We substitute collection time
122                video_url = stream_data.get("stream_url").get("flv_pull_url", {}).get("FULL_HD1")
123                video_description = stream_data.get("title")
124                duration = "Unknown"
125
126                # Author is stream owner
127                author = stream_data.get("owner")
128                video_tags = stream_data.get("video_feed_tag")
129                stats = stream_data.get("stats")
130
131            else:
132                post_timestamp = datetime.fromtimestamp(item["create_time"])
133                videos_list = item.get("video").get("bit_rate")
134                if not videos_list:
135                    # Image galleries do not have video data
136                    video_url = ""
137                else:
138                    videos = sorted([vid for vid in item["video"]["bit_rate"]], key=lambda d: d.get("bit_rate"),
139                                reverse=True)
140                    video_url = videos[0]["play_addr"].get("url_list", [''])[-1] if len(videos) > 0 else ""
141                video_description = item["desc"]
142                duration = item.get("duration", item.get("video", {}).get("duration", "Unknown"))
143
144                # Author is, well, author
145                author = item["author"]
146                stats = item.get("statistics")
147
148            prevent_download = ("yes" if item["prevent_download"] else "no") if "prevent_download" in item else None
149
150            # Keys
151            aweme_id_key = "aweme_id"
152            group_id_key = "group_id"
153            text_extra_key = "text_extra"
154            hashtag_key = "hashtag_name"
155            mention_key = "sec_uid"
156            author_id_key = "author_user_id"
157            mix_info_key = "mix_info"
158            mix_id_key = "mix_id"
159            mix_name_key = "mix_name"
160
161            author_sec_key = "sec_uid"
162            avatar_thumb_key = "avatar_thumb"
163            url_list_key = "url_list"
164            is_fake_key = "is_ad_fake"
165
166            # Stats
167            collect_count = stats.get("collect_count") if stats else MissingMappedField("Unknown")
168            comment_count = stats.get("comment_count") if stats else MissingMappedField("Unknown")
169            digg_count = stats.get("digg_count") if stats else MissingMappedField("Unknown")
170            download_count = stats.get("download_count") if stats else MissingMappedField("Unknown")
171            forward_count = stats.get("forward_count") if stats else MissingMappedField("Unknown")
172            play_count = stats.get("play_count") if stats else MissingMappedField("Unknown")
173            share_count = stats.get("share_count") if stats else MissingMappedField("Unknown")
174            live_watch_count = stats.get("live_watch_count") if stats else MissingMappedField("Unknown")
175
176            video_tags = ",".join(
177                [tag["tag_name"] for tag in (item["video_tag"] if item["video_tag"] is not None else []) if
178                 "tag_name" in tag])
179
180            mix_current_episode = item.get(mix_info_key, {}).get("statis", {}).get("current_episode", "N/A")
181
182        # Stream Stats
183        count_total_streams_viewers = stats.get("total_user", "N/A")
184        count_current_stream_viewers = SearchDouyin.get_chinese_number(stats.get("user_count_str")) if "user_count_str" in stats else "N/A"
185
186        # Some videos are collected from "mixes"/"collections"; only the first video is definitely displayed while others may or may not be viewed
187        displayed = True
188        if item.get("ZS_collected_from_mix") and not item.get("ZS_first_mix_vid"):
189            displayed = False
190
191        # Image galleries have been added to Douyin
192        image_urls = []
193        if item.get("images"):
194            for img in item["images"]:
195                if "url_list" in img:
196                    image_urls.append(img["url_list"][0])
197                elif "urlList" in img:
198                    image_urls.append(img["urlList"][0])
199
200        # Music
201        music_author = item.get('music').get('author') if item.get('music') and item.get("music") != "$undefined" else ""
202        music_title = item.get('music').get('title') if item.get('music') and item.get("music") != "$undefined" else ""
203        music_url = item.get('music').get('play_url', {}).get('uri') if item.get('music') and item.get("music") != "$undefined" else ""
204
205        # Collection
206        mix_current_episode = mix_current_episode if mix_current_episode != "$undefined" else "N/A"
207        collection_id = item.get(mix_info_key, {}).get(mix_id_key, "N/A")
208        collection_id = collection_id if collection_id != "$undefined" else "N/A"
209        collection_name = item.get(mix_info_key, {}).get(mix_name_key, "N/A")
210        collection_name = collection_name if collection_name != "$undefined" else "N/A"
211        part_of_collection = "yes" if mix_info_key in item and mix_id_key in item[
212            mix_info_key] and collection_id != "N/A" else "no"
213
214        return MappedItem({
215            "id": item[aweme_id_key],
216            "thread_id": item[group_id_key],
217            "subject": subject,
218            "body": video_description,
219            "timestamp": post_timestamp.strftime("%Y-%m-%d %H:%M:%S"),
220            "post_source_domain": urllib.parse.unquote(metadata.get("source_platform_url")),
221            # Adding this as different Douyin pages contain different data
222            "post_url": f"https://www.douyin.com/video/{item[aweme_id_key]}" if subject == "Post" else f"https://live.douyin.com/{author.get('web_rid')}",
223            "region": item.get("region", ""),
224            "hashtags": ",".join(
225                [tag[hashtag_key] for tag in (item[text_extra_key] if item[text_extra_key] is not None else []) if
226                 hashtag_key in tag]),
227            "mentions": ",".join([f"https://www.douyin.com/user/{tag[mention_key]}" for tag in
228                                  (item[text_extra_key] if item[text_extra_key] is not None else []) if
229                                  mention_key in tag]),
230            # Actual username does not appear in object, but the sec_uid can be used to form a link to their profile
231            "video_tags": video_tags,
232            "prevent_download": prevent_download,
233            "video_url": video_url,
234            "video_duration": duration,
235            "image_urls": ','.join(image_urls),
236            "music_author": music_author,
237            "music_title": music_title,
238            "music_url": music_url,
239            # Video stats
240            "collect_count": collect_count,
241            "comment_count": comment_count,
242            "digg_count": digg_count,
243            "download_count": download_count,
244            "forward_count": forward_count,
245            "play_count": play_count,
246            "share_count": share_count,
247            "count_total_streams_viewers": count_total_streams_viewers,
248            "count_current_stream_viewers": count_current_stream_viewers,
249            # Author data
250            "author_user_id": item[author_id_key] if author_id_key in item else author.get("uid", author.get("id")),
251            "author_nickname": author["nickname"],
252            "author_profile_url": f"https://www.douyin.com/user/{author[author_sec_key]}",
253            "author_thumbnail_url": author[avatar_thumb_key].get(url_list_key, [''])[0],
254            "author_region": author.get("region"),
255            "author_is_ad_fake": author.get(is_fake_key),
256            # Collection/Mix
257            "part_of_collection": part_of_collection,
258            "4CAT_first_video_displayed": "yes" if displayed else "no",
259            # other videos may have been viewed, but this is unknown to us
260            "collection_id": collection_id,
261            "collection_name": collection_name,
262            "place_in_collection": mix_current_episode,
263            "unix_timestamp": int(post_timestamp.timestamp()),
264        })
265
266    @staticmethod
267    def get_chinese_number(num):
268        if type(num) in (float, int):
269            return num
270        elif type(num) is not str:
271            return 0
272
273        if "万" in num:
274            return float(re.sub(r"[^0-9.]", "", num)) * 10000
275        else:
276            return int(re.sub(r"[^0-9.]", "", num))

Import scraped Douyin data

type = 'douyin-search'

category = 'Search'

title = 'Import scraped Douyin data'

description = 'Import Douyin data collected with an external tool such as Zeeschuimer.'

extension = 'ndjson'

is_from_zeeschuimer = True

accepts = [None]

references = ['[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)', '[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)']

def get_items(self, query): View Source

31    def get_items(self, query):
32        """
33        Run custom search
34
35        Not available for Douyin
36        """
37        raise NotImplementedError("Douyin datasets can only be created by importing data from elsewhere")

Run custom search

Not available for Douyin

@staticmethod

def map_item(item): View Source

 39    @staticmethod
 40    def map_item(item):
 41        """
 42        """
 43        metadata = item.get("__import_meta")
 44        subject = "Post"
 45        if "ZS_collected_from_embed" in item and item["ZS_collected_from_embed"]:
 46            # HTML embedded posts formated differently than JSON posts
 47
 48            stream_data = item.get("cellRoom", {}).get("rawdata") if item.get("cellRoom") != "$undefined" else {}
 49            if stream_data:
 50                # These appear to be streams
 51                subject = "Stream"
 52                post_timestamp = datetime.fromtimestamp(stream_data.get("createtime", item.get(
 53                    "requestTime") / 1000))  # These may only have the timestamp of the request
 54                video_url = stream_data.get("stream_url").get("flv_pull_url", {}).get("FULL_HD1")
 55                video_description = stream_data.get("title")
 56                duration = "Unknown"
 57                prevent_download = None
 58                stats = stream_data.get("stats")
 59
 60                # Author is stream owner
 61                author = stream_data.get("owner")
 62                author_sec_key = "sec_uid"
 63                avatar_thumb_key = "avatar_thumb"
 64                url_list_key = "url_list"
 65                is_fake_key = "is_ad_fake"  # have not seen...
 66            else:
 67                post_timestamp = datetime.fromtimestamp(item["createTime"])
 68                videos_list = item.get("video").get("bitRateList")
 69                if not videos_list:
 70                    # Image galleries do not have video data
 71                    video_url = ""
 72                else:
 73                    videos = sorted([vid for vid in item.get("video").get("bitRateList")], key=lambda d: d.get("bitRate"),
 74                                reverse=True)
 75                    video_url = "https" + videos[0]["playApi"]
 76                video_description = item["desc"]
 77                duration = item.get("duration", item.get("video", {}).get("duration", "Unknown"))
 78                prevent_download = "yes" if item["download"]["prevent"] else "no"
 79                stats = item["stats"]
 80
 81                # Author is, well, author
 82                author = item["authorInfo"]
 83                author_sec_key = "secUid"
 84                avatar_thumb_key = "avatarThumb"
 85                url_list_key = "urlList"
 86                is_fake_key = "isAdFake"
 87
 88            # Embedded Keys
 89            aweme_id_key = "awemeId"
 90            group_id_key = "groupId"
 91            text_extra_key = "textExtra"
 92            hashtag_key = "hashtagName"
 93            mention_key = "secUid"
 94            author_id_key = "authorUserId"
 95            mix_info_key = "mixInfo"
 96            mix_id_key = "mixId"
 97            mix_name_key = "mixName"
 98
 99            # Stats
100            collect_count = stats.get("collectCount", MissingMappedField("Unknown"))
101            comment_count = stats.get("commentCount", MissingMappedField("Unknown"))
102            digg_count = stats.get("diggCount", MissingMappedField("Unknown"))
103            download_count = stats.get("downloadCount", MissingMappedField("Unknown"))
104            forward_count = stats.get("forwardCount", MissingMappedField("Unknown"))
105            play_count = stats.get("playCount", MissingMappedField("Unknown"))
106            share_count = stats.get("shareCount", MissingMappedField("Unknown"))
107            live_watch_count = stats.get("liveWatchCount", MissingMappedField("Unknown"))
108
109            # This is a guess, I have not encountered it
110            video_tags = ",".join([tag["tagName"] for tag in item.get("videoTag", []) if "tagName" in tag])
111
112            mix_current_episode = item.get(mix_info_key, {}).get("currentEpisode", "N/A")
113
114        else:
115            stream_data = item.get("rawdata", item.get("cell_room", {}).get("rawdata"))
116            if stream_data:
117                subject = "Stream"
118                stream_data = json.loads(stream_data)
119                post_timestamp = datetime.fromtimestamp(
120                    stream_data.get("create_time", item.get("create_time", metadata.get(
121                        "timestamp_collected") / 1000)))  # Some posts appear to have no timestamp! We substitute collection time
122                video_url = stream_data.get("stream_url").get("flv_pull_url", {}).get("FULL_HD1")
123                video_description = stream_data.get("title")
124                duration = "Unknown"
125
126                # Author is stream owner
127                author = stream_data.get("owner")
128                video_tags = stream_data.get("video_feed_tag")
129                stats = stream_data.get("stats")
130
131            else:
132                post_timestamp = datetime.fromtimestamp(item["create_time"])
133                videos_list = item.get("video").get("bit_rate")
134                if not videos_list:
135                    # Image galleries do not have video data
136                    video_url = ""
137                else:
138                    videos = sorted([vid for vid in item["video"]["bit_rate"]], key=lambda d: d.get("bit_rate"),
139                                reverse=True)
140                    video_url = videos[0]["play_addr"].get("url_list", [''])[-1] if len(videos) > 0 else ""
141                video_description = item["desc"]
142                duration = item.get("duration", item.get("video", {}).get("duration", "Unknown"))
143
144                # Author is, well, author
145                author = item["author"]
146                stats = item.get("statistics")
147
148            prevent_download = ("yes" if item["prevent_download"] else "no") if "prevent_download" in item else None
149
150            # Keys
151            aweme_id_key = "aweme_id"
152            group_id_key = "group_id"
153            text_extra_key = "text_extra"
154            hashtag_key = "hashtag_name"
155            mention_key = "sec_uid"
156            author_id_key = "author_user_id"
157            mix_info_key = "mix_info"
158            mix_id_key = "mix_id"
159            mix_name_key = "mix_name"
160
161            author_sec_key = "sec_uid"
162            avatar_thumb_key = "avatar_thumb"
163            url_list_key = "url_list"
164            is_fake_key = "is_ad_fake"
165
166            # Stats
167            collect_count = stats.get("collect_count") if stats else MissingMappedField("Unknown")
168            comment_count = stats.get("comment_count") if stats else MissingMappedField("Unknown")
169            digg_count = stats.get("digg_count") if stats else MissingMappedField("Unknown")
170            download_count = stats.get("download_count") if stats else MissingMappedField("Unknown")
171            forward_count = stats.get("forward_count") if stats else MissingMappedField("Unknown")
172            play_count = stats.get("play_count") if stats else MissingMappedField("Unknown")
173            share_count = stats.get("share_count") if stats else MissingMappedField("Unknown")
174            live_watch_count = stats.get("live_watch_count") if stats else MissingMappedField("Unknown")
175
176            video_tags = ",".join(
177                [tag["tag_name"] for tag in (item["video_tag"] if item["video_tag"] is not None else []) if
178                 "tag_name" in tag])
179
180            mix_current_episode = item.get(mix_info_key, {}).get("statis", {}).get("current_episode", "N/A")
181
182        # Stream Stats
183        count_total_streams_viewers = stats.get("total_user", "N/A")
184        count_current_stream_viewers = SearchDouyin.get_chinese_number(stats.get("user_count_str")) if "user_count_str" in stats else "N/A"
185
186        # Some videos are collected from "mixes"/"collections"; only the first video is definitely displayed while others may or may not be viewed
187        displayed = True
188        if item.get("ZS_collected_from_mix") and not item.get("ZS_first_mix_vid"):
189            displayed = False
190
191        # Image galleries have been added to Douyin
192        image_urls = []
193        if item.get("images"):
194            for img in item["images"]:
195                if "url_list" in img:
196                    image_urls.append(img["url_list"][0])
197                elif "urlList" in img:
198                    image_urls.append(img["urlList"][0])
199
200        # Music
201        music_author = item.get('music').get('author') if item.get('music') and item.get("music") != "$undefined" else ""
202        music_title = item.get('music').get('title') if item.get('music') and item.get("music") != "$undefined" else ""
203        music_url = item.get('music').get('play_url', {}).get('uri') if item.get('music') and item.get("music") != "$undefined" else ""
204
205        # Collection
206        mix_current_episode = mix_current_episode if mix_current_episode != "$undefined" else "N/A"
207        collection_id = item.get(mix_info_key, {}).get(mix_id_key, "N/A")
208        collection_id = collection_id if collection_id != "$undefined" else "N/A"
209        collection_name = item.get(mix_info_key, {}).get(mix_name_key, "N/A")
210        collection_name = collection_name if collection_name != "$undefined" else "N/A"
211        part_of_collection = "yes" if mix_info_key in item and mix_id_key in item[
212            mix_info_key] and collection_id != "N/A" else "no"
213
214        return MappedItem({
215            "id": item[aweme_id_key],
216            "thread_id": item[group_id_key],
217            "subject": subject,
218            "body": video_description,
219            "timestamp": post_timestamp.strftime("%Y-%m-%d %H:%M:%S"),
220            "post_source_domain": urllib.parse.unquote(metadata.get("source_platform_url")),
221            # Adding this as different Douyin pages contain different data
222            "post_url": f"https://www.douyin.com/video/{item[aweme_id_key]}" if subject == "Post" else f"https://live.douyin.com/{author.get('web_rid')}",
223            "region": item.get("region", ""),
224            "hashtags": ",".join(
225                [tag[hashtag_key] for tag in (item[text_extra_key] if item[text_extra_key] is not None else []) if
226                 hashtag_key in tag]),
227            "mentions": ",".join([f"https://www.douyin.com/user/{tag[mention_key]}" for tag in
228                                  (item[text_extra_key] if item[text_extra_key] is not None else []) if
229                                  mention_key in tag]),
230            # Actual username does not appear in object, but the sec_uid can be used to form a link to their profile
231            "video_tags": video_tags,
232            "prevent_download": prevent_download,
233            "video_url": video_url,
234            "video_duration": duration,
235            "image_urls": ','.join(image_urls),
236            "music_author": music_author,
237            "music_title": music_title,
238            "music_url": music_url,
239            # Video stats
240            "collect_count": collect_count,
241            "comment_count": comment_count,
242            "digg_count": digg_count,
243            "download_count": download_count,
244            "forward_count": forward_count,
245            "play_count": play_count,
246            "share_count": share_count,
247            "count_total_streams_viewers": count_total_streams_viewers,
248            "count_current_stream_viewers": count_current_stream_viewers,
249            # Author data
250            "author_user_id": item[author_id_key] if author_id_key in item else author.get("uid", author.get("id")),
251            "author_nickname": author["nickname"],
252            "author_profile_url": f"https://www.douyin.com/user/{author[author_sec_key]}",
253            "author_thumbnail_url": author[avatar_thumb_key].get(url_list_key, [''])[0],
254            "author_region": author.get("region"),
255            "author_is_ad_fake": author.get(is_fake_key),
256            # Collection/Mix
257            "part_of_collection": part_of_collection,
258            "4CAT_first_video_displayed": "yes" if displayed else "no",
259            # other videos may have been viewed, but this is unknown to us
260            "collection_id": collection_id,
261            "collection_name": collection_name,
262            "place_in_collection": mix_current_episode,
263            "unix_timestamp": int(post_timestamp.timestamp()),
264        })

@staticmethod

def get_chinese_number(num): View Source

266    @staticmethod
267    def get_chinese_number(num):
268        if type(num) in (float, int):
269            return num
270        elif type(num) is not str:
271            return 0
272
273        if "万" in num:
274            return float(re.sub(r"[^0-9.]", "", num)) * 10000
275        else:
276            return int(re.sub(r"[^0-9.]", "", num))

Inherited Members

backend.lib.worker.BasicWorker: BasicWorker; INTERRUPT_NONE; INTERRUPT_RETRY; INTERRUPT_CANCEL; queue; log; manager; interrupted; modules; init_time; name; run; clean_up; request_interrupt; is_4cat_class
backend.lib.search.Search: max_workers; prefix; return_cols; import_error_count; import_warning_count; process; search; import_from_file; items_to_csv; items_to_ndjson; items_to_archive
backend.lib.processor.BasicProcessor: db; job; dataset; owner; source_dataset; source_file; config; is_running_in_preset; is_hidden; filepath; work; after_process; remove_files; abort; add_field_to_parent; iterate_archive_contents; unpack_archive_contents; extract_archived_file_by_name; write_csv_items_and_finish; write_archive_and_finish; create_standalone; map_item_method_available; get_mapped_item; is_filter; get_options; get_status; is_top_dataset; is_from_collector; get_extension; is_rankable; exclude_followup_processors; is_4cat_processor