datasources.instagram.search_instagram
Import scraped Instagram data
It's prohibitively difficult to scrape data from Instagram within 4CAT itself due to its aggressive rate limiting. Instead, import data collected elsewhere.
"""
Import scraped Instagram data

It's prohibitively difficult to scrape data from Instagram within 4CAT itself
due to its aggressive rate limiting. Instead, import data collected elsewhere.
"""
import datetime
import re

from backend.lib.search import Search
from common.lib.item_mapping import MappedItem, MissingMappedField
from common.lib.exceptions import MapItemException


class SearchInstagram(Search):
    """
    Import scraped Instagram data

    Maps items captured with an external tool (e.g. Zeeschuimer) to flat
    dictionaries. Two raw formats are handled: the 'Graph' format
    (`parse_graph_item`) and the 'item list' format (`parse_itemlist_item`).
    """
    type = "instagram-search"  # job ID
    category = "Search"  # category
    title = "Import scraped Instagram data"  # title displayed in UI
    description = "Import Instagram data collected with an external tool such as Zeeschuimer."  # description displayed in UI
    extension = "ndjson"  # extension of result file, used internally and in UI
    is_from_zeeschuimer = True

    # not available as a processor for existing datasets
    accepts = [None]
    references = [
        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok) (also covers usage with Instagram)"
    ]

    # some magic numbers instagram uses
    MEDIA_TYPE_PHOTO = 1
    MEDIA_TYPE_VIDEO = 2
    MEDIA_TYPE_CAROUSEL = 8

    def get_items(self, query):
        """
        Run custom search

        Not available for Instagram

        :param query:  Ignored
        :raises NotImplementedError:  Always
        """
        raise NotImplementedError("Instagram datasets can only be created by importing data from elsewhere")

    @staticmethod
    def _join_values(values):
        """
        Join values into a comma-separated string

        `str.join` only accepts strings, while raw data may contain numeric
        IDs or `None` values; these are converted here.

        :param values:  Iterable of values to join
        :return str:  Comma-separated values; `None` becomes an empty string
        """
        return ",".join("" if value is None else str(value) for value in values)

    @staticmethod
    def _map_location(node):
        """
        Extract location data from a post

        :param dict node:  Post data as received from Instagram
        :return dict:  Dictionary with `name`, `latlong`, `city` and
        `location_id` keys; all empty strings if the post has no location
        """
        location = {"name": "", "latlong": "", "city": "", "location_id": ""}
        raw_location = node.get("location")
        if raw_location:
            location["name"] = raw_location.get("name")
            # `pk` is the key this mapper has always read; tested Graph
            # examples only had `id`, so fall back to that
            location["location_id"] = raw_location.get("pk", raw_location.get("id"))
            # lat/long does not appear in tested Graph examples; kept in case
            # it shows up in the future
            if raw_location.get("lat"):
                location["latlong"] = str(raw_location["lat"]) + "," + str(raw_location["lng"])
            location["city"] = raw_location.get("city")

        return location

    @staticmethod
    def map_item(item):
        """
        Map Instagram item

        Instagram importing is a little bit roundabout since we can expect
        input in two separate and not completely overlapping formats - an "edge
        list" or an "item list", and posts are structured differently between
        those, and do not contain the same data. So we find a middle ground
        here... each format has its own handler function

        :param dict item:  Item to map
        :return:  Mapped item
        :raises MapItemException:  If the item looks like an ad
        """
        link = item.get("link", "")
        if (item.get("product_type", "") == "ad") or \
                (link and link.startswith("https://www.facebook.com/ads/ig_redirect")):
            # These are ads
            raise MapItemException("appears to be Instagram ad, check raw data to confirm and ensure Zeeschuimer is up to date.")

        # items with a __typename are Graph items, except for XDTMediaDict,
        # which is handled by the item list parser
        is_graph_response = "__typename" in item and item["__typename"] not in ("XDTMediaDict",)

        if is_graph_response:
            return MappedItem(SearchInstagram.parse_graph_item(item))
        else:
            return MappedItem(SearchInstagram.parse_itemlist_item(item))

    @staticmethod
    def parse_graph_item(node):
        """
        Parse Instagram post in Graph format

        2025-6-5: potentially legacy format

        :param node:  Data as received from Instagram
        :return dict:  Mapped item
        :raises MapItemException:  If both `user` and `owner` are present but
        have different usernames
        """
        try:
            caption = node["edge_media_to_caption"]["edges"][0]["node"]["text"]
        except IndexError:
            caption = MissingMappedField("")

        # the regex below needs an actual string; the placeholder used for a
        # missing caption is not guaranteed to be one
        caption_text = caption if isinstance(caption, str) else ""

        is_sidecar = node["__typename"] == "GraphSidecar"
        children = [edge["node"] for edge in node["edge_sidecar_to_children"]["edges"]] if is_sidecar else []
        num_media = len(children) if is_sidecar else 1

        # get media url
        # for carousels, get the first media item, for videos, get the video
        # url, for photos, get the highest resolution
        media_node = children[0] if is_sidecar else node

        if media_node["__typename"] == "GraphVideo":
            media_url = media_node["video_url"]
        elif media_node["__typename"] == "GraphImage":
            resources = media_node.get("display_resources", media_node.get("thumbnail_resources"))
            if resources:
                # read the last entry without pop(), so the raw item is not
                # modified by mapping it
                media_url = resources[-1]["src"]
            else:
                media_url = media_node.get("display_url", "")
        else:
            media_url = media_node["display_url"]

        # type, 'mixed' means carousel with video and photo
        # keyed on the type names of individual media; a carousel's own type
        # name (GraphSidecar) is never looked up here
        type_map = {"GraphImage": "photo", "GraphVideo": "video"}
        if not is_sidecar:
            media_type = type_map.get(node["__typename"], "unknown")
        else:
            media_types = {child["__typename"] for child in children}
            media_type = "mixed" if len(media_types) > 1 else type_map.get(media_types.pop(), "unknown")

        location = SearchInstagram._map_location(node)

        no_likes = bool(node.get("like_and_view_counts_disabled"))

        # either may be absent (or null); use empty dicts so lookups below
        # fall through to the other one
        user = node.get("user") or {}
        owner = node.get("owner") or {}
        if user and owner:
            if user.get("username") != owner.get("username"):
                raise MapItemException("Unable to parse item: different user and owner")

        mapped_item = {
            # Post data
            "id": node["shortcode"],
            "post_source_domain": node.get("__import_meta", {}).get("source_platform_url"),  # Zeeschuimer metadata
            "timestamp": datetime.datetime.fromtimestamp(node["taken_at_timestamp"]).strftime("%Y-%m-%d %H:%M:%S"),
            "thread_id": node["shortcode"],
            "parent_id": node["shortcode"],
            "url": "https://www.instagram.com/p/" + node["shortcode"],
            "body": caption,

            # Author data
            "author": user.get("username", owner.get("username", MissingMappedField(""))),
            "author_fullname": user.get("full_name", owner.get("full_name", MissingMappedField(""))),
            # NOTE(review): the item list parser calls this field `verified`
            "is_verified": bool(user.get("is_verified", owner.get("is_verified"))),
            "author_avatar_url": user.get("profile_pic_url", owner.get("profile_pic_url", MissingMappedField(""))),
            # Unable to find graph type posts to test
            "coauthors": MissingMappedField(""),
            "coauthor_fullnames": MissingMappedField(""),
            "coauthor_ids": MissingMappedField(""),

            # Media
            "media_type": media_type,
            "num_media": num_media,
            "image_urls": node["display_url"],
            "media_urls": media_url,

            # Engagement
            "hashtags": ",".join(re.findall(r"#([^\s!@#$%^&*()_+{}:\"|<>?\[\];'\,./`~]+)", caption_text)),
            # Unsure if usertags will work; need data (this could raise it to attention...)
            "usertags": ",".join(
                [u["node"]["user"]["username"] for u in node["edge_media_to_tagged_user"]["edges"]]),
            "likes_hidden": "yes" if no_likes else "no",
            "num_likes": node["edge_media_preview_like"]["count"] if not no_likes else MissingMappedField(0),
            "num_comments": node.get("edge_media_preview_comment", {}).get("count", 0),

            # Location data
            "location_name": location["name"],
            "location_id": location["location_id"],
            "location_latlong": location["latlong"],
            "location_city": location["city"],

            # Metadata
            "unix_timestamp": node["taken_at_timestamp"],
            "missing_media": None
        }

        return mapped_item

    @staticmethod
    def parse_itemlist_item(node):
        """
        Parse Instagram post in 'item list' format

        :param node:  Data as received from Instagram
        :return dict:  Mapped item
        :raises MapItemException:  If both `user` and `owner` are present but
        have different usernames
        """
        if node["media_type"] == SearchInstagram.MEDIA_TYPE_CAROUSEL:
            media_nodes = node["carousel_media"]
        else:
            media_nodes = [node]
        num_media = len(media_nodes)

        if "caption" not in node:
            caption = MissingMappedField("")
        elif not node["caption"]:
            caption = ""
        else:
            caption = node["caption"]["text"]

        # the regex below needs an actual string; the placeholder used for a
        # missing caption is not guaranteed to be one
        caption_text = caption if isinstance(caption, str) else ""

        # get media urls
        # for videos, get the first listed version; for photos, the first
        # listed candidate
        display_urls = []
        media_urls = []
        missing_media = None
        type_map = {SearchInstagram.MEDIA_TYPE_PHOTO: "photo", SearchInstagram.MEDIA_TYPE_VIDEO: "video"}
        media_types = set()

        for media_node in media_nodes:
            if media_node["media_type"] == SearchInstagram.MEDIA_TYPE_VIDEO:
                # Videos
                media_urls.append(media_node["video_versions"][0]["url"])
                if "image_versions2" in media_node:
                    display_urls.append(media_node["image_versions2"]["candidates"][0]["url"])
                else:
                    # no image links at all :-/
                    # video is all we have
                    display_urls.append(media_node["video_versions"][0]["url"])

            elif media_node["media_type"] == SearchInstagram.MEDIA_TYPE_PHOTO and media_node.get("image_versions2"):
                # Images
                media_url = media_node["image_versions2"]["candidates"][0]["url"]
                display_urls.append(media_url)
                media_urls.append(media_url)
            else:
                missing_media = MissingMappedField("")

            media_types.add(type_map.get(media_node["media_type"], "unknown"))

        # type, 'mixed' means carousel with video and photo
        if len(media_types) > 1:
            media_type = "mixed"
        elif media_types:
            media_type = media_types.pop()
        else:
            # carousel without any media in it
            media_type = "unknown"

        if "comment_count" in node:
            num_comments = node["comment_count"]
        elif isinstance(node.get("comments"), list):
            num_comments = len(node["comments"])
        else:
            # -1 marks an unknown number of comments
            num_comments = -1

        location = SearchInstagram._map_location(node)

        user = node.get("user", {})
        owner = node.get("owner", {})
        if user and owner:
            if user.get("username") != owner.get("username"):
                raise MapItemException("Unable to parse item: different user and owner")

        # Instagram posts also allow 'Collabs' with up to one co-author
        coauthor_nodes = node.get("coauthor_producers") or []
        coauthors = [coauthor_node.get("username") for coauthor_node in coauthor_nodes]
        coauthor_fullnames = [coauthor_node.get("full_name") for coauthor_node in coauthor_nodes]
        coauthor_ids = [coauthor_node.get("id") for coauthor_node in coauthor_nodes]

        no_likes = bool(node.get("like_and_view_counts_disabled"))

        # usertags
        if "usertags" in node:
            usertags = ",".join([tag["user"]["username"] for tag in node["usertags"]["in"]]) if node["usertags"] else ""
        else:
            # Not always included; MissingMappedField may be more appropriate, but it flags virtually all posts without tags (some do return `None`)
            usertags = ""

        mapped_item = {
            # Post and caption
            "id": node["code"],
            "post_source_domain": node.get("__import_meta", {}).get("source_platform_url"),  # Zeeschuimer metadata
            "timestamp": datetime.datetime.fromtimestamp(node["taken_at"]).strftime("%Y-%m-%d %H:%M:%S"),
            "thread_id": node["code"],
            "parent_id": node["code"],
            "url": "https://www.instagram.com/p/" + node["code"],
            "body": caption,

            # Authors
            "author": user.get("username", owner.get("username", MissingMappedField(""))),
            "author_fullname": user.get("full_name", owner.get("full_name", MissingMappedField(""))),
            # NOTE(review): the Graph parser calls this field `is_verified`
            "verified": True if user.get("is_verified") else False,
            "author_avatar_url": user.get("profile_pic_url", owner.get("profile_pic_url", MissingMappedField(""))),
            # values may be numeric or None in raw data, which a plain join
            # would choke on
            "coauthors": SearchInstagram._join_values(coauthors),
            "coauthor_fullnames": SearchInstagram._join_values(coauthor_fullnames),
            "coauthor_ids": SearchInstagram._join_values(coauthor_ids),

            # Media
            "media_type": media_type,
            "num_media": num_media,
            "image_urls": ",".join(display_urls),
            "media_urls": ",".join(media_urls),

            # Engagement
            # both the ASCII caret and the look-alike modifier letter end a hashtag
            "hashtags": ",".join(re.findall(r"#([^\s!@#$%^ˆ&*()_+{}:\"|<>?\[\];'\,./`~'‘’]+)", caption_text)),
            "usertags": usertags,
            "likes_hidden": "yes" if no_likes else "no",
            "num_likes": node["like_count"] if not no_likes else MissingMappedField(0),
            "num_comments": num_comments,

            # Location
            "location_name": location["name"],
            "location_id": location["location_id"],
            "location_latlong": location["latlong"],
            "location_city": location["city"],

            # Metadata
            "unix_timestamp": node["taken_at"],
            "missing_media": missing_media,  # This denotes media that is unable to be mapped and is otherwise None
        }

        return mapped_item
class SearchInstagram(Search):
    """
    Import scraped Instagram data

    Maps items captured with an external tool (e.g. Zeeschuimer) to flat
    dictionaries. Two raw formats are handled: the 'Graph' format
    (`parse_graph_item`) and the 'item list' format (`parse_itemlist_item`).
    """
    type = "instagram-search"  # job ID
    category = "Search"  # category
    title = "Import scraped Instagram data"  # title displayed in UI
    description = "Import Instagram data collected with an external tool such as Zeeschuimer."  # description displayed in UI
    extension = "ndjson"  # extension of result file, used internally and in UI
    is_from_zeeschuimer = True

    # not available as a processor for existing datasets
    accepts = [None]
    references = [
        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok) (also covers usage with Instagram)"
    ]

    # some magic numbers instagram uses
    MEDIA_TYPE_PHOTO = 1
    MEDIA_TYPE_VIDEO = 2
    MEDIA_TYPE_CAROUSEL = 8

    def get_items(self, query):
        """
        Run custom search

        Not available for Instagram

        :param query:  Ignored
        :raises NotImplementedError:  Always
        """
        raise NotImplementedError("Instagram datasets can only be created by importing data from elsewhere")

    @staticmethod
    def _join_values(values):
        """
        Join values into a comma-separated string

        `str.join` only accepts strings, while raw data may contain numeric
        IDs or `None` values; these are converted here.

        :param values:  Iterable of values to join
        :return str:  Comma-separated values; `None` becomes an empty string
        """
        return ",".join("" if value is None else str(value) for value in values)

    @staticmethod
    def _map_location(node):
        """
        Extract location data from a post

        :param dict node:  Post data as received from Instagram
        :return dict:  Dictionary with `name`, `latlong`, `city` and
        `location_id` keys; all empty strings if the post has no location
        """
        location = {"name": "", "latlong": "", "city": "", "location_id": ""}
        raw_location = node.get("location")
        if raw_location:
            location["name"] = raw_location.get("name")
            # `pk` is the key this mapper has always read; tested Graph
            # examples only had `id`, so fall back to that
            location["location_id"] = raw_location.get("pk", raw_location.get("id"))
            # lat/long does not appear in tested Graph examples; kept in case
            # it shows up in the future
            if raw_location.get("lat"):
                location["latlong"] = str(raw_location["lat"]) + "," + str(raw_location["lng"])
            location["city"] = raw_location.get("city")

        return location

    @staticmethod
    def map_item(item):
        """
        Map Instagram item

        Instagram importing is a little bit roundabout since we can expect
        input in two separate and not completely overlapping formats - an "edge
        list" or an "item list", and posts are structured differently between
        those, and do not contain the same data. So we find a middle ground
        here... each format has its own handler function

        :param dict item:  Item to map
        :return:  Mapped item
        :raises MapItemException:  If the item looks like an ad
        """
        link = item.get("link", "")
        if (item.get("product_type", "") == "ad") or \
                (link and link.startswith("https://www.facebook.com/ads/ig_redirect")):
            # These are ads
            raise MapItemException("appears to be Instagram ad, check raw data to confirm and ensure Zeeschuimer is up to date.")

        # items with a __typename are Graph items, except for XDTMediaDict,
        # which is handled by the item list parser
        is_graph_response = "__typename" in item and item["__typename"] not in ("XDTMediaDict",)

        if is_graph_response:
            return MappedItem(SearchInstagram.parse_graph_item(item))
        else:
            return MappedItem(SearchInstagram.parse_itemlist_item(item))

    @staticmethod
    def parse_graph_item(node):
        """
        Parse Instagram post in Graph format

        2025-6-5: potentially legacy format

        :param node:  Data as received from Instagram
        :return dict:  Mapped item
        :raises MapItemException:  If both `user` and `owner` are present but
        have different usernames
        """
        try:
            caption = node["edge_media_to_caption"]["edges"][0]["node"]["text"]
        except IndexError:
            caption = MissingMappedField("")

        # the regex below needs an actual string; the placeholder used for a
        # missing caption is not guaranteed to be one
        caption_text = caption if isinstance(caption, str) else ""

        is_sidecar = node["__typename"] == "GraphSidecar"
        children = [edge["node"] for edge in node["edge_sidecar_to_children"]["edges"]] if is_sidecar else []
        num_media = len(children) if is_sidecar else 1

        # get media url
        # for carousels, get the first media item, for videos, get the video
        # url, for photos, get the highest resolution
        media_node = children[0] if is_sidecar else node

        if media_node["__typename"] == "GraphVideo":
            media_url = media_node["video_url"]
        elif media_node["__typename"] == "GraphImage":
            resources = media_node.get("display_resources", media_node.get("thumbnail_resources"))
            if resources:
                # read the last entry without pop(), so the raw item is not
                # modified by mapping it
                media_url = resources[-1]["src"]
            else:
                media_url = media_node.get("display_url", "")
        else:
            media_url = media_node["display_url"]

        # type, 'mixed' means carousel with video and photo
        # keyed on the type names of individual media; a carousel's own type
        # name (GraphSidecar) is never looked up here
        type_map = {"GraphImage": "photo", "GraphVideo": "video"}
        if not is_sidecar:
            media_type = type_map.get(node["__typename"], "unknown")
        else:
            media_types = {child["__typename"] for child in children}
            media_type = "mixed" if len(media_types) > 1 else type_map.get(media_types.pop(), "unknown")

        location = SearchInstagram._map_location(node)

        no_likes = bool(node.get("like_and_view_counts_disabled"))

        # either may be absent (or null); use empty dicts so lookups below
        # fall through to the other one
        user = node.get("user") or {}
        owner = node.get("owner") or {}
        if user and owner:
            if user.get("username") != owner.get("username"):
                raise MapItemException("Unable to parse item: different user and owner")

        mapped_item = {
            # Post data
            "id": node["shortcode"],
            "post_source_domain": node.get("__import_meta", {}).get("source_platform_url"),  # Zeeschuimer metadata
            "timestamp": datetime.datetime.fromtimestamp(node["taken_at_timestamp"]).strftime("%Y-%m-%d %H:%M:%S"),
            "thread_id": node["shortcode"],
            "parent_id": node["shortcode"],
            "url": "https://www.instagram.com/p/" + node["shortcode"],
            "body": caption,

            # Author data
            "author": user.get("username", owner.get("username", MissingMappedField(""))),
            "author_fullname": user.get("full_name", owner.get("full_name", MissingMappedField(""))),
            # NOTE(review): the item list parser calls this field `verified`
            "is_verified": bool(user.get("is_verified", owner.get("is_verified"))),
            "author_avatar_url": user.get("profile_pic_url", owner.get("profile_pic_url", MissingMappedField(""))),
            # Unable to find graph type posts to test
            "coauthors": MissingMappedField(""),
            "coauthor_fullnames": MissingMappedField(""),
            "coauthor_ids": MissingMappedField(""),

            # Media
            "media_type": media_type,
            "num_media": num_media,
            "image_urls": node["display_url"],
            "media_urls": media_url,

            # Engagement
            "hashtags": ",".join(re.findall(r"#([^\s!@#$%^&*()_+{}:\"|<>?\[\];'\,./`~]+)", caption_text)),
            # Unsure if usertags will work; need data (this could raise it to attention...)
            "usertags": ",".join(
                [u["node"]["user"]["username"] for u in node["edge_media_to_tagged_user"]["edges"]]),
            "likes_hidden": "yes" if no_likes else "no",
            "num_likes": node["edge_media_preview_like"]["count"] if not no_likes else MissingMappedField(0),
            "num_comments": node.get("edge_media_preview_comment", {}).get("count", 0),

            # Location data
            "location_name": location["name"],
            "location_id": location["location_id"],
            "location_latlong": location["latlong"],
            "location_city": location["city"],

            # Metadata
            "unix_timestamp": node["taken_at_timestamp"],
            "missing_media": None
        }

        return mapped_item

    @staticmethod
    def parse_itemlist_item(node):
        """
        Parse Instagram post in 'item list' format

        :param node:  Data as received from Instagram
        :return dict:  Mapped item
        :raises MapItemException:  If both `user` and `owner` are present but
        have different usernames
        """
        if node["media_type"] == SearchInstagram.MEDIA_TYPE_CAROUSEL:
            media_nodes = node["carousel_media"]
        else:
            media_nodes = [node]
        num_media = len(media_nodes)

        if "caption" not in node:
            caption = MissingMappedField("")
        elif not node["caption"]:
            caption = ""
        else:
            caption = node["caption"]["text"]

        # the regex below needs an actual string; the placeholder used for a
        # missing caption is not guaranteed to be one
        caption_text = caption if isinstance(caption, str) else ""

        # get media urls
        # for videos, get the first listed version; for photos, the first
        # listed candidate
        display_urls = []
        media_urls = []
        missing_media = None
        type_map = {SearchInstagram.MEDIA_TYPE_PHOTO: "photo", SearchInstagram.MEDIA_TYPE_VIDEO: "video"}
        media_types = set()

        for media_node in media_nodes:
            if media_node["media_type"] == SearchInstagram.MEDIA_TYPE_VIDEO:
                # Videos
                media_urls.append(media_node["video_versions"][0]["url"])
                if "image_versions2" in media_node:
                    display_urls.append(media_node["image_versions2"]["candidates"][0]["url"])
                else:
                    # no image links at all :-/
                    # video is all we have
                    display_urls.append(media_node["video_versions"][0]["url"])

            elif media_node["media_type"] == SearchInstagram.MEDIA_TYPE_PHOTO and media_node.get("image_versions2"):
                # Images
                media_url = media_node["image_versions2"]["candidates"][0]["url"]
                display_urls.append(media_url)
                media_urls.append(media_url)
            else:
                missing_media = MissingMappedField("")

            media_types.add(type_map.get(media_node["media_type"], "unknown"))

        # type, 'mixed' means carousel with video and photo
        if len(media_types) > 1:
            media_type = "mixed"
        elif media_types:
            media_type = media_types.pop()
        else:
            # carousel without any media in it
            media_type = "unknown"

        if "comment_count" in node:
            num_comments = node["comment_count"]
        elif isinstance(node.get("comments"), list):
            num_comments = len(node["comments"])
        else:
            # -1 marks an unknown number of comments
            num_comments = -1

        location = SearchInstagram._map_location(node)

        user = node.get("user", {})
        owner = node.get("owner", {})
        if user and owner:
            if user.get("username") != owner.get("username"):
                raise MapItemException("Unable to parse item: different user and owner")

        # Instagram posts also allow 'Collabs' with up to one co-author
        coauthor_nodes = node.get("coauthor_producers") or []
        coauthors = [coauthor_node.get("username") for coauthor_node in coauthor_nodes]
        coauthor_fullnames = [coauthor_node.get("full_name") for coauthor_node in coauthor_nodes]
        coauthor_ids = [coauthor_node.get("id") for coauthor_node in coauthor_nodes]

        no_likes = bool(node.get("like_and_view_counts_disabled"))

        # usertags
        if "usertags" in node:
            usertags = ",".join([tag["user"]["username"] for tag in node["usertags"]["in"]]) if node["usertags"] else ""
        else:
            # Not always included; MissingMappedField may be more appropriate, but it flags virtually all posts without tags (some do return `None`)
            usertags = ""

        mapped_item = {
            # Post and caption
            "id": node["code"],
            "post_source_domain": node.get("__import_meta", {}).get("source_platform_url"),  # Zeeschuimer metadata
            "timestamp": datetime.datetime.fromtimestamp(node["taken_at"]).strftime("%Y-%m-%d %H:%M:%S"),
            "thread_id": node["code"],
            "parent_id": node["code"],
            "url": "https://www.instagram.com/p/" + node["code"],
            "body": caption,

            # Authors
            "author": user.get("username", owner.get("username", MissingMappedField(""))),
            "author_fullname": user.get("full_name", owner.get("full_name", MissingMappedField(""))),
            # NOTE(review): the Graph parser calls this field `is_verified`
            "verified": True if user.get("is_verified") else False,
            "author_avatar_url": user.get("profile_pic_url", owner.get("profile_pic_url", MissingMappedField(""))),
            # values may be numeric or None in raw data, which a plain join
            # would choke on
            "coauthors": SearchInstagram._join_values(coauthors),
            "coauthor_fullnames": SearchInstagram._join_values(coauthor_fullnames),
            "coauthor_ids": SearchInstagram._join_values(coauthor_ids),

            # Media
            "media_type": media_type,
            "num_media": num_media,
            "image_urls": ",".join(display_urls),
            "media_urls": ",".join(media_urls),

            # Engagement
            # both the ASCII caret and the look-alike modifier letter end a hashtag
            "hashtags": ",".join(re.findall(r"#([^\s!@#$%^ˆ&*()_+{}:\"|<>?\[\];'\,./`~'‘’]+)", caption_text)),
            "usertags": usertags,
            "likes_hidden": "yes" if no_likes else "no",
            "num_likes": node["like_count"] if not no_likes else MissingMappedField(0),
            "num_comments": num_comments,

            # Location
            "location_name": location["name"],
            "location_id": location["location_id"],
            "location_latlong": location["latlong"],
            "location_city": location["city"],

            # Metadata
            "unix_timestamp": node["taken_at"],
            "missing_media": missing_media,  # This denotes media that is unable to be mapped and is otherwise None
        }

        return mapped_item
Import scraped Instagram data
references =
['[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)', '[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok) (also covers usage with Instagram)']
def
get_items(self, query):
def get_items(self, query):
    """
    Refuse to run a search

    Instagram data cannot be collected from here; datasets are only ever
    created by importing data that was captured elsewhere.

    :param query:  Ignored
    :raises NotImplementedError:  Always
    """
    reason = "Instagram datasets can only be created by importing data from elsewhere"
    raise NotImplementedError(reason)
Run custom search
Not available for Instagram
@staticmethod
def
map_item(item):
@staticmethod
def map_item(item):
    """
    Map a raw Instagram item

    Raw items arrive in one of two formats that only partially overlap - a
    Graph ("edge list") format and an "item list" format. This method rejects
    ads and hands everything else to the parser for the format the item is in.

    :param dict item:  Item to map
    :return:  Mapped item
    :raises MapItemException:  If the item looks like an ad
    """
    ad_redirect_prefix = "https://www.facebook.com/ads/ig_redirect"
    link = item.get("link", "")

    flagged_as_ad = item.get("product_type", "") == "ad"
    if flagged_as_ad or (link and link.startswith(ad_redirect_prefix)):
        # These are ads
        raise MapItemException("appears to be Instagram ad, check raw data to confirm and ensure Zeeschuimer is up to date.")

    # only items with a __typename other than XDTMediaDict are in Graph
    # format; anything else goes to the item list parser
    if "__typename" not in item or item["__typename"] in ("XDTMediaDict",):
        parse = SearchInstagram.parse_itemlist_item
    else:
        parse = SearchInstagram.parse_graph_item

    return MappedItem(parse(item))
Map Instagram item
Instagram importing is a little bit roundabout since we can expect input in two separate and not completely overlapping formats - an "edge list" or an "item list", and posts are structured differently between those, and do not contain the same data. So we find a middle ground here... each format has its own handler function
Parameters
- dict item: Item to map
Returns
Mapped item
@staticmethod
def
parse_graph_item(node):
@staticmethod
def parse_graph_item(node):
    """
    Parse Instagram post in Graph format

    2025-6-5: potentially legacy format

    :param node:  Data as received from Instagram
    :return dict:  Mapped item
    :raises MapItemException:  If both `user` and `owner` are present but
    have different usernames
    """
    try:
        caption = node["edge_media_to_caption"]["edges"][0]["node"]["text"]
    except IndexError:
        caption = MissingMappedField("")

    # the regex below needs an actual string; the placeholder used for a
    # missing caption is not guaranteed to be one
    caption_text = caption if isinstance(caption, str) else ""

    is_sidecar = node["__typename"] == "GraphSidecar"
    children = [edge["node"] for edge in node["edge_sidecar_to_children"]["edges"]] if is_sidecar else []
    num_media = len(children) if is_sidecar else 1

    # get media url
    # for carousels, get the first media item, for videos, get the video
    # url, for photos, get the highest resolution
    media_node = children[0] if is_sidecar else node

    if media_node["__typename"] == "GraphVideo":
        media_url = media_node["video_url"]
    elif media_node["__typename"] == "GraphImage":
        resources = media_node.get("display_resources", media_node.get("thumbnail_resources"))
        if resources:
            # read the last entry without pop(), so the raw item is not
            # modified by mapping it
            media_url = resources[-1]["src"]
        else:
            media_url = media_node.get("display_url", "")
    else:
        media_url = media_node["display_url"]

    # type, 'mixed' means carousel with video and photo
    # keyed on the type names of individual media; a carousel's own type
    # name (GraphSidecar) is never looked up here
    type_map = {"GraphImage": "photo", "GraphVideo": "video"}
    if not is_sidecar:
        media_type = type_map.get(node["__typename"], "unknown")
    else:
        media_types = {child["__typename"] for child in children}
        media_type = "mixed" if len(media_types) > 1 else type_map.get(media_types.pop(), "unknown")

    location = {"name": "", "latlong": "", "city": "", "location_id": ""}
    # location has 'id', 'has_public_page', 'name', and 'slug' keys in tested examples; no lat long or "city" though name seems
    raw_location = node.get("location")
    if raw_location:
        location["name"] = raw_location.get("name")
        # `pk` is the key this mapper has always read; fall back to the `id`
        # key seen in tested examples
        location["location_id"] = raw_location.get("pk", raw_location.get("id"))
        # Leaving this though it does not appear to be used in this type; maybe we'll be surprised in the future...
        if raw_location.get("lat"):
            location["latlong"] = str(raw_location["lat"]) + "," + str(raw_location["lng"])
        location["city"] = raw_location.get("city")

    no_likes = bool(node.get("like_and_view_counts_disabled"))

    # either may be absent (or null); use empty dicts so lookups below fall
    # through to the other one
    user = node.get("user") or {}
    owner = node.get("owner") or {}
    if user and owner:
        if user.get("username") != owner.get("username"):
            raise MapItemException("Unable to parse item: different user and owner")

    mapped_item = {
        # Post data
        "id": node["shortcode"],
        "post_source_domain": node.get("__import_meta", {}).get("source_platform_url"),  # Zeeschuimer metadata
        "timestamp": datetime.datetime.fromtimestamp(node["taken_at_timestamp"]).strftime("%Y-%m-%d %H:%M:%S"),
        "thread_id": node["shortcode"],
        "parent_id": node["shortcode"],
        "url": "https://www.instagram.com/p/" + node["shortcode"],
        "body": caption,

        # Author data
        "author": user.get("username", owner.get("username", MissingMappedField(""))),
        "author_fullname": user.get("full_name", owner.get("full_name", MissingMappedField(""))),
        "is_verified": bool(user.get("is_verified", owner.get("is_verified"))),
        "author_avatar_url": user.get("profile_pic_url", owner.get("profile_pic_url", MissingMappedField(""))),
        # Unable to find graph type posts to test
        "coauthors": MissingMappedField(""),
        "coauthor_fullnames": MissingMappedField(""),
        "coauthor_ids": MissingMappedField(""),

        # Media
        "media_type": media_type,
        "num_media": num_media,
        "image_urls": node["display_url"],
        "media_urls": media_url,

        # Engagement
        "hashtags": ",".join(re.findall(r"#([^\s!@#$%^&*()_+{}:\"|<>?\[\];'\,./`~]+)", caption_text)),
        # Unsure if usertags will work; need data (this could raise it to attention...)
        "usertags": ",".join(
            [u["node"]["user"]["username"] for u in node["edge_media_to_tagged_user"]["edges"]]),
        "likes_hidden": "yes" if no_likes else "no",
        "num_likes": node["edge_media_preview_like"]["count"] if not no_likes else MissingMappedField(0),
        "num_comments": node.get("edge_media_preview_comment", {}).get("count", 0),

        # Location data
        "location_name": location["name"],
        "location_id": location["location_id"],
        "location_latlong": location["latlong"],
        "location_city": location["city"],

        # Metadata
        "unix_timestamp": node["taken_at_timestamp"],
        "missing_media": None
    }

    return mapped_item
Parse Instagram post in Graph format
2025-6-5: potentially legacy format
Parameters
- node: Data as received from Instagram
Returns
Mapped item
@staticmethod
def parse_itemlist_item(node):
@staticmethod
def parse_itemlist_item(node):
    """
    Parse Instagram post in 'item list' format

    Flattens one raw item into a dictionary of post, author, media,
    engagement and location fields. Fields that cannot be determined from
    the item are set to a `MissingMappedField` carrying a fallback value.

    :param node:  Data as received from Instagram
    :return dict:  Mapped item
    :raises MapItemException:  If the item has both a `user` and an `owner`
                               and their usernames differ
    """
    is_carousel = node["media_type"] == SearchInstagram.MEDIA_TYPE_CAROUSEL
    num_media = len(node["carousel_media"]) if is_carousel else 1

    # three cases: no caption key at all (flag as missing), an empty/null
    # caption (empty body), or a caption object carrying the text
    if "caption" not in node:
        caption = MissingMappedField("")
    elif not node.get("caption"):
        caption = ""
    else:
        caption = node["caption"]["text"]

    # the hashtag regex needs an actual string; a missing caption is
    # represented by a MissingMappedField object, which re.findall cannot
    # search, so fall back to an empty string in that case
    caption_text = caption if isinstance(caption, str) else ""

    # get media urls
    display_urls = []
    media_urls = []
    missing_media = None
    type_map = {SearchInstagram.MEDIA_TYPE_PHOTO: "photo", SearchInstagram.MEDIA_TYPE_VIDEO: "video"}
    media_types = set()

    # carousels contribute every one of their media items; any other post
    # is treated as a single media item (the post node itself)
    media_nodes = node["carousel_media"] if is_carousel else [node]

    for media_node in media_nodes:
        if media_node["media_type"] == SearchInstagram.MEDIA_TYPE_VIDEO:
            # Videos: first listed video version is the media URL
            video_url = media_node["video_versions"][0]["url"]
            media_urls.append(video_url)
            if "image_versions2" in media_node:
                display_urls.append(media_node["image_versions2"]["candidates"][0]["url"])
            else:
                # no image links at all :-/
                # video is all we have
                display_urls.append(video_url)

        elif media_node["media_type"] == SearchInstagram.MEDIA_TYPE_PHOTO and media_node.get("image_versions2"):
            # Images: first listed candidate (presumably the highest
            # resolution - verify against real data)
            image_url = media_node["image_versions2"]["candidates"][0]["url"]
            display_urls.append(image_url)
            media_urls.append(image_url)
        else:
            # unknown media type, or a photo without image candidates
            missing_media = MissingMappedField("")

        media_types.add(type_map.get(media_node["media_type"], "unknown"))

    # type, 'mixed' means carousel with more than one kind of media
    media_type = "mixed" if len(media_types) > 1 else media_types.pop()

    # prefer the explicit count, then the length of an embedded comment
    # list; -1 signals that neither was available
    if "comment_count" in node:
        num_comments = node["comment_count"]
    elif isinstance(node.get("comments"), list):
        num_comments = len(node["comments"])
    else:
        num_comments = -1

    location = {"name": "", "latlong": "", "city": "", "location_id": ""}
    if node.get("location"):
        raw_location = node["location"]
        location["name"] = raw_location.get("name")
        location["location_id"] = raw_location.get("pk")
        if raw_location.get("lat"):
            location["latlong"] = str(raw_location["lat"]) + "," + str(raw_location["lng"])
        else:
            location["latlong"] = ""
        location["city"] = raw_location.get("city")

    # `or {}` also covers keys that are present but null, which would
    # otherwise break the `.get()` lookups further down
    user = node.get("user") or {}
    owner = node.get("owner") or {}
    if user and owner and user.get("username") != owner.get("username"):
        raise MapItemException("Unable to parse item: different user and owner")

    # Instagram posts also allow 'Collabs' with co-authors
    coauthors = []
    coauthor_fullnames = []
    coauthor_ids = []
    for coauthor_node in node.get("coauthor_producers") or []:
        # str.join only accepts strings: stringify every value and map
        # missing ones to "" so the three lists stay index-aligned
        for target, key in (
            (coauthors, "username"),
            (coauthor_fullnames, "full_name"),
            (coauthor_ids, "id"),
        ):
            value = coauthor_node.get(key)
            target.append("" if value is None else str(value))

    no_likes = bool(node.get("like_and_view_counts_disabled"))

    # usertags
    if "usertags" in node:
        if node["usertags"]:
            usertags = ",".join([tag["user"]["username"] for tag in node["usertags"]["in"]])
        else:
            usertags = ""
    else:
        # Not always included; MissingMappedField may be more appropriate, but it flags virtually all posts without tags (some do return `None`)
        usertags = ""

    # NOTE(review): the character class below contains "ˆ" (U+02C6), not
    # the ASCII caret "^" - possibly a typo; left unchanged, please confirm
    hashtags = re.findall(r"#([^\s!@#$%ˆ&*()_+{}:\"|<>?\[\];'\,./`~'‘’]+)", caption_text)

    mapped_item = {
        # Post and caption
        "id": node["code"],
        "post_source_domain": node.get("__import_meta", {}).get("source_platform_url"),  # Zeeschuimer metadata
        "timestamp": datetime.datetime.fromtimestamp(node["taken_at"]).strftime("%Y-%m-%d %H:%M:%S"),
        "thread_id": node["code"],
        "parent_id": node["code"],
        "url": "https://www.instagram.com/p/" + node["code"],
        "body": caption,

        # Authors
        "author": user.get("username", owner.get("username", MissingMappedField(""))),
        "author_fullname": user.get("full_name", owner.get("full_name", MissingMappedField(""))),
        # NOTE(review): the graph-format parser in this file emits this
        # value under "is_verified"; key kept as-is for compatibility
        "verified": bool(user.get("is_verified")),
        "author_avatar_url": user.get("profile_pic_url", owner.get("profile_pic_url", MissingMappedField(""))),
        "coauthors": ",".join(coauthors),
        "coauthor_fullnames": ",".join(coauthor_fullnames),
        "coauthor_ids": ",".join(coauthor_ids),

        # Media
        "media_type": media_type,
        "num_media": num_media,
        "image_urls": ",".join(display_urls),
        "media_urls": ",".join(media_urls),

        # Engagement
        "hashtags": ",".join(hashtags),
        "usertags": usertags,
        "likes_hidden": "yes" if no_likes else "no",
        "num_likes": node["like_count"] if not no_likes else MissingMappedField(0),
        "num_comments": num_comments,

        # Location
        "location_name": location["name"],
        "location_id": location["location_id"],
        "location_latlong": location["latlong"],
        "location_city": location["city"],

        # Metadata
        "unix_timestamp": node["taken_at"],
        "missing_media": missing_media,  # This denotes media that is unable to be mapped and is otherwise None
    }

    return mapped_item
Parse Instagram post in 'item list' format
Parameters
- node: Data as received from Instagram
Returns
Mapped item
Inherited Members
- backend.lib.worker.BasicWorker
- BasicWorker
- INTERRUPT_NONE
- INTERRUPT_RETRY
- INTERRUPT_CANCEL
- queue
- log
- manager
- interrupted
- modules
- init_time
- name
- run
- clean_up
- request_interrupt
- is_4cat_class
- backend.lib.search.Search
- max_workers
- prefix
- return_cols
- import_error_count
- import_warning_count
- process
- search
- import_from_file
- items_to_csv
- items_to_ndjson
- items_to_archive
- backend.lib.processor.BasicProcessor
- db
- job
- dataset
- owner
- source_dataset
- source_file
- config
- is_running_in_preset
- filepath
- work
- after_process
- remove_files
- abort
- iterate_proxied_requests
- push_proxied_request
- flush_proxied_requests
- iterate_archive_contents
- unpack_archive_contents
- extract_archived_file_by_name
- write_csv_items_and_finish
- write_archive_and_finish
- create_standalone
- save_annotations
- map_item_method_available
- get_mapped_item
- is_filter
- get_options
- get_status
- is_top_dataset
- is_from_collector
- get_extension
- is_rankable
- exclude_followup_processors
- is_4cat_processor