datasources.instagram.search_instagram
Import scraped Instagram data
It's prohibitively difficult to scrape data from Instagram within 4CAT itself due to its aggressive rate limiting. Instead, import data collected elsewhere.
"""
Import scraped Instagram data

It's prohibitively difficult to scrape data from Instagram within 4CAT itself
due to its aggressive rate limiting. Instead, import data collected elsewhere.
"""
import datetime
import re

from backend.lib.search import Search
from common.lib.item_mapping import MappedItem, MissingMappedField
from common.lib.exceptions import MapItemException
from common.lib.helpers import normalize_url_encoding


class SearchInstagram(Search):
    """
    Import scraped Instagram data

    Items are not collected by this class; it only maps items that were
    collected elsewhere to a flat dictionary. Two input formats are handled,
    each by its own parser: the 'graph' format (`parse_graph_item`) and the
    'item list' format (`parse_itemlist_item`).
    """
    type = "instagram-search"  # job ID
    category = "Search"  # category
    title = "Import scraped Instagram data"  # title displayed in UI
    description = "Import Instagram data collected with an external tool such as Zeeschuimer."  # description displayed in UI
    extension = "ndjson"  # extension of result file, used internally and in UI
    is_from_zeeschuimer = True

    # not available as a processor for existing datasets
    accepts = [None]
    references = [
        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok) (also covers usage with Instagram)"
    ]

    # some magic numbers instagram uses
    MEDIA_TYPE_PHOTO = 1
    MEDIA_TYPE_VIDEO = 2
    MEDIA_TYPE_CAROUSEL = 8

    # One hashtag pattern shared by both parsers. A tag ends at whitespace or
    # at any of the listed punctuation characters; both the ASCII caret and
    # the look-alike modifier circumflex are listed, as are typographic
    # single quotes.
    _HASHTAG_PATTERN = re.compile(r"#([^\s!@#$%^ˆ&*()_+{}:\"|<>?\[\];'\,./`~‘’]+)")

    def get_items(self, query):
        """
        Run custom search

        Not available for Instagram

        :param query:  Ignored
        :raises NotImplementedError:  Always
        """
        raise NotImplementedError("Instagram datasets can only be created by importing data from elsewhere")

    @staticmethod
    def map_item(item):
        """
        Map Instagram item

        Instagram importing is a little bit roundabout since we can expect
        input in two separate and not completely overlapping formats - an "edge
        list" or an "item list", and posts are structured differently between
        those, and do not contain the same data. So we find a middle ground
        here... each format has its own handler function

        :param dict item:  Item to map
        :return:  Mapped item
        :raises MapItemException:  If the item looks like an ad
        """
        link = item.get("link", "")
        if (item.get("product_type", "") == "ad") or \
                (link and link.startswith("https://www.facebook.com/ads/ig_redirect")):
            # These are ads
            raise MapItemException("appears to be Instagram ad, check raw data to confirm and ensure Zeeschuimer is up to date.")

        # items with a `__typename` are graph items, except for XDTMediaDict
        # items, which are handled as item list items
        is_graph_response = "__typename" in item and item["__typename"] not in ("XDTMediaDict",)

        if is_graph_response:
            return MappedItem(SearchInstagram.parse_graph_item(item))
        else:
            return MappedItem(SearchInstagram.parse_itemlist_item(item))

    @staticmethod
    def _extract_hashtags(caption):
        """
        Extract hashtags from a caption

        :param caption:  Caption text. Anything that is not a string (e.g. a
        `MissingMappedField` placeholder) is treated as containing no hashtags,
        since the regular expression can only be applied to a string.
        :return str:  Comma-separated hashtags, without the leading `#`
        """
        if not isinstance(caption, str):
            return ""

        return ",".join(SearchInstagram._HASHTAG_PATTERN.findall(caption))

    @staticmethod
    def _parse_location(node):
        """
        Extract location data from a post

        :param dict node:  Post data
        :return dict:  Dictionary with `name`, `latlong`, `city` and
        `location_id` keys; all empty strings if the post has no location
        """
        location = {"name": "", "latlong": "", "city": "", "location_id": ""}
        raw_location = node.get("location")
        if not raw_location:
            return location

        location["name"] = raw_location.get("name")
        # graph items have been seen with an 'id' rather than a 'pk' key
        location["location_id"] = raw_location.get("pk", raw_location.get("id"))
        if raw_location.get("lat"):
            location["latlong"] = str(raw_location["lat"]) + "," + str(raw_location["lng"])
        location["city"] = raw_location.get("city")

        return location

    @staticmethod
    def _check_user_matches_owner(user, owner):
        """
        Make sure a post's `user` and `owner` refer to the same account

        The check only applies if both are present. A matching ID is enough;
        owner may contain less info (e.g. no full name, username, etc.), so
        the username is only compared if the IDs differ.

        :param dict user:  Post `user` data, may be empty
        :param dict owner:  Post `owner` data, may be empty
        :raises MapItemException:  If both ID and username differ
        """
        if not user or not owner:
            return

        if owner.get("id") == user.get("id"):
            return

        if user.get("username") != owner.get("username"):
            raise MapItemException("Unable to parse item: different user and owner")

    @staticmethod
    def parse_graph_item(node):
        """
        Parse Instagram post in Graph format

        2025-6-5: potentially legacy format

        :param node:  Data as received from Instagram
        :return dict:  Mapped item
        :raises MapItemException:  If `user` and `owner` are different accounts
        """
        try:
            caption = node["edge_media_to_caption"]["edges"][0]["node"]["text"]
        except (IndexError, KeyError):
            caption = MissingMappedField("")

        is_sidecar = node["__typename"] == "GraphSidecar"
        sidecar_edges = node["edge_sidecar_to_children"]["edges"] if is_sidecar else []
        num_media = len(sidecar_edges) if is_sidecar else 1

        # get media url
        # for carousels, get the first media item, for videos, get the video
        # url, for photos, get the last listed resource
        media_node = sidecar_edges[0]["node"] if is_sidecar else node

        if media_node["__typename"] == "GraphVideo":
            media_url = media_node["video_url"]
        elif media_node["__typename"] == "GraphImage":
            resources = media_node.get("display_resources", media_node.get("thumbnail_resources"))
            if resources:
                # index instead of pop(), so the item itself is not changed
                # and mapping it again gives the same result
                media_url = resources[-1]["src"]
            else:
                media_url = media_node.get("display_url", "")
        else:
            media_url = media_node["display_url"]

        # type, 'mixed' means carousel with video and photo
        type_map = {"GraphImage": "photo", "GraphSidecar": "photo", "GraphVideo": "video"}
        if not is_sidecar:
            media_type = type_map.get(node["__typename"], "unknown")
        else:
            media_types = {edge["node"]["__typename"] for edge in sidecar_edges}
            media_type = "mixed" if len(media_types) > 1 else type_map.get(media_types.pop(), "unknown")

        # location has 'id', 'has_public_page', 'name', and 'slug' keys in
        # tested examples; no lat long or "city"
        location = SearchInstagram._parse_location(node)

        no_likes = bool(node.get("like_and_view_counts_disabled"))

        # either may be missing or empty; fall back to an empty dictionary so
        # the lookups below do not fail
        user = node.get("user") or {}
        owner = node.get("owner") or {}
        SearchInstagram._check_user_matches_owner(user, owner)

        mapped_item = {
            # Post data
            "id": node["shortcode"],
            "post_source_domain": node.get("__import_meta", {}).get("source_platform_url"),  # Zeeschuimer metadata
            "timestamp": datetime.datetime.fromtimestamp(node["taken_at_timestamp"]).strftime("%Y-%m-%d %H:%M:%S"),
            "thread_id": node["shortcode"],
            "parent_id": node["shortcode"],
            "url": "https://www.instagram.com/p/" + node["shortcode"],
            "body": caption,

            # Author data
            "author": user.get("username", owner.get("username", MissingMappedField(""))),
            "author_fullname": user.get("full_name", owner.get("full_name", MissingMappedField(""))),
            "is_verified": True if user.get("is_verified") else False,
            "author_avatar_url": user.get("profile_pic_url", owner.get("profile_pic_url", MissingMappedField(""))),
            # Unable to find graph type posts to test
            "coauthors": MissingMappedField(""),
            "coauthor_fullnames": MissingMappedField(""),
            "coauthor_ids": MissingMappedField(""),

            # Media
            "media_type": media_type,
            "num_media": num_media,
            "image_urls": node["display_url"],
            "media_urls": media_url,

            # Engagement
            "hashtags": SearchInstagram._extract_hashtags(caption),
            # Unsure if usertags will work; need data (this could raise it to attention...)
            "usertags": ",".join(
                [u["node"]["user"]["username"] for u in node["edge_media_to_tagged_user"]["edges"]]),
            "likes_hidden": "yes" if no_likes else "no",
            "num_likes": node["edge_media_preview_like"]["count"] if not no_likes else MissingMappedField(0),
            "num_comments": node.get("edge_media_preview_comment", {}).get("count", 0),

            # Location data
            "location_name": location["name"],
            "location_id": location["location_id"],
            "location_latlong": location["latlong"],
            "location_city": location["city"],

            # Metadata
            "unix_timestamp": node["taken_at_timestamp"],
            "missing_media": None
        }

        return mapped_item

    @staticmethod
    def parse_itemlist_item(node):
        """
        Parse Instagram post in 'item list' format

        :param node:  Data as received from Instagram
        :return dict:  Mapped item
        :raises MapItemException:  If `user` and `owner` are different accounts
        """
        is_carousel = node["media_type"] == SearchInstagram.MEDIA_TYPE_CAROUSEL
        # a carousel has one media node per slide; other posts are their own
        # single media node
        media_nodes = node["carousel_media"] if is_carousel else [node]
        num_media = len(media_nodes)

        if "caption" not in node:
            caption = MissingMappedField("")
        elif not node["caption"]:
            caption = ""
        else:
            caption = node["caption"]["text"]

        # get media urls
        # for videos, get the video url and a still image if there is one, for
        # photos, get the first listed image candidate
        display_urls = []
        media_urls = []
        missing_media = None
        type_map = {SearchInstagram.MEDIA_TYPE_PHOTO: "photo", SearchInstagram.MEDIA_TYPE_VIDEO: "video"}
        media_types = set()

        for media_node in media_nodes:
            if media_node["media_type"] == SearchInstagram.MEDIA_TYPE_VIDEO:
                # Videos
                video_url = media_node["video_versions"][0]["url"]
                media_urls.append(video_url)
                if "image_versions2" in media_node:
                    display_urls.append(media_node["image_versions2"]["candidates"][0]["url"])
                else:
                    # no image links at all :-/
                    # video is all we have
                    display_urls.append(video_url)

            elif media_node["media_type"] == SearchInstagram.MEDIA_TYPE_PHOTO and media_node.get("image_versions2"):
                # Images
                media_url = media_node["image_versions2"]["candidates"][0]["url"]
                display_urls.append(media_url)
                media_urls.append(media_url)
            else:
                missing_media = MissingMappedField("")

            media_types.add(type_map.get(media_node["media_type"], "unknown"))

        # type, 'mixed' means carousel with video and photo
        if len(media_types) > 1:
            media_type = "mixed"
        elif media_types:
            media_type = media_types.pop()
        else:
            # carousel without any media nodes
            media_type = "unknown"

        if "comment_count" in node:
            num_comments = node["comment_count"]
        elif isinstance(node.get("comments"), list):
            num_comments = len(node["comments"])
        else:
            # -1 marks an unknown number of comments
            num_comments = -1

        location = SearchInstagram._parse_location(node)

        # either may be missing or empty; fall back to an empty dictionary so
        # the lookups below do not fail
        user = node.get("user") or {}
        owner = node.get("owner") or {}
        SearchInstagram._check_user_matches_owner(user, owner)

        # Instagram posts also allow 'Collabs' with up to one co-author
        coauthors = []
        coauthor_fullnames = []
        coauthor_ids = []
        for coauthor_node in node.get("coauthor_producers") or []:
            # values are joined into strings below, so missing values become
            # empty strings (keeping the three lists aligned) and IDs are cast
            # to strings
            coauthors.append(coauthor_node.get("username") or "")
            coauthor_fullnames.append(coauthor_node.get("full_name") or "")
            coauthor_id = coauthor_node.get("id")
            coauthor_ids.append("" if coauthor_id is None else str(coauthor_id))

        no_likes = bool(node.get("like_and_view_counts_disabled"))

        # usertags
        # Not always included; MissingMappedField may be more appropriate, but
        # it flags virtually all posts without tags (some do return `None`)
        if node.get("usertags"):
            usertags = ",".join([tag["user"]["username"] for tag in node["usertags"]["in"]])
        else:
            usertags = ""

        mapped_item = {
            # Post and caption
            "collected_from_url": normalize_url_encoding(node.get("__import_meta", {}).get("source_platform_url")),  # Zeeschuimer metadata
            "id": node["code"],
            "timestamp": datetime.datetime.fromtimestamp(node["taken_at"]).strftime("%Y-%m-%d %H:%M:%S"),
            "thread_id": node["code"],
            "parent_id": node["code"],
            "url": "https://www.instagram.com/p/" + node["code"],
            "body": caption,

            # Authors
            "author": user.get("username", owner.get("username", MissingMappedField(""))),
            "author_fullname": user.get("full_name", owner.get("full_name", MissingMappedField(""))),
            "verified": True if user.get("is_verified") else False,
            "author_avatar_url": user.get("profile_pic_url", owner.get("profile_pic_url", MissingMappedField(""))),
            "coauthors": ",".join(coauthors),
            "coauthor_fullnames": ",".join(coauthor_fullnames),
            "coauthor_ids": ",".join(coauthor_ids),

            # Media
            "media_type": media_type,
            "num_media": num_media,
            "image_urls": ",".join(display_urls),
            "media_urls": ",".join(media_urls),

            # Engagement
            "hashtags": SearchInstagram._extract_hashtags(caption),
            "usertags": usertags,
            "likes_hidden": "yes" if no_likes else "no",
            "num_likes": node["like_count"] if not no_likes else MissingMappedField(0),
            "num_comments": num_comments,

            # Location
            "location_name": location["name"],
            "location_id": location["location_id"],
            "location_latlong": location["latlong"],
            "location_city": location["city"],

            # Metadata
            "unix_timestamp": node["taken_at"],
            "missing_media": missing_media,  # This denotes media that is unable to be mapped and is otherwise None
        }

        return mapped_item
class SearchInstagram(Search):
    """
    Import scraped Instagram data

    Items are not collected by this class; it only maps items that were
    collected elsewhere to a flat dictionary. Two input formats are handled,
    each by its own parser: the 'graph' format (`parse_graph_item`) and the
    'item list' format (`parse_itemlist_item`).
    """
    type = "instagram-search"  # job ID
    category = "Search"  # category
    title = "Import scraped Instagram data"  # title displayed in UI
    description = "Import Instagram data collected with an external tool such as Zeeschuimer."  # description displayed in UI
    extension = "ndjson"  # extension of result file, used internally and in UI
    is_from_zeeschuimer = True

    # not available as a processor for existing datasets
    accepts = [None]
    references = [
        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok) (also covers usage with Instagram)"
    ]

    # some magic numbers instagram uses
    MEDIA_TYPE_PHOTO = 1
    MEDIA_TYPE_VIDEO = 2
    MEDIA_TYPE_CAROUSEL = 8

    # One hashtag pattern shared by both parsers. A tag ends at whitespace or
    # at any of the listed punctuation characters; both the ASCII caret and
    # the look-alike modifier circumflex are listed, as are typographic
    # single quotes.
    _HASHTAG_PATTERN = re.compile(r"#([^\s!@#$%^ˆ&*()_+{}:\"|<>?\[\];'\,./`~‘’]+)")

    def get_items(self, query):
        """
        Run custom search

        Not available for Instagram

        :param query:  Ignored
        :raises NotImplementedError:  Always
        """
        raise NotImplementedError("Instagram datasets can only be created by importing data from elsewhere")

    @staticmethod
    def map_item(item):
        """
        Map Instagram item

        Instagram importing is a little bit roundabout since we can expect
        input in two separate and not completely overlapping formats - an "edge
        list" or an "item list", and posts are structured differently between
        those, and do not contain the same data. So we find a middle ground
        here... each format has its own handler function

        :param dict item:  Item to map
        :return:  Mapped item
        :raises MapItemException:  If the item looks like an ad
        """
        link = item.get("link", "")
        if (item.get("product_type", "") == "ad") or \
                (link and link.startswith("https://www.facebook.com/ads/ig_redirect")):
            # These are ads
            raise MapItemException("appears to be Instagram ad, check raw data to confirm and ensure Zeeschuimer is up to date.")

        # items with a `__typename` are graph items, except for XDTMediaDict
        # items, which are handled as item list items
        is_graph_response = "__typename" in item and item["__typename"] not in ("XDTMediaDict",)

        if is_graph_response:
            return MappedItem(SearchInstagram.parse_graph_item(item))
        else:
            return MappedItem(SearchInstagram.parse_itemlist_item(item))

    @staticmethod
    def _extract_hashtags(caption):
        """
        Extract hashtags from a caption

        :param caption:  Caption text. Anything that is not a string (e.g. a
        `MissingMappedField` placeholder) is treated as containing no hashtags,
        since the regular expression can only be applied to a string.
        :return str:  Comma-separated hashtags, without the leading `#`
        """
        if not isinstance(caption, str):
            return ""

        return ",".join(SearchInstagram._HASHTAG_PATTERN.findall(caption))

    @staticmethod
    def _parse_location(node):
        """
        Extract location data from a post

        :param dict node:  Post data
        :return dict:  Dictionary with `name`, `latlong`, `city` and
        `location_id` keys; all empty strings if the post has no location
        """
        location = {"name": "", "latlong": "", "city": "", "location_id": ""}
        raw_location = node.get("location")
        if not raw_location:
            return location

        location["name"] = raw_location.get("name")
        # graph items have been seen with an 'id' rather than a 'pk' key
        location["location_id"] = raw_location.get("pk", raw_location.get("id"))
        if raw_location.get("lat"):
            location["latlong"] = str(raw_location["lat"]) + "," + str(raw_location["lng"])
        location["city"] = raw_location.get("city")

        return location

    @staticmethod
    def _check_user_matches_owner(user, owner):
        """
        Make sure a post's `user` and `owner` refer to the same account

        The check only applies if both are present. A matching ID is enough;
        owner may contain less info (e.g. no full name, username, etc.), so
        the username is only compared if the IDs differ.

        :param dict user:  Post `user` data, may be empty
        :param dict owner:  Post `owner` data, may be empty
        :raises MapItemException:  If both ID and username differ
        """
        if not user or not owner:
            return

        if owner.get("id") == user.get("id"):
            return

        if user.get("username") != owner.get("username"):
            raise MapItemException("Unable to parse item: different user and owner")

    @staticmethod
    def parse_graph_item(node):
        """
        Parse Instagram post in Graph format

        2025-6-5: potentially legacy format

        :param node:  Data as received from Instagram
        :return dict:  Mapped item
        :raises MapItemException:  If `user` and `owner` are different accounts
        """
        try:
            caption = node["edge_media_to_caption"]["edges"][0]["node"]["text"]
        except (IndexError, KeyError):
            caption = MissingMappedField("")

        is_sidecar = node["__typename"] == "GraphSidecar"
        sidecar_edges = node["edge_sidecar_to_children"]["edges"] if is_sidecar else []
        num_media = len(sidecar_edges) if is_sidecar else 1

        # get media url
        # for carousels, get the first media item, for videos, get the video
        # url, for photos, get the last listed resource
        media_node = sidecar_edges[0]["node"] if is_sidecar else node

        if media_node["__typename"] == "GraphVideo":
            media_url = media_node["video_url"]
        elif media_node["__typename"] == "GraphImage":
            resources = media_node.get("display_resources", media_node.get("thumbnail_resources"))
            if resources:
                # index instead of pop(), so the item itself is not changed
                # and mapping it again gives the same result
                media_url = resources[-1]["src"]
            else:
                media_url = media_node.get("display_url", "")
        else:
            media_url = media_node["display_url"]

        # type, 'mixed' means carousel with video and photo
        type_map = {"GraphImage": "photo", "GraphSidecar": "photo", "GraphVideo": "video"}
        if not is_sidecar:
            media_type = type_map.get(node["__typename"], "unknown")
        else:
            media_types = {edge["node"]["__typename"] for edge in sidecar_edges}
            media_type = "mixed" if len(media_types) > 1 else type_map.get(media_types.pop(), "unknown")

        # location has 'id', 'has_public_page', 'name', and 'slug' keys in
        # tested examples; no lat long or "city"
        location = SearchInstagram._parse_location(node)

        no_likes = bool(node.get("like_and_view_counts_disabled"))

        # either may be missing or empty; fall back to an empty dictionary so
        # the lookups below do not fail
        user = node.get("user") or {}
        owner = node.get("owner") or {}
        SearchInstagram._check_user_matches_owner(user, owner)

        mapped_item = {
            # Post data
            "id": node["shortcode"],
            "post_source_domain": node.get("__import_meta", {}).get("source_platform_url"),  # Zeeschuimer metadata
            "timestamp": datetime.datetime.fromtimestamp(node["taken_at_timestamp"]).strftime("%Y-%m-%d %H:%M:%S"),
            "thread_id": node["shortcode"],
            "parent_id": node["shortcode"],
            "url": "https://www.instagram.com/p/" + node["shortcode"],
            "body": caption,

            # Author data
            "author": user.get("username", owner.get("username", MissingMappedField(""))),
            "author_fullname": user.get("full_name", owner.get("full_name", MissingMappedField(""))),
            "is_verified": True if user.get("is_verified") else False,
            "author_avatar_url": user.get("profile_pic_url", owner.get("profile_pic_url", MissingMappedField(""))),
            # Unable to find graph type posts to test
            "coauthors": MissingMappedField(""),
            "coauthor_fullnames": MissingMappedField(""),
            "coauthor_ids": MissingMappedField(""),

            # Media
            "media_type": media_type,
            "num_media": num_media,
            "image_urls": node["display_url"],
            "media_urls": media_url,

            # Engagement
            "hashtags": SearchInstagram._extract_hashtags(caption),
            # Unsure if usertags will work; need data (this could raise it to attention...)
            "usertags": ",".join(
                [u["node"]["user"]["username"] for u in node["edge_media_to_tagged_user"]["edges"]]),
            "likes_hidden": "yes" if no_likes else "no",
            "num_likes": node["edge_media_preview_like"]["count"] if not no_likes else MissingMappedField(0),
            "num_comments": node.get("edge_media_preview_comment", {}).get("count", 0),

            # Location data
            "location_name": location["name"],
            "location_id": location["location_id"],
            "location_latlong": location["latlong"],
            "location_city": location["city"],

            # Metadata
            "unix_timestamp": node["taken_at_timestamp"],
            "missing_media": None
        }

        return mapped_item

    @staticmethod
    def parse_itemlist_item(node):
        """
        Parse Instagram post in 'item list' format

        :param node:  Data as received from Instagram
        :return dict:  Mapped item
        :raises MapItemException:  If `user` and `owner` are different accounts
        """
        is_carousel = node["media_type"] == SearchInstagram.MEDIA_TYPE_CAROUSEL
        # a carousel has one media node per slide; other posts are their own
        # single media node
        media_nodes = node["carousel_media"] if is_carousel else [node]
        num_media = len(media_nodes)

        if "caption" not in node:
            caption = MissingMappedField("")
        elif not node["caption"]:
            caption = ""
        else:
            caption = node["caption"]["text"]

        # get media urls
        # for videos, get the video url and a still image if there is one, for
        # photos, get the first listed image candidate
        display_urls = []
        media_urls = []
        missing_media = None
        type_map = {SearchInstagram.MEDIA_TYPE_PHOTO: "photo", SearchInstagram.MEDIA_TYPE_VIDEO: "video"}
        media_types = set()

        for media_node in media_nodes:
            if media_node["media_type"] == SearchInstagram.MEDIA_TYPE_VIDEO:
                # Videos
                video_url = media_node["video_versions"][0]["url"]
                media_urls.append(video_url)
                if "image_versions2" in media_node:
                    display_urls.append(media_node["image_versions2"]["candidates"][0]["url"])
                else:
                    # no image links at all :-/
                    # video is all we have
                    display_urls.append(video_url)

            elif media_node["media_type"] == SearchInstagram.MEDIA_TYPE_PHOTO and media_node.get("image_versions2"):
                # Images
                media_url = media_node["image_versions2"]["candidates"][0]["url"]
                display_urls.append(media_url)
                media_urls.append(media_url)
            else:
                missing_media = MissingMappedField("")

            media_types.add(type_map.get(media_node["media_type"], "unknown"))

        # type, 'mixed' means carousel with video and photo
        if len(media_types) > 1:
            media_type = "mixed"
        elif media_types:
            media_type = media_types.pop()
        else:
            # carousel without any media nodes
            media_type = "unknown"

        if "comment_count" in node:
            num_comments = node["comment_count"]
        elif isinstance(node.get("comments"), list):
            num_comments = len(node["comments"])
        else:
            # -1 marks an unknown number of comments
            num_comments = -1

        location = SearchInstagram._parse_location(node)

        # either may be missing or empty; fall back to an empty dictionary so
        # the lookups below do not fail
        user = node.get("user") or {}
        owner = node.get("owner") or {}
        SearchInstagram._check_user_matches_owner(user, owner)

        # Instagram posts also allow 'Collabs' with up to one co-author
        coauthors = []
        coauthor_fullnames = []
        coauthor_ids = []
        for coauthor_node in node.get("coauthor_producers") or []:
            # values are joined into strings below, so missing values become
            # empty strings (keeping the three lists aligned) and IDs are cast
            # to strings
            coauthors.append(coauthor_node.get("username") or "")
            coauthor_fullnames.append(coauthor_node.get("full_name") or "")
            coauthor_id = coauthor_node.get("id")
            coauthor_ids.append("" if coauthor_id is None else str(coauthor_id))

        no_likes = bool(node.get("like_and_view_counts_disabled"))

        # usertags
        # Not always included; MissingMappedField may be more appropriate, but
        # it flags virtually all posts without tags (some do return `None`)
        if node.get("usertags"):
            usertags = ",".join([tag["user"]["username"] for tag in node["usertags"]["in"]])
        else:
            usertags = ""

        mapped_item = {
            # Post and caption
            "collected_from_url": normalize_url_encoding(node.get("__import_meta", {}).get("source_platform_url")),  # Zeeschuimer metadata
            "id": node["code"],
            "timestamp": datetime.datetime.fromtimestamp(node["taken_at"]).strftime("%Y-%m-%d %H:%M:%S"),
            "thread_id": node["code"],
            "parent_id": node["code"],
            "url": "https://www.instagram.com/p/" + node["code"],
            "body": caption,

            # Authors
            "author": user.get("username", owner.get("username", MissingMappedField(""))),
            "author_fullname": user.get("full_name", owner.get("full_name", MissingMappedField(""))),
            "verified": True if user.get("is_verified") else False,
            "author_avatar_url": user.get("profile_pic_url", owner.get("profile_pic_url", MissingMappedField(""))),
            "coauthors": ",".join(coauthors),
            "coauthor_fullnames": ",".join(coauthor_fullnames),
            "coauthor_ids": ",".join(coauthor_ids),

            # Media
            "media_type": media_type,
            "num_media": num_media,
            "image_urls": ",".join(display_urls),
            "media_urls": ",".join(media_urls),

            # Engagement
            "hashtags": SearchInstagram._extract_hashtags(caption),
            "usertags": usertags,
            "likes_hidden": "yes" if no_likes else "no",
            "num_likes": node["like_count"] if not no_likes else MissingMappedField(0),
            "num_comments": num_comments,

            # Location
            "location_name": location["name"],
            "location_id": location["location_id"],
            "location_latlong": location["latlong"],
            "location_city": location["city"],

            # Metadata
            "unix_timestamp": node["taken_at"],
            "missing_media": missing_media,  # This denotes media that is unable to be mapped and is otherwise None
        }

        return mapped_item
Import scraped Instagram data
references =
['[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)', '[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok) (also covers usage with Instagram)']
def
get_items(self, query):
def get_items(self, query):
    """
    Run custom search

    Instagram data cannot be collected from within 4CAT, so this always
    fails; datasets are created by importing data instead.

    :param query:  Ignored
    :raises NotImplementedError:  Always
    """
    message = "Instagram datasets can only be created by importing data from elsewhere"
    raise NotImplementedError(message)
Run custom search
Not available for Instagram
@staticmethod
def
map_item(item):
@staticmethod
def map_item(item):
    """
    Map Instagram item

    Items arrive in one of two formats that only partially overlap: a
    'graph' format and an 'item list' format. Ads are rejected first; the
    item is then passed on to the parser for its format and the result is
    wrapped in a MappedItem.

    :param dict item:  Item to map
    :return:  Mapped item
    :raises MapItemException:  If the item looks like an ad
    """
    ad_link_prefix = "https://www.facebook.com/ads/ig_redirect"
    link = item.get("link", "")

    # an item is an ad if it says so itself, or if it links to the ad
    # redirection URL
    looks_like_ad = item.get("product_type", "") == "ad"
    if not looks_like_ad and link:
        looks_like_ad = link.startswith(ad_link_prefix)

    if looks_like_ad:
        raise MapItemException("appears to be Instagram ad, check raw data to confirm and ensure Zeeschuimer is up to date.")

    # items without a `__typename`, and XDTMediaDict items, are item list
    # items; everything else is a graph item
    if "__typename" not in item or item["__typename"] in ("XDTMediaDict",):
        return MappedItem(SearchInstagram.parse_itemlist_item(item))

    return MappedItem(SearchInstagram.parse_graph_item(item))
Map Instagram item
Instagram importing is a little bit roundabout since we can expect input in two separate and not completely overlapping formats - an "edge list" or an "item list", and posts are structured differently between those, and do not contain the same data. So we find a middle ground here... each format has its own handler function
Parameters
- dict item: Item to map
Returns
Mapped item
@staticmethod
def
parse_graph_item(node):
@staticmethod
def parse_graph_item(node):
    """
    Parse Instagram post in Graph format

    2025-6-5: potentially legacy format

    The input node is not modified.

    :param dict node:  Data as received from Instagram
    :return dict:  Mapped item
    :raises MapItemException:  If both `user` and `owner` are present but
    refer to different accounts
    """
    try:
        caption = node["edge_media_to_caption"]["edges"][0]["node"]["text"]
    except IndexError:
        caption = MissingMappedField("")

    # the hashtag regex needs an actual string; a missing caption is
    # represented by a placeholder object, which simply has no hashtags
    caption_text = caption if isinstance(caption, str) else ""

    is_sidecar = node["__typename"] == "GraphSidecar"
    sidecar_edges = node["edge_sidecar_to_children"]["edges"] if is_sidecar else []
    num_media = len(sidecar_edges) if is_sidecar else 1

    # get media url
    # for carousels, get the first media item, for videos, get the video
    # url, for photos, get the highest resolution
    media_node = sidecar_edges[0]["node"] if is_sidecar else node

    if media_node["__typename"] == "GraphVideo":
        media_url = media_node["video_url"]
    elif media_node["__typename"] == "GraphImage":
        resources = media_node.get("display_resources", media_node.get("thumbnail_resources"))
        try:
            # read the last entry without pop(), so the caller's item is
            # left untouched
            media_url = resources[-1]["src"]
        except (TypeError, IndexError, KeyError):
            # no (usable) resource list; fall back to the display url
            media_url = media_node.get("display_url", "")
    else:
        media_url = media_node["display_url"]

    # type, 'mixed' means carousel with video and photo
    # "GraphImage" is what photo posts and photo carousel children are
    # typed as; "GraphSidecar" is kept for backward compatibility
    type_map = {"GraphImage": "photo", "GraphSidecar": "photo", "GraphVideo": "video"}
    if not is_sidecar:
        media_type = type_map.get(node["__typename"], "unknown")
    else:
        media_types = {edge["node"]["__typename"] for edge in sidecar_edges}
        media_type = "mixed" if len(media_types) > 1 else type_map.get(media_types.pop(), "unknown")

    location = {"name": "", "latlong": "", "city": "", "location_id": ""}
    # location has 'id', 'has_public_page', 'name', and 'slug' keys in tested examples; no lat long or "city"
    location_node = node.get("location")
    if location_node:
        location["name"] = location_node.get("name")
        # prefer 'pk' (as in the item list format), fall back to 'id'
        location_pk = location_node.get("pk")
        location["location_id"] = location_pk if location_pk is not None else location_node.get("id")
        # Leaving this though it does not appear to be used in this type; maybe we'll be surprised in the future...
        if location_node.get("lat"):
            location["latlong"] = str(location_node["lat"]) + "," + str(location_node["lng"])
        location["city"] = location_node.get("city")

    no_likes = bool(node.get("like_and_view_counts_disabled"))

    # either of these may be absent (or null); default to empty dicts so the
    # lookups below fall through to the other one instead of crashing
    user = node.get("user") or {}
    owner = node.get("owner") or {}
    if user and owner:
        if owner.get("id") == user.get("id"):
            # Same id; owner may contain less info (e.g. no full name, username, etc.), so prefer user
            pass
        elif user.get("username") != owner.get("username"):
            raise MapItemException("Unable to parse item: different user and owner")

    mapped_item = {
        # Post data
        "id": node["shortcode"],
        "post_source_domain": node.get("__import_meta", {}).get("source_platform_url"),  # Zeeschuimer metadata
        "timestamp": datetime.datetime.fromtimestamp(node["taken_at_timestamp"]).strftime("%Y-%m-%d %H:%M:%S"),
        "thread_id": node["shortcode"],
        "parent_id": node["shortcode"],
        "url": "https://www.instagram.com/p/" + node["shortcode"],
        "body": caption,

        # Author data
        "author": user.get("username", owner.get("username", MissingMappedField(""))),
        "author_fullname": user.get("full_name", owner.get("full_name", MissingMappedField(""))),
        "is_verified": bool(user.get("is_verified")),
        "author_avatar_url": user.get("profile_pic_url", owner.get("profile_pic_url", MissingMappedField(""))),
        # Unable to find graph type posts to test
        "coauthors": MissingMappedField(""),
        "coauthor_fullnames": MissingMappedField(""),
        "coauthor_ids": MissingMappedField(""),

        # Media
        "media_type": media_type,
        "num_media": num_media,
        "image_urls": node["display_url"],
        "media_urls": media_url,

        # Engagement
        "hashtags": ",".join(re.findall(r"#([^\s!@#$%^&*()_+{}:\"|<>?\[\];'\,./`~]+)", caption_text)),
        # Unsure if usertags will work; need data (this could raise it to attention...)
        "usertags": ",".join(
            [u["node"]["user"]["username"] for u in node["edge_media_to_tagged_user"]["edges"]]),
        "likes_hidden": "yes" if no_likes else "no",
        "num_likes": node["edge_media_preview_like"]["count"] if not no_likes else MissingMappedField(0),
        "num_comments": node.get("edge_media_preview_comment", {}).get("count", 0),

        # Location data
        "location_name": location["name"],
        "location_id": location["location_id"],
        "location_latlong": location["latlong"],
        "location_city": location["city"],

        # Metadata
        "unix_timestamp": node["taken_at_timestamp"],
        "missing_media": None
    }

    return mapped_item
Parse Instagram post in Graph format
2025-6-5: potentially legacy format
Parameters
- node: Data as received from Instagram
Returns
Mapped item
@staticmethod
def
parse_itemlist_item(node):
@staticmethod
def parse_itemlist_item(node):
    """
    Parse Instagram post in 'item list' format

    :param dict node:  Data as received from Instagram
    :return dict:  Mapped item
    :raises MapItemException:  If both `user` and `owner` are present but
    refer to different accounts
    """
    is_carousel = node["media_type"] == SearchInstagram.MEDIA_TYPE_CAROUSEL
    # for carousels, look at every media item; otherwise the post itself
    # is the only media item
    media_nodes = node["carousel_media"] if is_carousel else [node]
    num_media = len(media_nodes) if is_carousel else 1

    if "caption" not in node:
        caption = MissingMappedField("")
    elif not node.get("caption"):
        caption = ""
    else:
        caption = node["caption"]["text"]

    # the hashtag regex needs an actual string; a missing caption is
    # represented by a placeholder object, which simply has no hashtags
    caption_text = caption if isinstance(caption, str) else ""

    # get media urls
    display_urls = []
    media_urls = []
    missing_media = None
    type_map = {SearchInstagram.MEDIA_TYPE_PHOTO: "photo", SearchInstagram.MEDIA_TYPE_VIDEO: "video"}
    media_types = set()

    # for videos, get the video url, for photos, get the first candidate
    for media_node in media_nodes:
        if media_node["media_type"] == SearchInstagram.MEDIA_TYPE_VIDEO:
            # Videos
            media_urls.append(media_node["video_versions"][0]["url"])
            if "image_versions2" in media_node:
                display_urls.append(media_node["image_versions2"]["candidates"][0]["url"])
            else:
                # no image links at all :-/
                # video is all we have
                display_urls.append(media_node["video_versions"][0]["url"])

        elif media_node["media_type"] == SearchInstagram.MEDIA_TYPE_PHOTO and media_node.get("image_versions2"):
            # Images
            media_url = media_node["image_versions2"]["candidates"][0]["url"]
            display_urls.append(media_url)
            media_urls.append(media_url)
        else:
            missing_media = MissingMappedField("")

        media_types.add(type_map.get(media_node["media_type"], "unknown"))

    # type, 'mixed' means carousel with video and photo
    if len(media_types) > 1:
        media_type = "mixed"
    elif media_types:
        media_type = media_types.pop()
    else:
        # carousel without any media items
        media_type = "unknown"

    if "comment_count" in node:
        num_comments = node["comment_count"]
    elif isinstance(node.get("comments"), list):
        num_comments = len(node["comments"])
    else:
        num_comments = -1

    location = {"name": "", "latlong": "", "city": "", "location_id": ""}
    location_node = node.get("location")
    if location_node:
        location["name"] = location_node.get("name")
        location["location_id"] = location_node.get("pk")
        if location_node.get("lat"):
            location["latlong"] = str(location_node["lat"]) + "," + str(location_node["lng"])
        location["city"] = location_node.get("city")

    # the keys may be present with a null value; normalise to empty dicts so
    # the lookups below never hit None
    user = node.get("user") or {}
    owner = node.get("owner") or {}
    if user and owner:
        if owner.get("id") == user.get("id"):
            # Same id; owner may contain less info (e.g. no full name, username, etc.), so prefer user
            pass
        elif user.get("username") != owner.get("username"):
            raise MapItemException("Unable to parse item: different user and owner")

    def as_text(value):
        # str.join() only accepts strings; ids may be numeric and any of
        # these fields may be absent
        return "" if value is None else str(value)

    # Instagram posts also allow 'Collabs' with up to one co-author
    coauthor_nodes = node.get("coauthor_producers") or []
    coauthors = [as_text(coauthor.get("username")) for coauthor in coauthor_nodes]
    coauthor_fullnames = [as_text(coauthor.get("full_name")) for coauthor in coauthor_nodes]
    coauthor_ids = [as_text(coauthor.get("id")) for coauthor in coauthor_nodes]

    no_likes = bool(node.get("like_and_view_counts_disabled"))

    # usertags
    if "usertags" in node:
        usertags = ",".join([tag["user"]["username"] for tag in node["usertags"]["in"]]) if node["usertags"] else ""
    else:
        # Not always included; MissingMappedField may be more appropriate, but it flags virtually all posts without tags (some do return `None`)
        usertags = ""

    mapped_item = {
        # Post and caption
        "collected_from_url": normalize_url_encoding(node.get("__import_meta", {}).get("source_platform_url")),  # Zeeschuimer metadata
        "id": node["code"],
        "timestamp": datetime.datetime.fromtimestamp(node["taken_at"]).strftime("%Y-%m-%d %H:%M:%S"),
        "thread_id": node["code"],
        "parent_id": node["code"],
        "url": "https://www.instagram.com/p/" + node["code"],
        "body": caption,

        # Authors
        "author": user.get("username", owner.get("username", MissingMappedField(""))),
        "author_fullname": user.get("full_name", owner.get("full_name", MissingMappedField(""))),
        "verified": bool(user.get("is_verified")),
        "author_avatar_url": user.get("profile_pic_url", owner.get("profile_pic_url", MissingMappedField(""))),
        "coauthors": ",".join(coauthors),
        "coauthor_fullnames": ",".join(coauthor_fullnames),
        "coauthor_ids": ",".join(coauthor_ids),

        # Media
        "media_type": media_type,
        "num_media": num_media,
        "image_urls": ",".join(display_urls),
        "media_urls": ",".join(media_urls),

        # Engagement
        # NOTE(review): this pattern excludes "ˆ" (U+02C6) where the graph
        # parser excludes "^" - possibly a typo, left unchanged; confirm
        "hashtags": ",".join(re.findall(r"#([^\s!@#$%ˆ&*()_+{}:\"|<>?\[\];'\,./`~'‘’]+)", caption_text)),
        "usertags": usertags,
        "likes_hidden": "yes" if no_likes else "no",
        "num_likes": node["like_count"] if not no_likes else MissingMappedField(0),
        "num_comments": num_comments,

        # Location
        "location_name": location["name"],
        "location_id": location["location_id"],
        "location_latlong": location["latlong"],
        "location_city": location["city"],

        # Metadata
        "unix_timestamp": node["taken_at"],
        "missing_media": missing_media,  # This denotes media that is unable to be mapped and is otherwise None
    }

    return mapped_item
Parse Instagram post in 'item list' format
Parameters
- node: Data as received from Instagram
Returns
Mapped item
Inherited Members
- backend.lib.worker.BasicWorker
- BasicWorker
- INTERRUPT_NONE
- INTERRUPT_RETRY
- INTERRUPT_CANCEL
- queue
- log
- manager
- interrupted
- modules
- init_time
- name
- run
- clean_up
- request_interrupt
- run_interruptable_process
- get_queue_id
- is_4cat_class
- backend.lib.search.Search
- max_workers
- prefix
- return_cols
- import_error_count
- import_warning_count
- process
- search
- import_from_file
- items_to_csv
- items_to_ndjson
- items_to_archive
- backend.lib.processor.BasicProcessor
- db
- job
- dataset
- owner
- source_dataset
- source_file
- config
- is_running_in_preset
- filepath
- for_cleanup
- work
- after_process
- clean_up_on_error
- abort
- iterate_proxied_requests
- push_proxied_request
- flush_proxied_requests
- unpack_archive_contents
- extract_archived_file_by_name
- write_csv_items_and_finish
- write_archive_and_finish
- create_standalone
- save_annotations
- map_item_method_available
- get_mapped_item
- is_filter
- get_options
- get_status
- is_top_dataset
- is_from_collector
- get_extension
- is_rankable
- exclude_followup_processors
- is_4cat_processor