Edit on GitHub

datasources.instagram.search_instagram

Import scraped Instagram data

It's prohibitively difficult to scrape data from Instagram within 4CAT itself due to its aggressive rate limiting. Instead, import data collected elsewhere.

  1"""
  2Import scraped Instagram data
  3
  4It's prohibitively difficult to scrape data from Instagram within 4CAT itself
  5due to its aggressive rate limiting. Instead, import data collected elsewhere.
  6"""
  7import datetime
  8import re
  9
 10from backend.lib.search import Search
 11from common.lib.item_mapping import MappedItem, MissingMappedField
 12from common.lib.exceptions import MapItemException
 13from common.lib.helpers import normalize_url_encoding
 14
 15
 16class SearchInstagram(Search):
 17    """
 18    Import scraped Instagram data
 19    """
 20    type = "instagram-search"  # job ID
 21    category = "Search"  # category
 22    title = "Import scraped Instagram data"  # title displayed in UI
 23    description = "Import Instagram data collected with an external tool such as Zeeschuimer."  # description displayed in UI
 24    extension = "ndjson"  # extension of result file, used internally and in UI
 25    is_from_zeeschuimer = True
 26
 27    # not available as a processor for existing datasets
 28    accepts = [None]
 29    references = [
 30        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
 31        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok) (also covers usage with Instagram)"
 32    ]
 33
 34    # some magic numbers instagram uses
 35    MEDIA_TYPE_PHOTO = 1
 36    MEDIA_TYPE_VIDEO = 2
 37    MEDIA_TYPE_CAROUSEL = 8
 38
 39    def get_items(self, query):
 40        """
 41        Run custom search
 42
 43        Not available for Instagram
 44        """
 45        raise NotImplementedError("Instagram datasets can only be created by importing data from elsewhere")
 46
 47
 48    @staticmethod
 49    def map_item(item):
 50        """
 51        Map Instagram item
 52
 53        Instagram importing is a little bit roundabout since we can expect
 54        input in two separate and not completely overlapping formats - an "edge
 55        list" or an "item list", and posts are structured differently between
 56        those, and do not contain the same data. So we find a middle ground
 57        here... each format has its own handler function
 58
 59        :param dict item:  Item to map
 60        :return:  Mapped item
 61        """
 62        link = item.get("link", "")
 63        if (item.get("product_type", "") == "ad") or \
 64                (link and link.startswith("https://www.facebook.com/ads/ig_redirect")):
 65            # These are ads
 66            raise MapItemException("appears to be Instagram ad, check raw data to confirm and ensure Zeeschuimer is up to date.")
 67
 68        is_graph_response = "__typename" in item and item["__typename"] not in ("XDTMediaDict",)
 69
 70        if is_graph_response:
 71            return MappedItem(SearchInstagram.parse_graph_item(item))
 72        else:
 73            return MappedItem(SearchInstagram.parse_itemlist_item(item))
 74
 75    @staticmethod
 76    def parse_graph_item(node):
 77        """
 78        Parse Instagram post in Graph format
 79
 80        2025-6-5: potentially legacy format
 81
 82        :param node:  Data as received from Instagram
 83        :return dict:  Mapped item
 84        """
 85        try:
 86            caption = node["edge_media_to_caption"]["edges"][0]["node"]["text"]
 87        except IndexError:
 88            caption = MissingMappedField("")
 89
 90        num_media = 1 if node["__typename"] != "GraphSidecar" else len(node["edge_sidecar_to_children"]["edges"])
 91
 92        # get media url
 93        # for carousels, get the first media item, for videos, get the video
 94        # url, for photos, get the highest resolution
 95        if node["__typename"] == "GraphSidecar":
 96            media_node = node["edge_sidecar_to_children"]["edges"][0]["node"]
 97        else:
 98            media_node = node
 99
100        if media_node["__typename"] == "GraphVideo":
101            media_url = media_node["video_url"]
102        elif media_node["__typename"] == "GraphImage":
103            resources = media_node.get("display_resources", media_node.get("thumbnail_resources"))
104            try:
105                media_url = resources.pop()["src"]
106            except AttributeError:
107                media_url = media_node.get("display_url", "")
108        else:
109            media_url = media_node["display_url"]
110
111        # type, 'mixed' means carousel with video and photo
112        type_map = {"GraphSidecar": "photo", "GraphVideo": "video"}
113        if node["__typename"] != "GraphSidecar":
114            media_type = type_map.get(node["__typename"], "unknown")
115        else:
116            media_types = set([s["node"]["__typename"] for s in node["edge_sidecar_to_children"]["edges"]])
117            media_type = "mixed" if len(media_types) > 1 else type_map.get(media_types.pop(), "unknown")
118
119        location = {"name": "", "latlong": "", "city": "", "location_id": ""}
120        # location has 'id', 'has_public_page', 'name', and 'slug' keys in tested examples; no lat long or "city" though name seems
121        if node.get("location"):
122            location["name"] = node["location"].get("name")
123            location["location_id"] = node["location"].get("pk")
124            # Leaving this though it does not appear to be used in this type; maybe we'll be surprised in the future...
125            location["latlong"] = str(node["location"]["lat"]) + "," + str(node["location"]["lng"]) if node[
126                "location"].get("lat") else ""
127            location["city"] = node["location"].get("city")
128
129        no_likes = bool(node.get("like_and_view_counts_disabled"))
130
131        user = node.get("user")
132        owner = node.get("owner")
133        if node.get("user") and node.get("owner"):
134            if owner.get("id") == user.get("id"):
135                # Same id; owner may contain less info (e.g. no full name, username, etc.), so prefer user
136                pass
137            elif user.get("username") != owner.get("username"):
138                raise MapItemException("Unable to parse item: different user and owner")
139
140        mapped_item = {
141            # Post data
142            "id": node["shortcode"],
143            "post_source_domain": node.get("__import_meta", {}).get("source_platform_url"),  # Zeeschuimer metadata
144            "timestamp": datetime.datetime.fromtimestamp(node["taken_at_timestamp"]).strftime("%Y-%m-%d %H:%M:%S"),
145            "thread_id": node["shortcode"],
146            "parent_id": node["shortcode"],
147            "url": "https://www.instagram.com/p/" + node["shortcode"],
148            "body": caption,
149
150
151            # Author data
152            "author": user.get("username", owner.get("username", MissingMappedField(""))),
153            "author_fullname": user.get("full_name", owner.get("full_name", MissingMappedField(""))),
154            "is_verified": True if user.get("is_verified") else False,
155            "author_avatar_url": user.get("profile_pic_url", owner.get("profile_pic_url", MissingMappedField(""))),
156            # Unable to find graph type posts to test
157            "coauthors": MissingMappedField(""),
158            "coauthor_fullnames": MissingMappedField(""),
159            "coauthor_ids": MissingMappedField(""),
160
161            # Media
162            "media_type": media_type,
163            "num_media": num_media,
164            "image_urls": node["display_url"],
165            "media_urls": media_url,
166
167            # Engagement
168            "hashtags": ",".join(re.findall(r"#([^\s!@#$%^&*()_+{}:\"|<>?\[\];'\,./`~]+)", caption)),
169            # Unsure if usertags will work; need data (this could raise it to attention...)
170            "usertags": ",".join(
171                [u["node"]["user"]["username"] for u in node["edge_media_to_tagged_user"]["edges"]]),
172            "likes_hidden": "yes" if no_likes else "no",
173            "num_likes": node["edge_media_preview_like"]["count"] if not no_likes else MissingMappedField(0),
174            "num_comments": node.get("edge_media_preview_comment", {}).get("count", 0),
175
176            # Location data
177            "location_name": location["name"],
178            "location_id": location["location_id"],
179            "location_latlong": location["latlong"],
180            "location_city": location["city"],
181
182            # Metadata
183            "unix_timestamp": node["taken_at_timestamp"],
184            "missing_media": None
185        }
186
187        return mapped_item
188
189    @staticmethod
190    def parse_itemlist_item(node):
191        """
192        Parse Instagram post in 'item list' format
193
194        :param node:  Data as received from Instagram
195        :return dict:  Mapped item
196        """
197        num_media = 1 if node["media_type"] != SearchInstagram.MEDIA_TYPE_CAROUSEL else len(node["carousel_media"])
198        caption = MissingMappedField("") if "caption" not in node else "" if not node.get("caption") else node["caption"]["text"]
199
200        # get media urls
201        display_urls = []
202        media_urls = []
203        missing_media = None
204        type_map = {SearchInstagram.MEDIA_TYPE_PHOTO: "photo", SearchInstagram.MEDIA_TYPE_VIDEO: "video"}
205        media_types = set()
206        # for carousels, get the first media item, for videos, get the video
207        # url, for photos, get the highest resolution
208        if node["media_type"] == SearchInstagram.MEDIA_TYPE_CAROUSEL:
209            media_nodes = node["carousel_media"]
210        else:
211            media_nodes = [node]
212
213        for media_node in media_nodes:
214            if media_node["media_type"] == SearchInstagram.MEDIA_TYPE_VIDEO:
215                # Videos
216                media_urls.append(media_node["video_versions"][0]["url"])
217                if "image_versions2" in media_node:
218                    display_urls.append(media_node["image_versions2"]["candidates"][0]["url"])
219                else:
220                    # no image links at all :-/
221                    # video is all we have
222                    display_urls.append(media_node["video_versions"][0]["url"])
223
224            elif media_node["media_type"] == SearchInstagram.MEDIA_TYPE_PHOTO and media_node.get("image_versions2"):
225                # Images
226                media_url = media_node["image_versions2"]["candidates"][0]["url"]
227                display_urls.append(media_url)
228                media_urls.append(media_url)
229            else:
230                missing_media = MissingMappedField("")
231
232            media_types.add(type_map.get(media_node["media_type"], "unknown"))
233
234        # type, 'mixed' means carousel with video and photo
235        media_type = "mixed" if len(media_types) > 1 else media_types.pop()
236
237        if "comment_count" in node:
238            num_comments = node["comment_count"]
239        elif "comments" in node and type(node["comments"]) is list:
240            num_comments = len(node["comments"])
241        else:
242            num_comments = -1
243
244        location = {"name": "", "latlong": "", "city": "", "location_id": ""}
245        if node.get("location"):
246            location["name"] = node["location"].get("name")
247            location["location_id"] = node["location"].get("pk")
248            location["latlong"] = str(node["location"]["lat"]) + "," + str(node["location"]["lng"]) if node[
249                "location"].get("lat") else ""
250            location["city"] = node["location"].get("city")
251
252        user = node.get("user", {})
253        owner = node.get("owner", {})
254        if user and owner:
255            if owner.get("id") == user.get("id"):
256                # Same id; owner may contain less info (e.g. no full name, username, etc.), so prefer user
257                pass
258            elif user.get("username") != owner.get("username"):
259                raise MapItemException("Unable to parse item: different user and owner")
260
261        # Instagram posts also allow 'Collabs' with up to one co-author
262        coauthors = []
263        coauthor_fullnames = []
264        coauthor_ids = []
265        if node.get("coauthor_producers"):
266            for coauthor_node in node["coauthor_producers"]:
267                coauthors.append(coauthor_node.get("username"))
268                coauthor_fullnames.append(coauthor_node.get("full_name"))
269                coauthor_ids.append(coauthor_node.get("id"))
270
271        no_likes = bool(node.get("like_and_view_counts_disabled"))
272
273        # usertags
274        if "usertags" in node:
275            usertags = ",".join([user["user"]["username"] for user in node["usertags"]["in"]]) if node["usertags"] else ""
276        else:
277            # Not always included; MissingMappedField may be more appropriate, but it flags virtually all posts without tags (some do return `None`)
278            usertags = ""
279
280        mapped_item = {
281            # Post and caption
282            "collected_from_url": normalize_url_encoding(node.get("__import_meta", {}).get("source_platform_url")),  # Zeeschuimer metadata
283            "id": node["code"],
284            "timestamp": datetime.datetime.fromtimestamp(node["taken_at"]).strftime("%Y-%m-%d %H:%M:%S"),
285            "thread_id": node["code"],
286            "parent_id": node["code"],
287            "url": "https://www.instagram.com/p/" + node["code"],
288            "body": caption,
289
290            # Authors
291            "author": user.get("username", owner.get("username", MissingMappedField(""))),
292            "author_fullname": user.get("full_name", owner.get("full_name", MissingMappedField(""))),
293            "verified": True if user.get("is_verified") else False,
294            "author_avatar_url": user.get("profile_pic_url", owner.get("profile_pic_url", MissingMappedField(""))),
295            "coauthors": ",".join(coauthors),
296            "coauthor_fullnames": ",".join(coauthor_fullnames),
297            "coauthor_ids": ",".join(coauthor_ids),
298
299            # Media
300            "media_type": media_type,
301            "num_media": num_media,
302            "image_urls": ",".join(display_urls),
303            "media_urls": ",".join(media_urls),
304
305            # Engagement
306            "hashtags": ",".join(re.findall(r"#([^\s!@#$%ˆ&*()_+{}:\"|<>?\[\];'\,./`~'‘’]+)", caption)),
307            "usertags": usertags,
308            "likes_hidden": "yes" if no_likes else "no",
309            "num_likes": node["like_count"] if not no_likes else MissingMappedField(0),
310            "num_comments": num_comments,
311
312            # Location
313            "location_name": location["name"],
314            "location_id": location["location_id"],
315            "location_latlong": location["latlong"],
316            "location_city": location["city"],
317
318            # Metadata
319            "unix_timestamp": node["taken_at"],
320            "missing_media": missing_media, # This denotes media that is unable to be mapped and is otherwise None
321        }
322
323        return mapped_item
class SearchInstagram(backend.lib.search.Search):
 17class SearchInstagram(Search):
 18    """
 19    Import scraped Instagram data
 20    """
 21    type = "instagram-search"  # job ID
 22    category = "Search"  # category
 23    title = "Import scraped Instagram data"  # title displayed in UI
 24    description = "Import Instagram data collected with an external tool such as Zeeschuimer."  # description displayed in UI
 25    extension = "ndjson"  # extension of result file, used internally and in UI
 26    is_from_zeeschuimer = True
 27
 28    # not available as a processor for existing datasets
 29    accepts = [None]
 30    references = [
 31        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
 32        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok) (also covers usage with Instagram)"
 33    ]
 34
 35    # some magic numbers instagram uses
 36    MEDIA_TYPE_PHOTO = 1
 37    MEDIA_TYPE_VIDEO = 2
 38    MEDIA_TYPE_CAROUSEL = 8
 39
 40    def get_items(self, query):
 41        """
 42        Run custom search
 43
 44        Not available for Instagram
 45        """
 46        raise NotImplementedError("Instagram datasets can only be created by importing data from elsewhere")
 47
 48
 49    @staticmethod
 50    def map_item(item):
 51        """
 52        Map Instagram item
 53
 54        Instagram importing is a little bit roundabout since we can expect
 55        input in two separate and not completely overlapping formats - an "edge
 56        list" or an "item list", and posts are structured differently between
 57        those, and do not contain the same data. So we find a middle ground
 58        here... each format has its own handler function
 59
 60        :param dict item:  Item to map
 61        :return:  Mapped item
 62        """
 63        link = item.get("link", "")
 64        if (item.get("product_type", "") == "ad") or \
 65                (link and link.startswith("https://www.facebook.com/ads/ig_redirect")):
 66            # These are ads
 67            raise MapItemException("appears to be Instagram ad, check raw data to confirm and ensure Zeeschuimer is up to date.")
 68
 69        is_graph_response = "__typename" in item and item["__typename"] not in ("XDTMediaDict",)
 70
 71        if is_graph_response:
 72            return MappedItem(SearchInstagram.parse_graph_item(item))
 73        else:
 74            return MappedItem(SearchInstagram.parse_itemlist_item(item))
 75
 76    @staticmethod
 77    def parse_graph_item(node):
 78        """
 79        Parse Instagram post in Graph format
 80
 81        2025-6-5: potentially legacy format
 82
 83        :param node:  Data as received from Instagram
 84        :return dict:  Mapped item
 85        """
 86        try:
 87            caption = node["edge_media_to_caption"]["edges"][0]["node"]["text"]
 88        except IndexError:
 89            caption = MissingMappedField("")
 90
 91        num_media = 1 if node["__typename"] != "GraphSidecar" else len(node["edge_sidecar_to_children"]["edges"])
 92
 93        # get media url
 94        # for carousels, get the first media item, for videos, get the video
 95        # url, for photos, get the highest resolution
 96        if node["__typename"] == "GraphSidecar":
 97            media_node = node["edge_sidecar_to_children"]["edges"][0]["node"]
 98        else:
 99            media_node = node
100
101        if media_node["__typename"] == "GraphVideo":
102            media_url = media_node["video_url"]
103        elif media_node["__typename"] == "GraphImage":
104            resources = media_node.get("display_resources", media_node.get("thumbnail_resources"))
105            try:
106                media_url = resources.pop()["src"]
107            except AttributeError:
108                media_url = media_node.get("display_url", "")
109        else:
110            media_url = media_node["display_url"]
111
112        # type, 'mixed' means carousel with video and photo
113        type_map = {"GraphSidecar": "photo", "GraphVideo": "video"}
114        if node["__typename"] != "GraphSidecar":
115            media_type = type_map.get(node["__typename"], "unknown")
116        else:
117            media_types = set([s["node"]["__typename"] for s in node["edge_sidecar_to_children"]["edges"]])
118            media_type = "mixed" if len(media_types) > 1 else type_map.get(media_types.pop(), "unknown")
119
120        location = {"name": "", "latlong": "", "city": "", "location_id": ""}
121        # location has 'id', 'has_public_page', 'name', and 'slug' keys in tested examples; no lat long or "city" though name seems
122        if node.get("location"):
123            location["name"] = node["location"].get("name")
124            location["location_id"] = node["location"].get("pk")
125            # Leaving this though it does not appear to be used in this type; maybe we'll be surprised in the future...
126            location["latlong"] = str(node["location"]["lat"]) + "," + str(node["location"]["lng"]) if node[
127                "location"].get("lat") else ""
128            location["city"] = node["location"].get("city")
129
130        no_likes = bool(node.get("like_and_view_counts_disabled"))
131
132        user = node.get("user")
133        owner = node.get("owner")
134        if node.get("user") and node.get("owner"):
135            if owner.get("id") == user.get("id"):
136                # Same id; owner may contain less info (e.g. no full name, username, etc.), so prefer user
137                pass
138            elif user.get("username") != owner.get("username"):
139                raise MapItemException("Unable to parse item: different user and owner")
140
141        mapped_item = {
142            # Post data
143            "id": node["shortcode"],
144            "post_source_domain": node.get("__import_meta", {}).get("source_platform_url"),  # Zeeschuimer metadata
145            "timestamp": datetime.datetime.fromtimestamp(node["taken_at_timestamp"]).strftime("%Y-%m-%d %H:%M:%S"),
146            "thread_id": node["shortcode"],
147            "parent_id": node["shortcode"],
148            "url": "https://www.instagram.com/p/" + node["shortcode"],
149            "body": caption,
150
151
152            # Author data
153            "author": user.get("username", owner.get("username", MissingMappedField(""))),
154            "author_fullname": user.get("full_name", owner.get("full_name", MissingMappedField(""))),
155            "is_verified": True if user.get("is_verified") else False,
156            "author_avatar_url": user.get("profile_pic_url", owner.get("profile_pic_url", MissingMappedField(""))),
157            # Unable to find graph type posts to test
158            "coauthors": MissingMappedField(""),
159            "coauthor_fullnames": MissingMappedField(""),
160            "coauthor_ids": MissingMappedField(""),
161
162            # Media
163            "media_type": media_type,
164            "num_media": num_media,
165            "image_urls": node["display_url"],
166            "media_urls": media_url,
167
168            # Engagement
169            "hashtags": ",".join(re.findall(r"#([^\s!@#$%^&*()_+{}:\"|<>?\[\];'\,./`~]+)", caption)),
170            # Unsure if usertags will work; need data (this could raise it to attention...)
171            "usertags": ",".join(
172                [u["node"]["user"]["username"] for u in node["edge_media_to_tagged_user"]["edges"]]),
173            "likes_hidden": "yes" if no_likes else "no",
174            "num_likes": node["edge_media_preview_like"]["count"] if not no_likes else MissingMappedField(0),
175            "num_comments": node.get("edge_media_preview_comment", {}).get("count", 0),
176
177            # Location data
178            "location_name": location["name"],
179            "location_id": location["location_id"],
180            "location_latlong": location["latlong"],
181            "location_city": location["city"],
182
183            # Metadata
184            "unix_timestamp": node["taken_at_timestamp"],
185            "missing_media": None
186        }
187
188        return mapped_item
189
190    @staticmethod
191    def parse_itemlist_item(node):
192        """
193        Parse Instagram post in 'item list' format
194
195        :param node:  Data as received from Instagram
196        :return dict:  Mapped item
197        """
198        num_media = 1 if node["media_type"] != SearchInstagram.MEDIA_TYPE_CAROUSEL else len(node["carousel_media"])
199        caption = MissingMappedField("") if "caption" not in node else "" if not node.get("caption") else node["caption"]["text"]
200
201        # get media urls
202        display_urls = []
203        media_urls = []
204        missing_media = None
205        type_map = {SearchInstagram.MEDIA_TYPE_PHOTO: "photo", SearchInstagram.MEDIA_TYPE_VIDEO: "video"}
206        media_types = set()
207        # for carousels, get the first media item, for videos, get the video
208        # url, for photos, get the highest resolution
209        if node["media_type"] == SearchInstagram.MEDIA_TYPE_CAROUSEL:
210            media_nodes = node["carousel_media"]
211        else:
212            media_nodes = [node]
213
214        for media_node in media_nodes:
215            if media_node["media_type"] == SearchInstagram.MEDIA_TYPE_VIDEO:
216                # Videos
217                media_urls.append(media_node["video_versions"][0]["url"])
218                if "image_versions2" in media_node:
219                    display_urls.append(media_node["image_versions2"]["candidates"][0]["url"])
220                else:
221                    # no image links at all :-/
222                    # video is all we have
223                    display_urls.append(media_node["video_versions"][0]["url"])
224
225            elif media_node["media_type"] == SearchInstagram.MEDIA_TYPE_PHOTO and media_node.get("image_versions2"):
226                # Images
227                media_url = media_node["image_versions2"]["candidates"][0]["url"]
228                display_urls.append(media_url)
229                media_urls.append(media_url)
230            else:
231                missing_media = MissingMappedField("")
232
233            media_types.add(type_map.get(media_node["media_type"], "unknown"))
234
235        # type, 'mixed' means carousel with video and photo
236        media_type = "mixed" if len(media_types) > 1 else media_types.pop()
237
238        if "comment_count" in node:
239            num_comments = node["comment_count"]
240        elif "comments" in node and type(node["comments"]) is list:
241            num_comments = len(node["comments"])
242        else:
243            num_comments = -1
244
245        location = {"name": "", "latlong": "", "city": "", "location_id": ""}
246        if node.get("location"):
247            location["name"] = node["location"].get("name")
248            location["location_id"] = node["location"].get("pk")
249            location["latlong"] = str(node["location"]["lat"]) + "," + str(node["location"]["lng"]) if node[
250                "location"].get("lat") else ""
251            location["city"] = node["location"].get("city")
252
253        user = node.get("user", {})
254        owner = node.get("owner", {})
255        if user and owner:
256            if owner.get("id") == user.get("id"):
257                # Same id; owner may contain less info (e.g. no full name, username, etc.), so prefer user
258                pass
259            elif user.get("username") != owner.get("username"):
260                raise MapItemException("Unable to parse item: different user and owner")
261
262        # Instagram posts also allow 'Collabs' with up to one co-author
263        coauthors = []
264        coauthor_fullnames = []
265        coauthor_ids = []
266        if node.get("coauthor_producers"):
267            for coauthor_node in node["coauthor_producers"]:
268                coauthors.append(coauthor_node.get("username"))
269                coauthor_fullnames.append(coauthor_node.get("full_name"))
270                coauthor_ids.append(coauthor_node.get("id"))
271
272        no_likes = bool(node.get("like_and_view_counts_disabled"))
273
274        # usertags
275        if "usertags" in node:
276            usertags = ",".join([user["user"]["username"] for user in node["usertags"]["in"]]) if node["usertags"] else ""
277        else:
278            # Not always included; MissingMappedField may be more appropriate, but it flags virtually all posts without tags (some do return `None`)
279            usertags = ""
280
281        mapped_item = {
282            # Post and caption
283            "collected_from_url": normalize_url_encoding(node.get("__import_meta", {}).get("source_platform_url")),  # Zeeschuimer metadata
284            "id": node["code"],
285            "timestamp": datetime.datetime.fromtimestamp(node["taken_at"]).strftime("%Y-%m-%d %H:%M:%S"),
286            "thread_id": node["code"],
287            "parent_id": node["code"],
288            "url": "https://www.instagram.com/p/" + node["code"],
289            "body": caption,
290
291            # Authors
292            "author": user.get("username", owner.get("username", MissingMappedField(""))),
293            "author_fullname": user.get("full_name", owner.get("full_name", MissingMappedField(""))),
294            "verified": True if user.get("is_verified") else False,
295            "author_avatar_url": user.get("profile_pic_url", owner.get("profile_pic_url", MissingMappedField(""))),
296            "coauthors": ",".join(coauthors),
297            "coauthor_fullnames": ",".join(coauthor_fullnames),
298            "coauthor_ids": ",".join(coauthor_ids),
299
300            # Media
301            "media_type": media_type,
302            "num_media": num_media,
303            "image_urls": ",".join(display_urls),
304            "media_urls": ",".join(media_urls),
305
306            # Engagement
307            "hashtags": ",".join(re.findall(r"#([^\s!@#$%ˆ&*()_+{}:\"|<>?\[\];'\,./`~'‘’]+)", caption)),
308            "usertags": usertags,
309            "likes_hidden": "yes" if no_likes else "no",
310            "num_likes": node["like_count"] if not no_likes else MissingMappedField(0),
311            "num_comments": num_comments,
312
313            # Location
314            "location_name": location["name"],
315            "location_id": location["location_id"],
316            "location_latlong": location["latlong"],
317            "location_city": location["city"],
318
319            # Metadata
320            "unix_timestamp": node["taken_at"],
321            "missing_media": missing_media, # This denotes media that is unable to be mapped and is otherwise None
322        }
323
324        return mapped_item

Import scraped Instagram data

type = 'instagram-search'
category = 'Search'
title = 'Import scraped Instagram data'
description = 'Import Instagram data collected with an external tool such as Zeeschuimer.'
extension = 'ndjson'
is_from_zeeschuimer = True
accepts = [None]
references = ['[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)', '[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok) (also covers usage with Instagram)']
MEDIA_TYPE_PHOTO = 1
MEDIA_TYPE_VIDEO = 2
def get_items(self, query):
40    def get_items(self, query):
41        """
42        Run custom search
43
44        Not available for Instagram
45        """
46        raise NotImplementedError("Instagram datasets can only be created by importing data from elsewhere")

Run custom search

Not available for Instagram

@staticmethod
def map_item(item):
49    @staticmethod
50    def map_item(item):
51        """
52        Map Instagram item
53
54        Instagram importing is a little bit roundabout since we can expect
55        input in two separate and not completely overlapping formats - an "edge
56        list" or an "item list", and posts are structured differently between
57        those, and do not contain the same data. So we find a middle ground
58        here... each format has its own handler function
59
60        :param dict item:  Item to map
61        :return:  Mapped item
62        """
63        link = item.get("link", "")
64        if (item.get("product_type", "") == "ad") or \
65                (link and link.startswith("https://www.facebook.com/ads/ig_redirect")):
66            # These are ads
67            raise MapItemException("appears to be Instagram ad, check raw data to confirm and ensure Zeeschuimer is up to date.")
68
69        is_graph_response = "__typename" in item and item["__typename"] not in ("XDTMediaDict",)
70
71        if is_graph_response:
72            return MappedItem(SearchInstagram.parse_graph_item(item))
73        else:
74            return MappedItem(SearchInstagram.parse_itemlist_item(item))

Map Instagram item

Instagram importing is a little bit roundabout since we can expect input in two separate and not completely overlapping formats - an "edge list" or an "item list", and posts are structured differently between those, and do not contain the same data. So we find a middle ground here... each format has its own handler function

Parameters
  • dict item: Item to map
Returns

Mapped item

@staticmethod
def parse_graph_item(node):
 76    @staticmethod
 77    def parse_graph_item(node):
 78        """
 79        Parse Instagram post in Graph format
 80
 81        2025-6-5: potentially legacy format
 82
 83        :param node:  Data as received from Instagram
 84        :return dict:  Mapped item
 85        """
 86        try:
 87            caption = node["edge_media_to_caption"]["edges"][0]["node"]["text"]
 88        except IndexError:
 89            caption = MissingMappedField("")
 90
 91        num_media = 1 if node["__typename"] != "GraphSidecar" else len(node["edge_sidecar_to_children"]["edges"])
 92
 93        # get media url
 94        # for carousels, get the first media item, for videos, get the video
 95        # url, for photos, get the highest resolution
 96        if node["__typename"] == "GraphSidecar":
 97            media_node = node["edge_sidecar_to_children"]["edges"][0]["node"]
 98        else:
 99            media_node = node
100
101        if media_node["__typename"] == "GraphVideo":
102            media_url = media_node["video_url"]
103        elif media_node["__typename"] == "GraphImage":
104            resources = media_node.get("display_resources", media_node.get("thumbnail_resources"))
105            try:
106                media_url = resources.pop()["src"]
107            except AttributeError:
108                media_url = media_node.get("display_url", "")
109        else:
110            media_url = media_node["display_url"]
111
112        # type, 'mixed' means carousel with video and photo
113        type_map = {"GraphSidecar": "photo", "GraphVideo": "video"}
114        if node["__typename"] != "GraphSidecar":
115            media_type = type_map.get(node["__typename"], "unknown")
116        else:
117            media_types = set([s["node"]["__typename"] for s in node["edge_sidecar_to_children"]["edges"]])
118            media_type = "mixed" if len(media_types) > 1 else type_map.get(media_types.pop(), "unknown")
119
120        location = {"name": "", "latlong": "", "city": "", "location_id": ""}
121        # location has 'id', 'has_public_page', 'name', and 'slug' keys in tested examples; no lat long or "city" though name seems
122        if node.get("location"):
123            location["name"] = node["location"].get("name")
124            location["location_id"] = node["location"].get("pk")
125            # Leaving this though it does not appear to be used in this type; maybe we'll be surprised in the future...
126            location["latlong"] = str(node["location"]["lat"]) + "," + str(node["location"]["lng"]) if node[
127                "location"].get("lat") else ""
128            location["city"] = node["location"].get("city")
129
130        no_likes = bool(node.get("like_and_view_counts_disabled"))
131
132        user = node.get("user")
133        owner = node.get("owner")
134        if node.get("user") and node.get("owner"):
135            if owner.get("id") == user.get("id"):
136                # Same id; owner may contain less info (e.g. no full name, username, etc.), so prefer user
137                pass
138            elif user.get("username") != owner.get("username"):
139                raise MapItemException("Unable to parse item: different user and owner")
140
141        mapped_item = {
142            # Post data
143            "id": node["shortcode"],
144            "post_source_domain": node.get("__import_meta", {}).get("source_platform_url"),  # Zeeschuimer metadata
145            "timestamp": datetime.datetime.fromtimestamp(node["taken_at_timestamp"]).strftime("%Y-%m-%d %H:%M:%S"),
146            "thread_id": node["shortcode"],
147            "parent_id": node["shortcode"],
148            "url": "https://www.instagram.com/p/" + node["shortcode"],
149            "body": caption,
150
151
152            # Author data
153            "author": user.get("username", owner.get("username", MissingMappedField(""))),
154            "author_fullname": user.get("full_name", owner.get("full_name", MissingMappedField(""))),
155            "is_verified": True if user.get("is_verified") else False,
156            "author_avatar_url": user.get("profile_pic_url", owner.get("profile_pic_url", MissingMappedField(""))),
157            # Unable to find graph type posts to test
158            "coauthors": MissingMappedField(""),
159            "coauthor_fullnames": MissingMappedField(""),
160            "coauthor_ids": MissingMappedField(""),
161
162            # Media
163            "media_type": media_type,
164            "num_media": num_media,
165            "image_urls": node["display_url"],
166            "media_urls": media_url,
167
168            # Engagement
169            "hashtags": ",".join(re.findall(r"#([^\s!@#$%^&*()_+{}:\"|<>?\[\];'\,./`~]+)", caption)),
170            # Unsure if usertags will work; need data (this could raise it to attention...)
171            "usertags": ",".join(
172                [u["node"]["user"]["username"] for u in node["edge_media_to_tagged_user"]["edges"]]),
173            "likes_hidden": "yes" if no_likes else "no",
174            "num_likes": node["edge_media_preview_like"]["count"] if not no_likes else MissingMappedField(0),
175            "num_comments": node.get("edge_media_preview_comment", {}).get("count", 0),
176
177            # Location data
178            "location_name": location["name"],
179            "location_id": location["location_id"],
180            "location_latlong": location["latlong"],
181            "location_city": location["city"],
182
183            # Metadata
184            "unix_timestamp": node["taken_at_timestamp"],
185            "missing_media": None
186        }
187
188        return mapped_item

Parse Instagram post in Graph format

2025-6-5: potentially legacy format

Parameters
  • node: Data as received from Instagram
Returns

Mapped item

@staticmethod
def parse_itemlist_item(node):
190    @staticmethod
191    def parse_itemlist_item(node):
192        """
193        Parse Instagram post in 'item list' format
194
195        :param node:  Data as received from Instagram
196        :return dict:  Mapped item
197        """
198        num_media = 1 if node["media_type"] != SearchInstagram.MEDIA_TYPE_CAROUSEL else len(node["carousel_media"])
199        caption = MissingMappedField("") if "caption" not in node else "" if not node.get("caption") else node["caption"]["text"]
200
201        # get media urls
202        display_urls = []
203        media_urls = []
204        missing_media = None
205        type_map = {SearchInstagram.MEDIA_TYPE_PHOTO: "photo", SearchInstagram.MEDIA_TYPE_VIDEO: "video"}
206        media_types = set()
207        # for carousels, get the first media item, for videos, get the video
208        # url, for photos, get the highest resolution
209        if node["media_type"] == SearchInstagram.MEDIA_TYPE_CAROUSEL:
210            media_nodes = node["carousel_media"]
211        else:
212            media_nodes = [node]
213
214        for media_node in media_nodes:
215            if media_node["media_type"] == SearchInstagram.MEDIA_TYPE_VIDEO:
216                # Videos
217                media_urls.append(media_node["video_versions"][0]["url"])
218                if "image_versions2" in media_node:
219                    display_urls.append(media_node["image_versions2"]["candidates"][0]["url"])
220                else:
221                    # no image links at all :-/
222                    # video is all we have
223                    display_urls.append(media_node["video_versions"][0]["url"])
224
225            elif media_node["media_type"] == SearchInstagram.MEDIA_TYPE_PHOTO and media_node.get("image_versions2"):
226                # Images
227                media_url = media_node["image_versions2"]["candidates"][0]["url"]
228                display_urls.append(media_url)
229                media_urls.append(media_url)
230            else:
231                missing_media = MissingMappedField("")
232
233            media_types.add(type_map.get(media_node["media_type"], "unknown"))
234
235        # type, 'mixed' means carousel with video and photo
236        media_type = "mixed" if len(media_types) > 1 else media_types.pop()
237
238        if "comment_count" in node:
239            num_comments = node["comment_count"]
240        elif "comments" in node and type(node["comments"]) is list:
241            num_comments = len(node["comments"])
242        else:
243            num_comments = -1
244
245        location = {"name": "", "latlong": "", "city": "", "location_id": ""}
246        if node.get("location"):
247            location["name"] = node["location"].get("name")
248            location["location_id"] = node["location"].get("pk")
249            location["latlong"] = str(node["location"]["lat"]) + "," + str(node["location"]["lng"]) if node[
250                "location"].get("lat") else ""
251            location["city"] = node["location"].get("city")
252
253        user = node.get("user", {})
254        owner = node.get("owner", {})
255        if user and owner:
256            if owner.get("id") == user.get("id"):
257                # Same id; owner may contain less info (e.g. no full name, username, etc.), so prefer user
258                pass
259            elif user.get("username") != owner.get("username"):
260                raise MapItemException("Unable to parse item: different user and owner")
261
262        # Instagram posts also allow 'Collabs' with up to one co-author
263        coauthors = []
264        coauthor_fullnames = []
265        coauthor_ids = []
266        if node.get("coauthor_producers"):
267            for coauthor_node in node["coauthor_producers"]:
268                coauthors.append(coauthor_node.get("username"))
269                coauthor_fullnames.append(coauthor_node.get("full_name"))
270                coauthor_ids.append(coauthor_node.get("id"))
271
272        no_likes = bool(node.get("like_and_view_counts_disabled"))
273
274        # usertags
275        if "usertags" in node:
276            usertags = ",".join([user["user"]["username"] for user in node["usertags"]["in"]]) if node["usertags"] else ""
277        else:
278            # Not always included; MissingMappedField may be more appropriate, but it flags virtually all posts without tags (some do return `None`)
279            usertags = ""
280
281        mapped_item = {
282            # Post and caption
283            "collected_from_url": normalize_url_encoding(node.get("__import_meta", {}).get("source_platform_url")),  # Zeeschuimer metadata
284            "id": node["code"],
285            "timestamp": datetime.datetime.fromtimestamp(node["taken_at"]).strftime("%Y-%m-%d %H:%M:%S"),
286            "thread_id": node["code"],
287            "parent_id": node["code"],
288            "url": "https://www.instagram.com/p/" + node["code"],
289            "body": caption,
290
291            # Authors
292            "author": user.get("username", owner.get("username", MissingMappedField(""))),
293            "author_fullname": user.get("full_name", owner.get("full_name", MissingMappedField(""))),
294            "verified": True if user.get("is_verified") else False,
295            "author_avatar_url": user.get("profile_pic_url", owner.get("profile_pic_url", MissingMappedField(""))),
296            "coauthors": ",".join(coauthors),
297            "coauthor_fullnames": ",".join(coauthor_fullnames),
298            "coauthor_ids": ",".join(coauthor_ids),
299
300            # Media
301            "media_type": media_type,
302            "num_media": num_media,
303            "image_urls": ",".join(display_urls),
304            "media_urls": ",".join(media_urls),
305
306            # Engagement
307            "hashtags": ",".join(re.findall(r"#([^\s!@#$%ˆ&*()_+{}:\"|<>?\[\];'\,./`~'‘’]+)", caption)),
308            "usertags": usertags,
309            "likes_hidden": "yes" if no_likes else "no",
310            "num_likes": node["like_count"] if not no_likes else MissingMappedField(0),
311            "num_comments": num_comments,
312
313            # Location
314            "location_name": location["name"],
315            "location_id": location["location_id"],
316            "location_latlong": location["latlong"],
317            "location_city": location["city"],
318
319            # Metadata
320            "unix_timestamp": node["taken_at"],
321            "missing_media": missing_media, # This denotes media that is unable to be mapped and is otherwise None
322        }
323
324        return mapped_item

Parse Instagram post in 'item list' format

Parameters
  • node: Data as received from Instagram
Returns

Mapped item