Edit on GitHub

datasources.instagram.search_instagram

Import scraped Instagram data

It's prohibitively difficult to scrape data from Instagram within 4CAT itself due to Instagram's aggressive rate limiting. Instead, import data collected elsewhere.

  1"""
  2Import scraped Instagram data
  3
  4It's prohibitively difficult to scrape data from Instagram within 4CAT itself
  5due to its aggressive rate limiting. Instead, import data collected elsewhere.
  6"""
  7import datetime
  8import re
  9
 10from backend.lib.search import Search
 11from common.lib.item_mapping import MappedItem, MissingMappedField
 12from common.lib.exceptions import MapItemException
 13
 14
 15class SearchInstagram(Search):
 16    """
 17    Import scraped Instagram data
 18    """
 19    type = "instagram-search"  # job ID
 20    category = "Search"  # category
 21    title = "Import scraped Instagram data"  # title displayed in UI
 22    description = "Import Instagram data collected with an external tool such as Zeeschuimer."  # description displayed in UI
 23    extension = "ndjson"  # extension of result file, used internally and in UI
 24    is_from_zeeschuimer = True
 25
 26    # not available as a processor for existing datasets
 27    accepts = [None]
 28    references = [
 29        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
 30        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok) (also covers usage with Instagram)"
 31    ]
 32
 33    # some magic numbers instagram uses
 34    MEDIA_TYPE_PHOTO = 1
 35    MEDIA_TYPE_VIDEO = 2
 36    MEDIA_TYPE_CAROUSEL = 8
 37
 38    def get_items(self, query):
 39        """
 40        Run custom search
 41
 42        Not available for Instagram
 43        """
 44        raise NotImplementedError("Instagram datasets can only be created by importing data from elsewhere")
 45
 46
 47    @staticmethod
 48    def map_item(item):
 49        """
 50        Map Instagram item
 51
 52        Instagram importing is a little bit roundabout since we can expect
 53        input in two separate and not completely overlapping formats - an "edge
 54        list" or an "item list", and posts are structured differently between
 55        those, and do not contain the same data. So we find a middle ground
 56        here... each format has its own handler function
 57
 58        :param dict item:  Item to map
 59        :return:  Mapped item
 60        """
 61        link = item.get("link", "")
 62        if (item.get("product_type", "") == "ad") or \
 63                (link and link.startswith("https://www.facebook.com/ads/ig_redirect")):
 64            # These are ads
 65            raise MapItemException("appears to be Instagram ad, check raw data to confirm and ensure Zeeschuimer is up to date.")
 66
 67        is_graph_response = "__typename" in item and item["__typename"] not in ("XDTMediaDict",)
 68
 69        if is_graph_response:
 70            return MappedItem(SearchInstagram.parse_graph_item(item))
 71        else:
 72            return MappedItem(SearchInstagram.parse_itemlist_item(item))
 73
 74    @staticmethod
 75    def parse_graph_item(node):
 76        """
 77        Parse Instagram post in Graph format
 78
 79        2025-6-5: potentially legacy format
 80
 81        :param node:  Data as received from Instagram
 82        :return dict:  Mapped item
 83        """
 84        try:
 85            caption = node["edge_media_to_caption"]["edges"][0]["node"]["text"]
 86        except IndexError:
 87            caption = MissingMappedField("")
 88
 89        num_media = 1 if node["__typename"] != "GraphSidecar" else len(node["edge_sidecar_to_children"]["edges"])
 90
 91        # get media url
 92        # for carousels, get the first media item, for videos, get the video
 93        # url, for photos, get the highest resolution
 94        if node["__typename"] == "GraphSidecar":
 95            media_node = node["edge_sidecar_to_children"]["edges"][0]["node"]
 96        else:
 97            media_node = node
 98
 99        if media_node["__typename"] == "GraphVideo":
100            media_url = media_node["video_url"]
101        elif media_node["__typename"] == "GraphImage":
102            resources = media_node.get("display_resources", media_node.get("thumbnail_resources"))
103            try:
104                media_url = resources.pop()["src"]
105            except AttributeError:
106                media_url = media_node.get("display_url", "")
107        else:
108            media_url = media_node["display_url"]
109
110        # type, 'mixed' means carousel with video and photo
111        type_map = {"GraphSidecar": "photo", "GraphVideo": "video"}
112        if node["__typename"] != "GraphSidecar":
113            media_type = type_map.get(node["__typename"], "unknown")
114        else:
115            media_types = set([s["node"]["__typename"] for s in node["edge_sidecar_to_children"]["edges"]])
116            media_type = "mixed" if len(media_types) > 1 else type_map.get(media_types.pop(), "unknown")
117
118        location = {"name": "", "latlong": "", "city": "", "location_id": ""}
119        # location has 'id', 'has_public_page', 'name', and 'slug' keys in tested examples; no lat long or "city" though name seems
120        if node.get("location"):
121            location["name"] = node["location"].get("name")
122            location["location_id"] = node["location"].get("pk")
123            # Leaving this though it does not appear to be used in this type; maybe we'll be surprised in the future...
124            location["latlong"] = str(node["location"]["lat"]) + "," + str(node["location"]["lng"]) if node[
125                "location"].get("lat") else ""
126            location["city"] = node["location"].get("city")
127
128        no_likes = bool(node.get("like_and_view_counts_disabled"))
129
130        user = node.get("user")
131        owner = node.get("owner")
132        if node.get("user") and node.get("owner"):
133            if user.get("username") != owner.get("username"):
134                raise MapItemException("Unable to parse item: different user and owner")
135
136        mapped_item = {
137            # Post data
138            "id": node["shortcode"],
139            "post_source_domain": node.get("__import_meta", {}).get("source_platform_url"),  # Zeeschuimer metadata
140            "timestamp": datetime.datetime.fromtimestamp(node["taken_at_timestamp"]).strftime("%Y-%m-%d %H:%M:%S"),
141            "thread_id": node["shortcode"],
142            "parent_id": node["shortcode"],
143            "url": "https://www.instagram.com/p/" + node["shortcode"],
144            "body": caption,
145
146
147            # Author data
148            "author": user.get("username", owner.get("username", MissingMappedField(""))),
149            "author_fullname": user.get("full_name", owner.get("full_name", MissingMappedField(""))),
150            "is_verified": True if user.get("is_verified") else False,
151            "author_avatar_url": user.get("profile_pic_url", owner.get("profile_pic_url", MissingMappedField(""))),
152            # Unable to find graph type posts to test
153            "coauthors": MissingMappedField(""),
154            "coauthor_fullnames": MissingMappedField(""),
155            "coauthor_ids": MissingMappedField(""),
156
157            # Media
158            "media_type": media_type,
159            "num_media": num_media,
160            "image_urls": node["display_url"],
161            "media_urls": media_url,
162
163            # Engagement
164            "hashtags": ",".join(re.findall(r"#([^\s!@#$%^&*()_+{}:\"|<>?\[\];'\,./`~]+)", caption)),
165            # Unsure if usertags will work; need data (this could raise it to attention...)
166            "usertags": ",".join(
167                [u["node"]["user"]["username"] for u in node["edge_media_to_tagged_user"]["edges"]]),
168            "likes_hidden": "yes" if no_likes else "no",
169            "num_likes": node["edge_media_preview_like"]["count"] if not no_likes else MissingMappedField(0),
170            "num_comments": node.get("edge_media_preview_comment", {}).get("count", 0),
171
172            # Location data
173            "location_name": location["name"],
174            "location_id": location["location_id"],
175            "location_latlong": location["latlong"],
176            "location_city": location["city"],
177
178            # Metadata
179            "unix_timestamp": node["taken_at_timestamp"],
180            "missing_media": None
181        }
182
183        return mapped_item
184
185    @staticmethod
186    def parse_itemlist_item(node):
187        """
188        Parse Instagram post in 'item list' format
189
190        :param node:  Data as received from Instagram
191        :return dict:  Mapped item
192        """
193        num_media = 1 if node["media_type"] != SearchInstagram.MEDIA_TYPE_CAROUSEL else len(node["carousel_media"])
194        caption = MissingMappedField("") if "caption" not in node else "" if not node.get("caption") else node["caption"]["text"]
195
196        # get media urls
197        display_urls = []
198        media_urls = []
199        missing_media = None
200        type_map = {SearchInstagram.MEDIA_TYPE_PHOTO: "photo", SearchInstagram.MEDIA_TYPE_VIDEO: "video"}
201        media_types = set()
202        # for carousels, get the first media item, for videos, get the video
203        # url, for photos, get the highest resolution
204        if node["media_type"] == SearchInstagram.MEDIA_TYPE_CAROUSEL:
205            media_nodes = node["carousel_media"]
206        else:
207            media_nodes = [node]
208
209        for media_node in media_nodes:
210            if media_node["media_type"] == SearchInstagram.MEDIA_TYPE_VIDEO:
211                # Videos
212                media_urls.append(media_node["video_versions"][0]["url"])
213                if "image_versions2" in media_node:
214                    display_urls.append(media_node["image_versions2"]["candidates"][0]["url"])
215                else:
216                    # no image links at all :-/
217                    # video is all we have
218                    display_urls.append(media_node["video_versions"][0]["url"])
219
220            elif media_node["media_type"] == SearchInstagram.MEDIA_TYPE_PHOTO and media_node.get("image_versions2"):
221                # Images
222                media_url = media_node["image_versions2"]["candidates"][0]["url"]
223                display_urls.append(media_url)
224                media_urls.append(media_url)
225            else:
226                missing_media = MissingMappedField("")
227
228            media_types.add(type_map.get(media_node["media_type"], "unknown"))
229
230        # type, 'mixed' means carousel with video and photo
231        media_type = "mixed" if len(media_types) > 1 else media_types.pop()
232
233        if "comment_count" in node:
234            num_comments = node["comment_count"]
235        elif "comments" in node and type(node["comments"]) is list:
236            num_comments = len(node["comments"])
237        else:
238            num_comments = -1
239
240        location = {"name": "", "latlong": "", "city": "", "location_id": ""}
241        if node.get("location"):
242            location["name"] = node["location"].get("name")
243            location["location_id"] = node["location"].get("pk")
244            location["latlong"] = str(node["location"]["lat"]) + "," + str(node["location"]["lng"]) if node[
245                "location"].get("lat") else ""
246            location["city"] = node["location"].get("city")
247
248        user = node.get("user", {})
249        owner = node.get("owner", {})
250        if user and owner:
251            if user.get("username") != owner.get("username"):
252                raise MapItemException("Unable to parse item: different user and owner")
253
254        # Instagram posts also allow 'Collabs' with up to one co-author
255        coauthors = []
256        coauthor_fullnames = []
257        coauthor_ids = []
258        if node.get("coauthor_producers"):
259            for coauthor_node in node["coauthor_producers"]:
260                coauthors.append(coauthor_node.get("username"))
261                coauthor_fullnames.append(coauthor_node.get("full_name"))
262                coauthor_ids.append(coauthor_node.get("id"))
263
264        no_likes = bool(node.get("like_and_view_counts_disabled"))
265
266        # usertags
267        if "usertags" in node:
268            usertags = ",".join([user["user"]["username"] for user in node["usertags"]["in"]]) if node["usertags"] else ""
269        else:
270            # Not always included; MissingMappedField may be more appropriate, but it flags virtually all posts without tags (some do return `None`)
271            usertags = ""
272
273        mapped_item = {
274            # Post and caption
275            "id": node["code"],
276            "post_source_domain": node.get("__import_meta", {}).get("source_platform_url"), # Zeeschuimer metadata
277            "timestamp": datetime.datetime.fromtimestamp(node["taken_at"]).strftime("%Y-%m-%d %H:%M:%S"),
278            "thread_id": node["code"],
279            "parent_id": node["code"],
280            "url": "https://www.instagram.com/p/" + node["code"],
281            "body": caption,
282
283            # Authors
284            "author": user.get("username", owner.get("username", MissingMappedField(""))),
285            "author_fullname": user.get("full_name", owner.get("full_name", MissingMappedField(""))),
286            "verified": True if user.get("is_verified") else False,
287            "author_avatar_url": user.get("profile_pic_url", owner.get("profile_pic_url", MissingMappedField(""))),
288            "coauthors": ",".join(coauthors),
289            "coauthor_fullnames": ",".join(coauthor_fullnames),
290            "coauthor_ids": ",".join(coauthor_ids),
291
292            # Media
293            "media_type": media_type,
294            "num_media": num_media,
295            "image_urls": ",".join(display_urls),
296            "media_urls": ",".join(media_urls),
297
298            # Engagement
299            "hashtags": ",".join(re.findall(r"#([^\s!@#$%ˆ&*()_+{}:\"|<>?\[\];'\,./`~'‘’]+)", caption)),
300            "usertags": usertags,
301            "likes_hidden": "yes" if no_likes else "no",
302            "num_likes": node["like_count"] if not no_likes else MissingMappedField(0),
303            "num_comments": num_comments,
304
305            # Location
306            "location_name": location["name"],
307            "location_id": location["location_id"],
308            "location_latlong": location["latlong"],
309            "location_city": location["city"],
310
311            # Metadata
312            "unix_timestamp": node["taken_at"],
313            "missing_media": missing_media, # This denotes media that is unable to be mapped and is otherwise None
314        }
315
316        return mapped_item
class SearchInstagram(backend.lib.search.Search):
 16class SearchInstagram(Search):
 17    """
 18    Import scraped Instagram data
 19    """
 20    type = "instagram-search"  # job ID
 21    category = "Search"  # category
 22    title = "Import scraped Instagram data"  # title displayed in UI
 23    description = "Import Instagram data collected with an external tool such as Zeeschuimer."  # description displayed in UI
 24    extension = "ndjson"  # extension of result file, used internally and in UI
 25    is_from_zeeschuimer = True
 26
 27    # not available as a processor for existing datasets
 28    accepts = [None]
 29    references = [
 30        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
 31        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok) (also covers usage with Instagram)"
 32    ]
 33
 34    # some magic numbers instagram uses
 35    MEDIA_TYPE_PHOTO = 1
 36    MEDIA_TYPE_VIDEO = 2
 37    MEDIA_TYPE_CAROUSEL = 8
 38
 39    def get_items(self, query):
 40        """
 41        Run custom search
 42
 43        Not available for Instagram
 44        """
 45        raise NotImplementedError("Instagram datasets can only be created by importing data from elsewhere")
 46
 47
 48    @staticmethod
 49    def map_item(item):
 50        """
 51        Map Instagram item
 52
 53        Instagram importing is a little bit roundabout since we can expect
 54        input in two separate and not completely overlapping formats - an "edge
 55        list" or an "item list", and posts are structured differently between
 56        those, and do not contain the same data. So we find a middle ground
 57        here... each format has its own handler function
 58
 59        :param dict item:  Item to map
 60        :return:  Mapped item
 61        """
 62        link = item.get("link", "")
 63        if (item.get("product_type", "") == "ad") or \
 64                (link and link.startswith("https://www.facebook.com/ads/ig_redirect")):
 65            # These are ads
 66            raise MapItemException("appears to be Instagram ad, check raw data to confirm and ensure Zeeschuimer is up to date.")
 67
 68        is_graph_response = "__typename" in item and item["__typename"] not in ("XDTMediaDict",)
 69
 70        if is_graph_response:
 71            return MappedItem(SearchInstagram.parse_graph_item(item))
 72        else:
 73            return MappedItem(SearchInstagram.parse_itemlist_item(item))
 74
 75    @staticmethod
 76    def parse_graph_item(node):
 77        """
 78        Parse Instagram post in Graph format
 79
 80        2025-6-5: potentially legacy format
 81
 82        :param node:  Data as received from Instagram
 83        :return dict:  Mapped item
 84        """
 85        try:
 86            caption = node["edge_media_to_caption"]["edges"][0]["node"]["text"]
 87        except IndexError:
 88            caption = MissingMappedField("")
 89
 90        num_media = 1 if node["__typename"] != "GraphSidecar" else len(node["edge_sidecar_to_children"]["edges"])
 91
 92        # get media url
 93        # for carousels, get the first media item, for videos, get the video
 94        # url, for photos, get the highest resolution
 95        if node["__typename"] == "GraphSidecar":
 96            media_node = node["edge_sidecar_to_children"]["edges"][0]["node"]
 97        else:
 98            media_node = node
 99
100        if media_node["__typename"] == "GraphVideo":
101            media_url = media_node["video_url"]
102        elif media_node["__typename"] == "GraphImage":
103            resources = media_node.get("display_resources", media_node.get("thumbnail_resources"))
104            try:
105                media_url = resources.pop()["src"]
106            except AttributeError:
107                media_url = media_node.get("display_url", "")
108        else:
109            media_url = media_node["display_url"]
110
111        # type, 'mixed' means carousel with video and photo
112        type_map = {"GraphSidecar": "photo", "GraphVideo": "video"}
113        if node["__typename"] != "GraphSidecar":
114            media_type = type_map.get(node["__typename"], "unknown")
115        else:
116            media_types = set([s["node"]["__typename"] for s in node["edge_sidecar_to_children"]["edges"]])
117            media_type = "mixed" if len(media_types) > 1 else type_map.get(media_types.pop(), "unknown")
118
119        location = {"name": "", "latlong": "", "city": "", "location_id": ""}
120        # location has 'id', 'has_public_page', 'name', and 'slug' keys in tested examples; no lat long or "city" though name seems
121        if node.get("location"):
122            location["name"] = node["location"].get("name")
123            location["location_id"] = node["location"].get("pk")
124            # Leaving this though it does not appear to be used in this type; maybe we'll be surprised in the future...
125            location["latlong"] = str(node["location"]["lat"]) + "," + str(node["location"]["lng"]) if node[
126                "location"].get("lat") else ""
127            location["city"] = node["location"].get("city")
128
129        no_likes = bool(node.get("like_and_view_counts_disabled"))
130
131        user = node.get("user")
132        owner = node.get("owner")
133        if node.get("user") and node.get("owner"):
134            if user.get("username") != owner.get("username"):
135                raise MapItemException("Unable to parse item: different user and owner")
136
137        mapped_item = {
138            # Post data
139            "id": node["shortcode"],
140            "post_source_domain": node.get("__import_meta", {}).get("source_platform_url"),  # Zeeschuimer metadata
141            "timestamp": datetime.datetime.fromtimestamp(node["taken_at_timestamp"]).strftime("%Y-%m-%d %H:%M:%S"),
142            "thread_id": node["shortcode"],
143            "parent_id": node["shortcode"],
144            "url": "https://www.instagram.com/p/" + node["shortcode"],
145            "body": caption,
146
147
148            # Author data
149            "author": user.get("username", owner.get("username", MissingMappedField(""))),
150            "author_fullname": user.get("full_name", owner.get("full_name", MissingMappedField(""))),
151            "is_verified": True if user.get("is_verified") else False,
152            "author_avatar_url": user.get("profile_pic_url", owner.get("profile_pic_url", MissingMappedField(""))),
153            # Unable to find graph type posts to test
154            "coauthors": MissingMappedField(""),
155            "coauthor_fullnames": MissingMappedField(""),
156            "coauthor_ids": MissingMappedField(""),
157
158            # Media
159            "media_type": media_type,
160            "num_media": num_media,
161            "image_urls": node["display_url"],
162            "media_urls": media_url,
163
164            # Engagement
165            "hashtags": ",".join(re.findall(r"#([^\s!@#$%^&*()_+{}:\"|<>?\[\];'\,./`~]+)", caption)),
166            # Unsure if usertags will work; need data (this could raise it to attention...)
167            "usertags": ",".join(
168                [u["node"]["user"]["username"] for u in node["edge_media_to_tagged_user"]["edges"]]),
169            "likes_hidden": "yes" if no_likes else "no",
170            "num_likes": node["edge_media_preview_like"]["count"] if not no_likes else MissingMappedField(0),
171            "num_comments": node.get("edge_media_preview_comment", {}).get("count", 0),
172
173            # Location data
174            "location_name": location["name"],
175            "location_id": location["location_id"],
176            "location_latlong": location["latlong"],
177            "location_city": location["city"],
178
179            # Metadata
180            "unix_timestamp": node["taken_at_timestamp"],
181            "missing_media": None
182        }
183
184        return mapped_item
185
186    @staticmethod
187    def parse_itemlist_item(node):
188        """
189        Parse Instagram post in 'item list' format
190
191        :param node:  Data as received from Instagram
192        :return dict:  Mapped item
193        """
194        num_media = 1 if node["media_type"] != SearchInstagram.MEDIA_TYPE_CAROUSEL else len(node["carousel_media"])
195        caption = MissingMappedField("") if "caption" not in node else "" if not node.get("caption") else node["caption"]["text"]
196
197        # get media urls
198        display_urls = []
199        media_urls = []
200        missing_media = None
201        type_map = {SearchInstagram.MEDIA_TYPE_PHOTO: "photo", SearchInstagram.MEDIA_TYPE_VIDEO: "video"}
202        media_types = set()
203        # for carousels, get the first media item, for videos, get the video
204        # url, for photos, get the highest resolution
205        if node["media_type"] == SearchInstagram.MEDIA_TYPE_CAROUSEL:
206            media_nodes = node["carousel_media"]
207        else:
208            media_nodes = [node]
209
210        for media_node in media_nodes:
211            if media_node["media_type"] == SearchInstagram.MEDIA_TYPE_VIDEO:
212                # Videos
213                media_urls.append(media_node["video_versions"][0]["url"])
214                if "image_versions2" in media_node:
215                    display_urls.append(media_node["image_versions2"]["candidates"][0]["url"])
216                else:
217                    # no image links at all :-/
218                    # video is all we have
219                    display_urls.append(media_node["video_versions"][0]["url"])
220
221            elif media_node["media_type"] == SearchInstagram.MEDIA_TYPE_PHOTO and media_node.get("image_versions2"):
222                # Images
223                media_url = media_node["image_versions2"]["candidates"][0]["url"]
224                display_urls.append(media_url)
225                media_urls.append(media_url)
226            else:
227                missing_media = MissingMappedField("")
228
229            media_types.add(type_map.get(media_node["media_type"], "unknown"))
230
231        # type, 'mixed' means carousel with video and photo
232        media_type = "mixed" if len(media_types) > 1 else media_types.pop()
233
234        if "comment_count" in node:
235            num_comments = node["comment_count"]
236        elif "comments" in node and type(node["comments"]) is list:
237            num_comments = len(node["comments"])
238        else:
239            num_comments = -1
240
241        location = {"name": "", "latlong": "", "city": "", "location_id": ""}
242        if node.get("location"):
243            location["name"] = node["location"].get("name")
244            location["location_id"] = node["location"].get("pk")
245            location["latlong"] = str(node["location"]["lat"]) + "," + str(node["location"]["lng"]) if node[
246                "location"].get("lat") else ""
247            location["city"] = node["location"].get("city")
248
249        user = node.get("user", {})
250        owner = node.get("owner", {})
251        if user and owner:
252            if user.get("username") != owner.get("username"):
253                raise MapItemException("Unable to parse item: different user and owner")
254
255        # Instagram posts also allow 'Collabs' with up to one co-author
256        coauthors = []
257        coauthor_fullnames = []
258        coauthor_ids = []
259        if node.get("coauthor_producers"):
260            for coauthor_node in node["coauthor_producers"]:
261                coauthors.append(coauthor_node.get("username"))
262                coauthor_fullnames.append(coauthor_node.get("full_name"))
263                coauthor_ids.append(coauthor_node.get("id"))
264
265        no_likes = bool(node.get("like_and_view_counts_disabled"))
266
267        # usertags
268        if "usertags" in node:
269            usertags = ",".join([user["user"]["username"] for user in node["usertags"]["in"]]) if node["usertags"] else ""
270        else:
271            # Not always included; MissingMappedField may be more appropriate, but it flags virtually all posts without tags (some do return `None`)
272            usertags = ""
273
274        mapped_item = {
275            # Post and caption
276            "id": node["code"],
277            "post_source_domain": node.get("__import_meta", {}).get("source_platform_url"), # Zeeschuimer metadata
278            "timestamp": datetime.datetime.fromtimestamp(node["taken_at"]).strftime("%Y-%m-%d %H:%M:%S"),
279            "thread_id": node["code"],
280            "parent_id": node["code"],
281            "url": "https://www.instagram.com/p/" + node["code"],
282            "body": caption,
283
284            # Authors
285            "author": user.get("username", owner.get("username", MissingMappedField(""))),
286            "author_fullname": user.get("full_name", owner.get("full_name", MissingMappedField(""))),
287            "verified": True if user.get("is_verified") else False,
288            "author_avatar_url": user.get("profile_pic_url", owner.get("profile_pic_url", MissingMappedField(""))),
289            "coauthors": ",".join(coauthors),
290            "coauthor_fullnames": ",".join(coauthor_fullnames),
291            "coauthor_ids": ",".join(coauthor_ids),
292
293            # Media
294            "media_type": media_type,
295            "num_media": num_media,
296            "image_urls": ",".join(display_urls),
297            "media_urls": ",".join(media_urls),
298
299            # Engagement
300            "hashtags": ",".join(re.findall(r"#([^\s!@#$%ˆ&*()_+{}:\"|<>?\[\];'\,./`~'‘’]+)", caption)),
301            "usertags": usertags,
302            "likes_hidden": "yes" if no_likes else "no",
303            "num_likes": node["like_count"] if not no_likes else MissingMappedField(0),
304            "num_comments": num_comments,
305
306            # Location
307            "location_name": location["name"],
308            "location_id": location["location_id"],
309            "location_latlong": location["latlong"],
310            "location_city": location["city"],
311
312            # Metadata
313            "unix_timestamp": node["taken_at"],
314            "missing_media": missing_media, # This denotes media that is unable to be mapped and is otherwise None
315        }
316
317        return mapped_item

Import scraped Instagram data

type = 'instagram-search'
category = 'Search'
title = 'Import scraped Instagram data'
description = 'Import Instagram data collected with an external tool such as Zeeschuimer.'
extension = 'ndjson'
is_from_zeeschuimer = True
accepts = [None]
references = ['[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)', '[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok) (also covers usage with Instagram)']
MEDIA_TYPE_PHOTO = 1
MEDIA_TYPE_VIDEO = 2
def get_items(self, query):
39    def get_items(self, query):
40        """
41        Run custom search
42
43        Not available for Instagram
44        """
45        raise NotImplementedError("Instagram datasets can only be created by importing data from elsewhere")

Run custom search

Not available for Instagram

@staticmethod
def map_item(item):
48    @staticmethod
49    def map_item(item):
50        """
51        Map Instagram item
52
53        Instagram importing is a little bit roundabout since we can expect
54        input in two separate and not completely overlapping formats - an "edge
55        list" or an "item list", and posts are structured differently between
56        those, and do not contain the same data. So we find a middle ground
57        here... each format has its own handler function
58
59        :param dict item:  Item to map
60        :return:  Mapped item
61        """
62        link = item.get("link", "")
63        if (item.get("product_type", "") == "ad") or \
64                (link and link.startswith("https://www.facebook.com/ads/ig_redirect")):
65            # These are ads
66            raise MapItemException("appears to be Instagram ad, check raw data to confirm and ensure Zeeschuimer is up to date.")
67
68        is_graph_response = "__typename" in item and item["__typename"] not in ("XDTMediaDict",)
69
70        if is_graph_response:
71            return MappedItem(SearchInstagram.parse_graph_item(item))
72        else:
73            return MappedItem(SearchInstagram.parse_itemlist_item(item))

Map Instagram item

Instagram importing is a little bit roundabout, since input can arrive in two separate and not completely overlapping formats: an "edge list" (graph) format or an "item list" format. Posts are structured differently in each format and do not contain the same data, so the mapping settles on a middle ground; each format has its own handler function.

Parameters
  • dict item: Item to map
Returns

Mapped item

@staticmethod
def parse_graph_item(node):
 75    @staticmethod
 76    def parse_graph_item(node):
 77        """
 78        Parse Instagram post in Graph format
 79
 80        2025-6-5: potentially legacy format
 81
 82        :param node:  Data as received from Instagram
 83        :return dict:  Mapped item
 84        """
 85        try:
 86            caption = node["edge_media_to_caption"]["edges"][0]["node"]["text"]
 87        except IndexError:
 88            caption = MissingMappedField("")
 89
 90        num_media = 1 if node["__typename"] != "GraphSidecar" else len(node["edge_sidecar_to_children"]["edges"])
 91
 92        # get media url
 93        # for carousels, get the first media item, for videos, get the video
 94        # url, for photos, get the highest resolution
 95        if node["__typename"] == "GraphSidecar":
 96            media_node = node["edge_sidecar_to_children"]["edges"][0]["node"]
 97        else:
 98            media_node = node
 99
100        if media_node["__typename"] == "GraphVideo":
101            media_url = media_node["video_url"]
102        elif media_node["__typename"] == "GraphImage":
103            resources = media_node.get("display_resources", media_node.get("thumbnail_resources"))
104            try:
105                media_url = resources.pop()["src"]
106            except AttributeError:
107                media_url = media_node.get("display_url", "")
108        else:
109            media_url = media_node["display_url"]
110
111        # type, 'mixed' means carousel with video and photo
112        type_map = {"GraphSidecar": "photo", "GraphVideo": "video"}
113        if node["__typename"] != "GraphSidecar":
114            media_type = type_map.get(node["__typename"], "unknown")
115        else:
116            media_types = set([s["node"]["__typename"] for s in node["edge_sidecar_to_children"]["edges"]])
117            media_type = "mixed" if len(media_types) > 1 else type_map.get(media_types.pop(), "unknown")
118
119        location = {"name": "", "latlong": "", "city": "", "location_id": ""}
120        # location has 'id', 'has_public_page', 'name', and 'slug' keys in tested examples; no lat long or "city" though name seems
121        if node.get("location"):
122            location["name"] = node["location"].get("name")
123            location["location_id"] = node["location"].get("pk")
124            # Leaving this though it does not appear to be used in this type; maybe we'll be surprised in the future...
125            location["latlong"] = str(node["location"]["lat"]) + "," + str(node["location"]["lng"]) if node[
126                "location"].get("lat") else ""
127            location["city"] = node["location"].get("city")
128
129        no_likes = bool(node.get("like_and_view_counts_disabled"))
130
131        user = node.get("user")
132        owner = node.get("owner")
133        if node.get("user") and node.get("owner"):
134            if user.get("username") != owner.get("username"):
135                raise MapItemException("Unable to parse item: different user and owner")
136
137        mapped_item = {
138            # Post data
139            "id": node["shortcode"],
140            "post_source_domain": node.get("__import_meta", {}).get("source_platform_url"),  # Zeeschuimer metadata
141            "timestamp": datetime.datetime.fromtimestamp(node["taken_at_timestamp"]).strftime("%Y-%m-%d %H:%M:%S"),
142            "thread_id": node["shortcode"],
143            "parent_id": node["shortcode"],
144            "url": "https://www.instagram.com/p/" + node["shortcode"],
145            "body": caption,
146
147
148            # Author data
149            "author": user.get("username", owner.get("username", MissingMappedField(""))),
150            "author_fullname": user.get("full_name", owner.get("full_name", MissingMappedField(""))),
151            "is_verified": True if user.get("is_verified") else False,
152            "author_avatar_url": user.get("profile_pic_url", owner.get("profile_pic_url", MissingMappedField(""))),
153            # Unable to find graph type posts to test
154            "coauthors": MissingMappedField(""),
155            "coauthor_fullnames": MissingMappedField(""),
156            "coauthor_ids": MissingMappedField(""),
157
158            # Media
159            "media_type": media_type,
160            "num_media": num_media,
161            "image_urls": node["display_url"],
162            "media_urls": media_url,
163
164            # Engagement
165            "hashtags": ",".join(re.findall(r"#([^\s!@#$%^&*()_+{}:\"|<>?\[\];'\,./`~]+)", caption)),
166            # Unsure if usertags will work; need data (this could raise it to attention...)
167            "usertags": ",".join(
168                [u["node"]["user"]["username"] for u in node["edge_media_to_tagged_user"]["edges"]]),
169            "likes_hidden": "yes" if no_likes else "no",
170            "num_likes": node["edge_media_preview_like"]["count"] if not no_likes else MissingMappedField(0),
171            "num_comments": node.get("edge_media_preview_comment", {}).get("count", 0),
172
173            # Location data
174            "location_name": location["name"],
175            "location_id": location["location_id"],
176            "location_latlong": location["latlong"],
177            "location_city": location["city"],
178
179            # Metadata
180            "unix_timestamp": node["taken_at_timestamp"],
181            "missing_media": None
182        }
183
184        return mapped_item

Parse Instagram post in Graph format

2025-6-5: potentially legacy format

Parameters
  • node: Data as received from Instagram
Returns

Mapped item

@staticmethod
def parse_itemlist_item(node):
186    @staticmethod
187    def parse_itemlist_item(node):
188        """
189        Parse Instagram post in 'item list' format
190
191        :param node:  Data as received from Instagram
192        :return dict:  Mapped item
193        """
194        num_media = 1 if node["media_type"] != SearchInstagram.MEDIA_TYPE_CAROUSEL else len(node["carousel_media"])
195        caption = MissingMappedField("") if "caption" not in node else "" if not node.get("caption") else node["caption"]["text"]
196
197        # get media urls
198        display_urls = []
199        media_urls = []
200        missing_media = None
201        type_map = {SearchInstagram.MEDIA_TYPE_PHOTO: "photo", SearchInstagram.MEDIA_TYPE_VIDEO: "video"}
202        media_types = set()
203        # for carousels, get the first media item, for videos, get the video
204        # url, for photos, get the highest resolution
205        if node["media_type"] == SearchInstagram.MEDIA_TYPE_CAROUSEL:
206            media_nodes = node["carousel_media"]
207        else:
208            media_nodes = [node]
209
210        for media_node in media_nodes:
211            if media_node["media_type"] == SearchInstagram.MEDIA_TYPE_VIDEO:
212                # Videos
213                media_urls.append(media_node["video_versions"][0]["url"])
214                if "image_versions2" in media_node:
215                    display_urls.append(media_node["image_versions2"]["candidates"][0]["url"])
216                else:
217                    # no image links at all :-/
218                    # video is all we have
219                    display_urls.append(media_node["video_versions"][0]["url"])
220
221            elif media_node["media_type"] == SearchInstagram.MEDIA_TYPE_PHOTO and media_node.get("image_versions2"):
222                # Images
223                media_url = media_node["image_versions2"]["candidates"][0]["url"]
224                display_urls.append(media_url)
225                media_urls.append(media_url)
226            else:
227                missing_media = MissingMappedField("")
228
229            media_types.add(type_map.get(media_node["media_type"], "unknown"))
230
231        # type, 'mixed' means carousel with video and photo
232        media_type = "mixed" if len(media_types) > 1 else media_types.pop()
233
234        if "comment_count" in node:
235            num_comments = node["comment_count"]
236        elif "comments" in node and type(node["comments"]) is list:
237            num_comments = len(node["comments"])
238        else:
239            num_comments = -1
240
241        location = {"name": "", "latlong": "", "city": "", "location_id": ""}
242        if node.get("location"):
243            location["name"] = node["location"].get("name")
244            location["location_id"] = node["location"].get("pk")
245            location["latlong"] = str(node["location"]["lat"]) + "," + str(node["location"]["lng"]) if node[
246                "location"].get("lat") else ""
247            location["city"] = node["location"].get("city")
248
249        user = node.get("user", {})
250        owner = node.get("owner", {})
251        if user and owner:
252            if user.get("username") != owner.get("username"):
253                raise MapItemException("Unable to parse item: different user and owner")
254
255        # Instagram posts also allow 'Collabs' with up to one co-author
256        coauthors = []
257        coauthor_fullnames = []
258        coauthor_ids = []
259        if node.get("coauthor_producers"):
260            for coauthor_node in node["coauthor_producers"]:
261                coauthors.append(coauthor_node.get("username"))
262                coauthor_fullnames.append(coauthor_node.get("full_name"))
263                coauthor_ids.append(coauthor_node.get("id"))
264
265        no_likes = bool(node.get("like_and_view_counts_disabled"))
266
267        # usertags
268        if "usertags" in node:
269            usertags = ",".join([user["user"]["username"] for user in node["usertags"]["in"]]) if node["usertags"] else ""
270        else:
271            # Not always included; MissingMappedField may be more appropriate, but it flags virtually all posts without tags (some do return `None`)
272            usertags = ""
273
274        mapped_item = {
275            # Post and caption
276            "id": node["code"],
277            "post_source_domain": node.get("__import_meta", {}).get("source_platform_url"), # Zeeschuimer metadata
278            "timestamp": datetime.datetime.fromtimestamp(node["taken_at"]).strftime("%Y-%m-%d %H:%M:%S"),
279            "thread_id": node["code"],
280            "parent_id": node["code"],
281            "url": "https://www.instagram.com/p/" + node["code"],
282            "body": caption,
283
284            # Authors
285            "author": user.get("username", owner.get("username", MissingMappedField(""))),
286            "author_fullname": user.get("full_name", owner.get("full_name", MissingMappedField(""))),
287            "verified": True if user.get("is_verified") else False,
288            "author_avatar_url": user.get("profile_pic_url", owner.get("profile_pic_url", MissingMappedField(""))),
289            "coauthors": ",".join(coauthors),
290            "coauthor_fullnames": ",".join(coauthor_fullnames),
291            "coauthor_ids": ",".join(coauthor_ids),
292
293            # Media
294            "media_type": media_type,
295            "num_media": num_media,
296            "image_urls": ",".join(display_urls),
297            "media_urls": ",".join(media_urls),
298
299            # Engagement
300            "hashtags": ",".join(re.findall(r"#([^\s!@#$%ˆ&*()_+{}:\"|<>?\[\];'\,./`~'‘’]+)", caption)),
301            "usertags": usertags,
302            "likes_hidden": "yes" if no_likes else "no",
303            "num_likes": node["like_count"] if not no_likes else MissingMappedField(0),
304            "num_comments": num_comments,
305
306            # Location
307            "location_name": location["name"],
308            "location_id": location["location_id"],
309            "location_latlong": location["latlong"],
310            "location_city": location["city"],
311
312            # Metadata
313            "unix_timestamp": node["taken_at"],
314            "missing_media": missing_media, # This denotes media that is unable to be mapped and is otherwise None
315        }
316
317        return mapped_item

Parse Instagram post in 'item list' format

Parameters
  • node: Data as received from Instagram
Returns

Mapped item