Edit on GitHub

datasources.instagram.search_instagram

Import scraped Instagram data

It's prohibitively difficult to scrape data from Instagram within 4CAT itself due to its aggressive rate limiting. Instead, import data collected elsewhere.

  1"""
  2Import scraped Instagram data
  3
  4It's prohibitively difficult to scrape data from Instagram within 4CAT itself
  5due to its aggressive rate limiting. Instead, import data collected elsewhere.
  6"""
  7import datetime
  8import re
  9
 10from backend.lib.search import Search
 11from common.lib.item_mapping import MappedItem, MissingMappedField
 12from common.lib.exceptions import WorkerInterruptedException, MapItemException
 13
 14
 15class SearchInstagram(Search):
 16    """
 17    Import scraped Instagram data
 18    """
 19    type = "instagram-search"  # job ID
 20    category = "Search"  # category
 21    title = "Import scraped Instagram data"  # title displayed in UI
 22    description = "Import Instagram data collected with an external tool such as Zeeschuimer."  # description displayed in UI
 23    extension = "ndjson"  # extension of result file, used internally and in UI
 24    is_from_zeeschuimer = True
 25
 26    # not available as a processor for existing datasets
 27    accepts = [None]
 28    references = [
 29        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
 30        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok) (also covers usage with Instagram)"
 31    ]
 32
 33    # some magic numbers instagram uses
 34    MEDIA_TYPE_PHOTO = 1
 35    MEDIA_TYPE_VIDEO = 2
 36    MEDIA_TYPE_CAROUSEL = 8
 37
 38    def get_items(self, query):
 39        """
 40        Run custom search
 41
 42        Not available for Instagram
 43        """
 44        raise NotImplementedError("Instagram datasets can only be created by importing data from elsewhere")
 45
 46
 47    @staticmethod
 48    def map_item(item):
 49        """
 50        Map Instagram item
 51
 52        Instagram importing is a little bit roundabout since we can expect
 53        input in two separate and not completely overlapping formats - an "edge
 54        list" or an "item list", and posts are structured differently between
 55        those, and do not contain the same data. So we find a middle ground
 56        here... each format has its own handler function
 57
 58        :param dict item:  Item to map
 59        :return:  Mapped item
 60        """
 61        link = item.get("link", "")
 62        if (item.get("product_type", "") == "ad") or \
 63                (link and link.startswith("https://www.facebook.com/ads/ig_redirect")):
 64            # These are ads
 65            raise MapItemException("appears to be Instagram ad, check raw data to confirm and ensure Zeeschuimer is up to date.")
 66
 67        is_graph_response = "__typename" in item and item["__typename"] not in ("XDTMediaDict",)
 68
 69        if is_graph_response:
 70            return MappedItem(SearchInstagram.parse_graph_item(item))
 71        else:
 72            return MappedItem(SearchInstagram.parse_itemlist_item(item))
 73
 74    @staticmethod
 75    def parse_graph_item(node):
 76        """
 77        Parse Instagram post in Graph format
 78
 79        :param node:  Data as received from Instagram
 80        :return dict:  Mapped item
 81        """
 82        try:
 83            caption = node["edge_media_to_caption"]["edges"][0]["node"]["text"]
 84        except IndexError:
 85            caption = MissingMappedField("")
 86
 87        num_media = 1 if node["__typename"] != "GraphSidecar" else len(node["edge_sidecar_to_children"]["edges"])
 88
 89        # get media url
 90        # for carousels, get the first media item, for videos, get the video
 91        # url, for photos, get the highest resolution
 92        if node["__typename"] == "GraphSidecar":
 93            media_node = node["edge_sidecar_to_children"]["edges"][0]["node"]
 94        else:
 95            media_node = node
 96
 97        if media_node["__typename"] == "GraphVideo":
 98            media_url = media_node["video_url"]
 99        elif media_node["__typename"] == "GraphImage":
100            resources = media_node.get("display_resources", media_node.get("thumbnail_resources"))
101            try:
102                media_url = resources.pop()["src"]
103            except AttributeError:
104                media_url = media_node.get("display_url", "")
105        else:
106            media_url = media_node["display_url"]
107
108        # type, 'mixed' means carousel with video and photo
109        type_map = {"GraphSidecar": "photo", "GraphVideo": "video"}
110        if node["__typename"] != "GraphSidecar":
111            media_type = type_map.get(node["__typename"], "unknown")
112        else:
113            media_types = set([s["node"]["__typename"] for s in node["edge_sidecar_to_children"]["edges"]])
114            media_type = "mixed" if len(media_types) > 1 else type_map.get(media_types.pop(), "unknown")
115
116        location = {"name": "", "latlong": "", "city": ""}
117        # location has 'id', 'has_public_page', 'name', and 'slug' keys in tested examples; no lat long or "city" though name seems
118        if node.get("location"):
119            location["name"] = node["location"].get("name")
120            # Leaving this though it does not appear to be used in this type; maybe we'll be surprised in the future...
121            location["latlong"] = str(node["location"]["lat"]) + "," + str(node["location"]["lng"]) if node[
122                "location"].get("lat") else ""
123            location["city"] = node["location"].get("city")
124
125        no_likes = bool(node.get("like_and_view_counts_disabled"))
126
127        user = node.get("user")
128        owner = node.get("owner")
129        if node.get("user") and node.get("owner"):
130            if user.get("username") != owner.get("username"):
131                raise MapItemException("Unable to parse item: different user and owner")
132
133        mapped_item = {
134            "id": node["shortcode"],
135            "post_source_domain": node.get("__import_meta", {}).get("source_platform_url"),  # Zeeschuimer metadata
136            "thread_id": node["shortcode"],
137            "parent_id": node["shortcode"],
138            "body": caption,
139            "timestamp": datetime.datetime.fromtimestamp(node["taken_at_timestamp"]).strftime("%Y-%m-%d %H:%M:%S"),
140            "author": user.get("username", owner.get("username", MissingMappedField(""))),
141            "author_fullname": user.get("full_name", owner.get("full_name", MissingMappedField(""))),
142            "author_avatar_url": user.get("profile_pic_url", owner.get("profile_pic_url", MissingMappedField(""))),
143            "type": media_type,
144            "url": "https://www.instagram.com/p/" + node["shortcode"],
145            "image_url": node["display_url"],
146            "media_url": media_url,
147            "hashtags": ",".join(re.findall(r"#([^\s!@#$%ˆ&*()_+{}:\"|<>?\[\];'\,./`~']+)", caption)),
148            # "usertags": ",".join(
149            #     [u["node"]["user"]["username"] for u in node["edge_media_to_tagged_user"]["edges"]]),
150            "likes_hidden": "yes" if no_likes else "no",
151            "num_likes": node["edge_media_preview_like"]["count"] if not no_likes else MissingMappedField(0),
152            "num_comments": node.get("edge_media_preview_comment", {}).get("count", 0),
153            "num_media": num_media,
154            "location_name": location["name"],
155            "location_latlong": location["latlong"],
156            "location_city": location["city"],
157            "unix_timestamp": node["taken_at_timestamp"]
158        }
159
160        return mapped_item
161
162    @staticmethod
163    def parse_itemlist_item(node):
164        """
165        Parse Instagram post in 'item list' format
166
167        :param node:  Data as received from Instagram
168        :return dict:  Mapped item
169        """
170        num_media = 1 if node["media_type"] != SearchInstagram.MEDIA_TYPE_CAROUSEL else len(node["carousel_media"])
171        caption = MissingMappedField("") if not "caption" in node else "" if not node.get("caption") else node["caption"]["text"]
172
173        # get media url
174        # for carousels, get the first media item, for videos, get the video
175        # url, for photos, get the highest resolution
176        if node["media_type"] == SearchInstagram.MEDIA_TYPE_CAROUSEL:
177            media_node = node["carousel_media"][0]
178        else:
179            media_node = node
180
181        if media_node["media_type"] == SearchInstagram.MEDIA_TYPE_VIDEO:
182            media_url = media_node["video_versions"][0]["url"]
183            if "image_versions2" in media_node:
184                display_url = media_node["image_versions2"]["candidates"][0]["url"]
185            else:
186                # no image links at all :-/
187                # video is all we have
188                display_url = media_node["video_versions"][0]["url"]
189        elif media_node["media_type"] == SearchInstagram.MEDIA_TYPE_PHOTO and media_node.get("image_versions2"):
190            media_url = media_node["image_versions2"]["candidates"][0]["url"]
191            display_url = media_url
192        else:
193            media_url = MissingMappedField("")
194            display_url = MissingMappedField("")
195
196        # type, 'mixed' means carousel with video and photo
197        type_map = {SearchInstagram.MEDIA_TYPE_PHOTO: "photo", SearchInstagram.MEDIA_TYPE_VIDEO: "video"}
198        if node["media_type"] != SearchInstagram.MEDIA_TYPE_CAROUSEL:
199            media_type = type_map.get(node["media_type"], "unknown")
200        else:
201            media_types = set([s["media_type"] for s in node["carousel_media"]])
202            media_type = "mixed" if len(media_types) > 1 else type_map.get(media_types.pop(), "unknown")
203
204        if "comment_count" in node:
205            num_comments = node["comment_count"]
206        elif "comments" in node and type(node["comments"]) is list:
207            num_comments = len(node["comments"])
208        else:
209            num_comments = -1
210
211        location = {"name": "", "latlong": "", "city": ""}
212        if node.get("location"):
213            location["name"] = node["location"].get("name")
214            location["latlong"] = str(node["location"]["lat"]) + "," + str(node["location"]["lng"]) if node[
215                "location"].get("lat") else ""
216            location["city"] = node["location"].get("city")
217
218        user = node.get("user", {})
219        owner = node.get("owner", {})
220        if user and owner:
221            if user.get("username") != owner.get("username"):
222                raise MapItemException("Unable to parse item: different user and owner")
223
224        no_likes = bool(node.get("like_and_view_counts_disabled"))
225
226        mapped_item = {
227            "id": node["code"],
228            "post_source_domain": node.get("__import_meta", {}).get("source_platform_url"), # Zeeschuimer metadata
229            "thread_id": node["code"],
230            "parent_id": node["code"],
231            "body": caption,
232            "author": user.get("username", owner.get("username", MissingMappedField(""))),
233            "author_fullname": user.get("full_name", owner.get("full_name", MissingMappedField(""))),
234            "author_avatar_url": user.get("profile_pic_url", owner.get("profile_pic_url", MissingMappedField(""))),
235            "timestamp": datetime.datetime.fromtimestamp(node["taken_at"]).strftime("%Y-%m-%d %H:%M:%S"),
236            "type": media_type,
237            "url": "https://www.instagram.com/p/" + node["code"],
238            "image_url": display_url,
239            "media_url": media_url,
240            "hashtags": ",".join(re.findall(r"#([^\s!@#$%ˆ&*()_+{}:\"|<>?\[\];'\,./`~']+)", caption)),
241            # "usertags": ",".join(
242            #     [u["node"]["user"]["username"] for u in node["edge_media_to_tagged_user"]["edges"]]),
243            "likes_hidden": "yes" if no_likes else "no",
244            "num_likes": node["like_count"] if not no_likes else MissingMappedField(0),
245            "num_comments": num_comments,
246            "num_media": num_media,
247            "location_name": location["name"],
248            "location_latlong": location["latlong"],
249            "location_city": location["city"],
250            "unix_timestamp": node["taken_at"]
251        }
252
253        return mapped_item
class SearchInstagram(backend.lib.search.Search):
 16class SearchInstagram(Search):
 17    """
 18    Import scraped Instagram data
 19    """
 20    type = "instagram-search"  # job ID
 21    category = "Search"  # category
 22    title = "Import scraped Instagram data"  # title displayed in UI
 23    description = "Import Instagram data collected with an external tool such as Zeeschuimer."  # description displayed in UI
 24    extension = "ndjson"  # extension of result file, used internally and in UI
 25    is_from_zeeschuimer = True
 26
 27    # not available as a processor for existing datasets
 28    accepts = [None]
 29    references = [
 30        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
 31        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok) (also covers usage with Instagram)"
 32    ]
 33
 34    # some magic numbers instagram uses
 35    MEDIA_TYPE_PHOTO = 1
 36    MEDIA_TYPE_VIDEO = 2
 37    MEDIA_TYPE_CAROUSEL = 8
 38
 39    def get_items(self, query):
 40        """
 41        Run custom search
 42
 43        Not available for Instagram
 44        """
 45        raise NotImplementedError("Instagram datasets can only be created by importing data from elsewhere")
 46
 47
 48    @staticmethod
 49    def map_item(item):
 50        """
 51        Map Instagram item
 52
 53        Instagram importing is a little bit roundabout since we can expect
 54        input in two separate and not completely overlapping formats - an "edge
 55        list" or an "item list", and posts are structured differently between
 56        those, and do not contain the same data. So we find a middle ground
 57        here... each format has its own handler function
 58
 59        :param dict item:  Item to map
 60        :return:  Mapped item
 61        """
 62        link = item.get("link", "")
 63        if (item.get("product_type", "") == "ad") or \
 64                (link and link.startswith("https://www.facebook.com/ads/ig_redirect")):
 65            # These are ads
 66            raise MapItemException("appears to be Instagram ad, check raw data to confirm and ensure Zeeschuimer is up to date.")
 67
 68        is_graph_response = "__typename" in item and item["__typename"] not in ("XDTMediaDict",)
 69
 70        if is_graph_response:
 71            return MappedItem(SearchInstagram.parse_graph_item(item))
 72        else:
 73            return MappedItem(SearchInstagram.parse_itemlist_item(item))
 74
 75    @staticmethod
 76    def parse_graph_item(node):
 77        """
 78        Parse Instagram post in Graph format
 79
 80        :param node:  Data as received from Instagram
 81        :return dict:  Mapped item
 82        """
 83        try:
 84            caption = node["edge_media_to_caption"]["edges"][0]["node"]["text"]
 85        except IndexError:
 86            caption = MissingMappedField("")
 87
 88        num_media = 1 if node["__typename"] != "GraphSidecar" else len(node["edge_sidecar_to_children"]["edges"])
 89
 90        # get media url
 91        # for carousels, get the first media item, for videos, get the video
 92        # url, for photos, get the highest resolution
 93        if node["__typename"] == "GraphSidecar":
 94            media_node = node["edge_sidecar_to_children"]["edges"][0]["node"]
 95        else:
 96            media_node = node
 97
 98        if media_node["__typename"] == "GraphVideo":
 99            media_url = media_node["video_url"]
100        elif media_node["__typename"] == "GraphImage":
101            resources = media_node.get("display_resources", media_node.get("thumbnail_resources"))
102            try:
103                media_url = resources.pop()["src"]
104            except AttributeError:
105                media_url = media_node.get("display_url", "")
106        else:
107            media_url = media_node["display_url"]
108
109        # type, 'mixed' means carousel with video and photo
110        type_map = {"GraphSidecar": "photo", "GraphVideo": "video"}
111        if node["__typename"] != "GraphSidecar":
112            media_type = type_map.get(node["__typename"], "unknown")
113        else:
114            media_types = set([s["node"]["__typename"] for s in node["edge_sidecar_to_children"]["edges"]])
115            media_type = "mixed" if len(media_types) > 1 else type_map.get(media_types.pop(), "unknown")
116
117        location = {"name": "", "latlong": "", "city": ""}
118        # location has 'id', 'has_public_page', 'name', and 'slug' keys in tested examples; no lat long or "city" though name seems
119        if node.get("location"):
120            location["name"] = node["location"].get("name")
121            # Leaving this though it does not appear to be used in this type; maybe we'll be surprised in the future...
122            location["latlong"] = str(node["location"]["lat"]) + "," + str(node["location"]["lng"]) if node[
123                "location"].get("lat") else ""
124            location["city"] = node["location"].get("city")
125
126        no_likes = bool(node.get("like_and_view_counts_disabled"))
127
128        user = node.get("user")
129        owner = node.get("owner")
130        if node.get("user") and node.get("owner"):
131            if user.get("username") != owner.get("username"):
132                raise MapItemException("Unable to parse item: different user and owner")
133
134        mapped_item = {
135            "id": node["shortcode"],
136            "post_source_domain": node.get("__import_meta", {}).get("source_platform_url"),  # Zeeschuimer metadata
137            "thread_id": node["shortcode"],
138            "parent_id": node["shortcode"],
139            "body": caption,
140            "timestamp": datetime.datetime.fromtimestamp(node["taken_at_timestamp"]).strftime("%Y-%m-%d %H:%M:%S"),
141            "author": user.get("username", owner.get("username", MissingMappedField(""))),
142            "author_fullname": user.get("full_name", owner.get("full_name", MissingMappedField(""))),
143            "author_avatar_url": user.get("profile_pic_url", owner.get("profile_pic_url", MissingMappedField(""))),
144            "type": media_type,
145            "url": "https://www.instagram.com/p/" + node["shortcode"],
146            "image_url": node["display_url"],
147            "media_url": media_url,
148            "hashtags": ",".join(re.findall(r"#([^\s!@#$%ˆ&*()_+{}:\"|<>?\[\];'\,./`~']+)", caption)),
149            # "usertags": ",".join(
150            #     [u["node"]["user"]["username"] for u in node["edge_media_to_tagged_user"]["edges"]]),
151            "likes_hidden": "yes" if no_likes else "no",
152            "num_likes": node["edge_media_preview_like"]["count"] if not no_likes else MissingMappedField(0),
153            "num_comments": node.get("edge_media_preview_comment", {}).get("count", 0),
154            "num_media": num_media,
155            "location_name": location["name"],
156            "location_latlong": location["latlong"],
157            "location_city": location["city"],
158            "unix_timestamp": node["taken_at_timestamp"]
159        }
160
161        return mapped_item
162
163    @staticmethod
164    def parse_itemlist_item(node):
165        """
166        Parse Instagram post in 'item list' format
167
168        :param node:  Data as received from Instagram
169        :return dict:  Mapped item
170        """
171        num_media = 1 if node["media_type"] != SearchInstagram.MEDIA_TYPE_CAROUSEL else len(node["carousel_media"])
172        caption = MissingMappedField("") if not "caption" in node else "" if not node.get("caption") else node["caption"]["text"]
173
174        # get media url
175        # for carousels, get the first media item, for videos, get the video
176        # url, for photos, get the highest resolution
177        if node["media_type"] == SearchInstagram.MEDIA_TYPE_CAROUSEL:
178            media_node = node["carousel_media"][0]
179        else:
180            media_node = node
181
182        if media_node["media_type"] == SearchInstagram.MEDIA_TYPE_VIDEO:
183            media_url = media_node["video_versions"][0]["url"]
184            if "image_versions2" in media_node:
185                display_url = media_node["image_versions2"]["candidates"][0]["url"]
186            else:
187                # no image links at all :-/
188                # video is all we have
189                display_url = media_node["video_versions"][0]["url"]
190        elif media_node["media_type"] == SearchInstagram.MEDIA_TYPE_PHOTO and media_node.get("image_versions2"):
191            media_url = media_node["image_versions2"]["candidates"][0]["url"]
192            display_url = media_url
193        else:
194            media_url = MissingMappedField("")
195            display_url = MissingMappedField("")
196
197        # type, 'mixed' means carousel with video and photo
198        type_map = {SearchInstagram.MEDIA_TYPE_PHOTO: "photo", SearchInstagram.MEDIA_TYPE_VIDEO: "video"}
199        if node["media_type"] != SearchInstagram.MEDIA_TYPE_CAROUSEL:
200            media_type = type_map.get(node["media_type"], "unknown")
201        else:
202            media_types = set([s["media_type"] for s in node["carousel_media"]])
203            media_type = "mixed" if len(media_types) > 1 else type_map.get(media_types.pop(), "unknown")
204
205        if "comment_count" in node:
206            num_comments = node["comment_count"]
207        elif "comments" in node and type(node["comments"]) is list:
208            num_comments = len(node["comments"])
209        else:
210            num_comments = -1
211
212        location = {"name": "", "latlong": "", "city": ""}
213        if node.get("location"):
214            location["name"] = node["location"].get("name")
215            location["latlong"] = str(node["location"]["lat"]) + "," + str(node["location"]["lng"]) if node[
216                "location"].get("lat") else ""
217            location["city"] = node["location"].get("city")
218
219        user = node.get("user", {})
220        owner = node.get("owner", {})
221        if user and owner:
222            if user.get("username") != owner.get("username"):
223                raise MapItemException("Unable to parse item: different user and owner")
224
225        no_likes = bool(node.get("like_and_view_counts_disabled"))
226
227        mapped_item = {
228            "id": node["code"],
229            "post_source_domain": node.get("__import_meta", {}).get("source_platform_url"), # Zeeschuimer metadata
230            "thread_id": node["code"],
231            "parent_id": node["code"],
232            "body": caption,
233            "author": user.get("username", owner.get("username", MissingMappedField(""))),
234            "author_fullname": user.get("full_name", owner.get("full_name", MissingMappedField(""))),
235            "author_avatar_url": user.get("profile_pic_url", owner.get("profile_pic_url", MissingMappedField(""))),
236            "timestamp": datetime.datetime.fromtimestamp(node["taken_at"]).strftime("%Y-%m-%d %H:%M:%S"),
237            "type": media_type,
238            "url": "https://www.instagram.com/p/" + node["code"],
239            "image_url": display_url,
240            "media_url": media_url,
241            "hashtags": ",".join(re.findall(r"#([^\s!@#$%ˆ&*()_+{}:\"|<>?\[\];'\,./`~']+)", caption)),
242            # "usertags": ",".join(
243            #     [u["node"]["user"]["username"] for u in node["edge_media_to_tagged_user"]["edges"]]),
244            "likes_hidden": "yes" if no_likes else "no",
245            "num_likes": node["like_count"] if not no_likes else MissingMappedField(0),
246            "num_comments": num_comments,
247            "num_media": num_media,
248            "location_name": location["name"],
249            "location_latlong": location["latlong"],
250            "location_city": location["city"],
251            "unix_timestamp": node["taken_at"]
252        }
253
254        return mapped_item

Import scraped Instagram data

type = 'instagram-search'
category = 'Search'
title = 'Import scraped Instagram data'
description = 'Import Instagram data collected with an external tool such as Zeeschuimer.'
extension = 'ndjson'
is_from_zeeschuimer = True
accepts = [None]
references = ['[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)', '[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok) (also covers usage with Instagram)']
MEDIA_TYPE_PHOTO = 1
MEDIA_TYPE_VIDEO = 2
MEDIA_TYPE_CAROUSEL = 8
def get_items(self, query):
39    def get_items(self, query):
40        """
41        Run custom search
42
43        Not available for Instagram
44        """
45        raise NotImplementedError("Instagram datasets can only be created by importing data from elsewhere")

Run custom search

Not available for Instagram

@staticmethod
def map_item(item):
48    @staticmethod
49    def map_item(item):
50        """
51        Map Instagram item
52
53        Instagram importing is a little bit roundabout since we can expect
54        input in two separate and not completely overlapping formats - an "edge
55        list" or an "item list", and posts are structured differently between
56        those, and do not contain the same data. So we find a middle ground
57        here... each format has its own handler function
58
59        :param dict item:  Item to map
60        :return:  Mapped item
61        """
62        link = item.get("link", "")
63        if (item.get("product_type", "") == "ad") or \
64                (link and link.startswith("https://www.facebook.com/ads/ig_redirect")):
65            # These are ads
66            raise MapItemException("appears to be Instagram ad, check raw data to confirm and ensure Zeeschuimer is up to date.")
67
68        is_graph_response = "__typename" in item and item["__typename"] not in ("XDTMediaDict",)
69
70        if is_graph_response:
71            return MappedItem(SearchInstagram.parse_graph_item(item))
72        else:
73            return MappedItem(SearchInstagram.parse_itemlist_item(item))

Map Instagram item

Instagram importing is a little bit roundabout since we can expect input in two separate and not completely overlapping formats - an "edge list" or an "item list", and posts are structured differently between those, and do not contain the same data. So we find a middle ground here... each format has its own handler function

Parameters
  • dict item: Item to map
Returns

Mapped item

@staticmethod
def parse_graph_item(node):
 75    @staticmethod
 76    def parse_graph_item(node):
 77        """
 78        Parse Instagram post in Graph format
 79
 80        :param node:  Data as received from Instagram
 81        :return dict:  Mapped item
 82        """
 83        try:
 84            caption = node["edge_media_to_caption"]["edges"][0]["node"]["text"]
 85        except IndexError:
 86            caption = MissingMappedField("")
 87
 88        num_media = 1 if node["__typename"] != "GraphSidecar" else len(node["edge_sidecar_to_children"]["edges"])
 89
 90        # get media url
 91        # for carousels, get the first media item, for videos, get the video
 92        # url, for photos, get the highest resolution
 93        if node["__typename"] == "GraphSidecar":
 94            media_node = node["edge_sidecar_to_children"]["edges"][0]["node"]
 95        else:
 96            media_node = node
 97
 98        if media_node["__typename"] == "GraphVideo":
 99            media_url = media_node["video_url"]
100        elif media_node["__typename"] == "GraphImage":
101            resources = media_node.get("display_resources", media_node.get("thumbnail_resources"))
102            try:
103                media_url = resources.pop()["src"]
104            except AttributeError:
105                media_url = media_node.get("display_url", "")
106        else:
107            media_url = media_node["display_url"]
108
109        # type, 'mixed' means carousel with video and photo
110        type_map = {"GraphSidecar": "photo", "GraphVideo": "video"}
111        if node["__typename"] != "GraphSidecar":
112            media_type = type_map.get(node["__typename"], "unknown")
113        else:
114            media_types = set([s["node"]["__typename"] for s in node["edge_sidecar_to_children"]["edges"]])
115            media_type = "mixed" if len(media_types) > 1 else type_map.get(media_types.pop(), "unknown")
116
117        location = {"name": "", "latlong": "", "city": ""}
118        # location has 'id', 'has_public_page', 'name', and 'slug' keys in tested examples; no lat long or "city" though name seems
119        if node.get("location"):
120            location["name"] = node["location"].get("name")
121            # Leaving this though it does not appear to be used in this type; maybe we'll be surprised in the future...
122            location["latlong"] = str(node["location"]["lat"]) + "," + str(node["location"]["lng"]) if node[
123                "location"].get("lat") else ""
124            location["city"] = node["location"].get("city")
125
126        no_likes = bool(node.get("like_and_view_counts_disabled"))
127
128        user = node.get("user")
129        owner = node.get("owner")
130        if node.get("user") and node.get("owner"):
131            if user.get("username") != owner.get("username"):
132                raise MapItemException("Unable to parse item: different user and owner")
133
134        mapped_item = {
135            "id": node["shortcode"],
136            "post_source_domain": node.get("__import_meta", {}).get("source_platform_url"),  # Zeeschuimer metadata
137            "thread_id": node["shortcode"],
138            "parent_id": node["shortcode"],
139            "body": caption,
140            "timestamp": datetime.datetime.fromtimestamp(node["taken_at_timestamp"]).strftime("%Y-%m-%d %H:%M:%S"),
141            "author": user.get("username", owner.get("username", MissingMappedField(""))),
142            "author_fullname": user.get("full_name", owner.get("full_name", MissingMappedField(""))),
143            "author_avatar_url": user.get("profile_pic_url", owner.get("profile_pic_url", MissingMappedField(""))),
144            "type": media_type,
145            "url": "https://www.instagram.com/p/" + node["shortcode"],
146            "image_url": node["display_url"],
147            "media_url": media_url,
148            "hashtags": ",".join(re.findall(r"#([^\s!@#$%ˆ&*()_+{}:\"|<>?\[\];'\,./`~']+)", caption)),
149            # "usertags": ",".join(
150            #     [u["node"]["user"]["username"] for u in node["edge_media_to_tagged_user"]["edges"]]),
151            "likes_hidden": "yes" if no_likes else "no",
152            "num_likes": node["edge_media_preview_like"]["count"] if not no_likes else MissingMappedField(0),
153            "num_comments": node.get("edge_media_preview_comment", {}).get("count", 0),
154            "num_media": num_media,
155            "location_name": location["name"],
156            "location_latlong": location["latlong"],
157            "location_city": location["city"],
158            "unix_timestamp": node["taken_at_timestamp"]
159        }
160
161        return mapped_item

Parse Instagram post in Graph format

Parameters
  • node: Data as received from Instagram
Returns

Mapped item

@staticmethod
def parse_itemlist_item(node):
163    @staticmethod
164    def parse_itemlist_item(node):
165        """
166        Parse Instagram post in 'item list' format
167
168        :param node:  Data as received from Instagram
169        :return dict:  Mapped item
170        """
171        num_media = 1 if node["media_type"] != SearchInstagram.MEDIA_TYPE_CAROUSEL else len(node["carousel_media"])
172        caption = MissingMappedField("") if not "caption" in node else "" if not node.get("caption") else node["caption"]["text"]
173
174        # get media url
175        # for carousels, get the first media item, for videos, get the video
176        # url, for photos, get the highest resolution
177        if node["media_type"] == SearchInstagram.MEDIA_TYPE_CAROUSEL:
178            media_node = node["carousel_media"][0]
179        else:
180            media_node = node
181
182        if media_node["media_type"] == SearchInstagram.MEDIA_TYPE_VIDEO:
183            media_url = media_node["video_versions"][0]["url"]
184            if "image_versions2" in media_node:
185                display_url = media_node["image_versions2"]["candidates"][0]["url"]
186            else:
187                # no image links at all :-/
188                # video is all we have
189                display_url = media_node["video_versions"][0]["url"]
190        elif media_node["media_type"] == SearchInstagram.MEDIA_TYPE_PHOTO and media_node.get("image_versions2"):
191            media_url = media_node["image_versions2"]["candidates"][0]["url"]
192            display_url = media_url
193        else:
194            media_url = MissingMappedField("")
195            display_url = MissingMappedField("")
196
197        # type, 'mixed' means carousel with video and photo
198        type_map = {SearchInstagram.MEDIA_TYPE_PHOTO: "photo", SearchInstagram.MEDIA_TYPE_VIDEO: "video"}
199        if node["media_type"] != SearchInstagram.MEDIA_TYPE_CAROUSEL:
200            media_type = type_map.get(node["media_type"], "unknown")
201        else:
202            media_types = set([s["media_type"] for s in node["carousel_media"]])
203            media_type = "mixed" if len(media_types) > 1 else type_map.get(media_types.pop(), "unknown")
204
205        if "comment_count" in node:
206            num_comments = node["comment_count"]
207        elif "comments" in node and type(node["comments"]) is list:
208            num_comments = len(node["comments"])
209        else:
210            num_comments = -1
211
212        location = {"name": "", "latlong": "", "city": ""}
213        if node.get("location"):
214            location["name"] = node["location"].get("name")
215            location["latlong"] = str(node["location"]["lat"]) + "," + str(node["location"]["lng"]) if node[
216                "location"].get("lat") else ""
217            location["city"] = node["location"].get("city")
218
219        user = node.get("user", {})
220        owner = node.get("owner", {})
221        if user and owner:
222            if user.get("username") != owner.get("username"):
223                raise MapItemException("Unable to parse item: different user and owner")
224
225        no_likes = bool(node.get("like_and_view_counts_disabled"))
226
227        mapped_item = {
228            "id": node["code"],
229            "post_source_domain": node.get("__import_meta", {}).get("source_platform_url"), # Zeeschuimer metadata
230            "thread_id": node["code"],
231            "parent_id": node["code"],
232            "body": caption,
233            "author": user.get("username", owner.get("username", MissingMappedField(""))),
234            "author_fullname": user.get("full_name", owner.get("full_name", MissingMappedField(""))),
235            "author_avatar_url": user.get("profile_pic_url", owner.get("profile_pic_url", MissingMappedField(""))),
236            "timestamp": datetime.datetime.fromtimestamp(node["taken_at"]).strftime("%Y-%m-%d %H:%M:%S"),
237            "type": media_type,
238            "url": "https://www.instagram.com/p/" + node["code"],
239            "image_url": display_url,
240            "media_url": media_url,
241            "hashtags": ",".join(re.findall(r"#([^\s!@#$%ˆ&*()_+{}:\"|<>?\[\];'\,./`~']+)", caption)),
242            # "usertags": ",".join(
243            #     [u["node"]["user"]["username"] for u in node["edge_media_to_tagged_user"]["edges"]]),
244            "likes_hidden": "yes" if no_likes else "no",
245            "num_likes": node["like_count"] if not no_likes else MissingMappedField(0),
246            "num_comments": num_comments,
247            "num_media": num_media,
248            "location_name": location["name"],
249            "location_latlong": location["latlong"],
250            "location_city": location["city"],
251            "unix_timestamp": node["taken_at"]
252        }
253
254        return mapped_item

Parse Instagram post in 'item list' format

Parameters
  • node: Data as received from Instagram
Returns

Mapped item