datasources.instagram.search_instagram
Import scraped Instagram data
It's prohibitively difficult to scrape data from Instagram within 4CAT itself due to its aggressive rate limiting. Instead, import data collected elsewhere.
"""
Import scraped Instagram data

It's prohibitively difficult to scrape data from Instagram within 4CAT itself
due to its aggressive rate limiting. Instead, import data collected elsewhere.
"""
import datetime
import re

from backend.lib.search import Search
from common.lib.item_mapping import MappedItem, MissingMappedField
from common.lib.exceptions import WorkerInterruptedException, MapItemException


class SearchInstagram(Search):
    """
    Import scraped Instagram data
    """
    type = "instagram-search"  # job ID
    category = "Search"  # category
    title = "Import scraped Instagram data"  # title displayed in UI
    description = "Import Instagram data collected with an external tool such as Zeeschuimer."  # description displayed in UI
    extension = "ndjson"  # extension of result file, used internally and in UI
    is_from_zeeschuimer = True

    # not available as a processor for existing datasets
    accepts = [None]
    references = [
        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok) (also covers usage with Instagram)"
    ]

    # some magic numbers instagram uses
    MEDIA_TYPE_PHOTO = 1
    MEDIA_TYPE_VIDEO = 2
    MEDIA_TYPE_CAROUSEL = 8

    def get_items(self, query):
        """
        Run custom search

        Not available for Instagram

        :param query:  Search query parameters (unused)
        :raises NotImplementedError:  Always
        """
        raise NotImplementedError("Instagram datasets can only be created by importing data from elsewhere")

    @staticmethod
    def map_item(item):
        """
        Map Instagram item

        Instagram importing is a little bit roundabout since we can expect
        input in two separate and not completely overlapping formats - an "edge
        list" or an "item list", and posts are structured differently between
        those, and do not contain the same data. So we find a middle ground
        here... each format has its own handler function

        :param dict item:  Item to map
        :return:  Mapped item
        :raises MapItemException:  If the item appears to be an ad
        """
        link = item.get("link", "")
        if (item.get("product_type", "") == "ad") or \
                (link and link.startswith("https://www.facebook.com/ads/ig_redirect")):
            # These are ads
            raise MapItemException("appears to be Instagram ad, check raw data to confirm and ensure Zeeschuimer is up to date.")

        # items with an XDTMediaDict typename are handled by the item list parser
        is_graph_response = "__typename" in item and item["__typename"] not in ("XDTMediaDict",)

        if is_graph_response:
            return MappedItem(SearchInstagram.parse_graph_item(item))
        else:
            return MappedItem(SearchInstagram.parse_itemlist_item(item))

    @staticmethod
    def parse_graph_item(node):
        """
        Parse Instagram post in Graph format

        The passed item is only read, never modified.

        :param dict node:  Data as received from Instagram
        :return dict:  Mapped item
        :raises MapItemException:  If the item has a `user` and an `owner`
          with different usernames
        """
        try:
            caption = node["edge_media_to_caption"]["edges"][0]["node"]["text"]
        except (KeyError, IndexError):
            # no caption data in this item
            caption = MissingMappedField("")

        # a missing caption is a MissingMappedField, which (unless it is a
        # str subclass) cannot be searched with a regular expression, so
        # hashtags are extracted from an empty string in that case
        caption_text = caption if isinstance(caption, str) else ""

        is_sidecar = node["__typename"] == "GraphSidecar"
        children = [edge["node"] for edge in node["edge_sidecar_to_children"]["edges"]] if is_sidecar else []
        num_media = len(children) if is_sidecar else 1

        # get media url
        # for carousels, get the first media item, for videos, get the video
        # url, for photos, get the last listed resource
        media_node = children[0] if is_sidecar else node

        if media_node["__typename"] == "GraphVideo":
            media_url = media_node["video_url"]
        elif media_node["__typename"] == "GraphImage":
            # index instead of pop() so the caller's item is left intact
            resources = media_node.get("display_resources") or media_node.get("thumbnail_resources")
            media_url = resources[-1]["src"] if resources else media_node.get("display_url", "")
        else:
            media_url = media_node["display_url"]

        # type, 'mixed' means carousel with video and photo
        type_map = {"GraphImage": "photo", "GraphSidecar": "photo", "GraphVideo": "video"}
        if not is_sidecar:
            media_type = type_map.get(node["__typename"], "unknown")
        else:
            child_types = {child["__typename"] for child in children}
            media_type = "mixed" if len(child_types) > 1 else type_map.get(child_types.pop(), "unknown")

        location = {"name": "", "latlong": "", "city": ""}
        # location has 'id', 'has_public_page', 'name', and 'slug' keys in tested examples; no lat long or "city" though name seems
        place = node.get("location")
        if place:
            location["name"] = place.get("name")
            # Leaving this though it does not appear to be used in this type; maybe we'll be surprised in the future...
            location["latlong"] = str(place["lat"]) + "," + str(place["lng"]) if place.get("lat") else ""
            location["city"] = place.get("city")

        no_likes = bool(node.get("like_and_view_counts_disabled"))

        # either of these may be absent or null; fall back to an empty dict
        # so the lookups below cannot fail on None
        user = node.get("user") or {}
        owner = node.get("owner") or {}
        if user and owner and user.get("username") != owner.get("username"):
            raise MapItemException("Unable to parse item: different user and owner")

        mapped_item = {
            "id": node["shortcode"],
            "post_source_domain": node.get("__import_meta", {}).get("source_platform_url"),  # Zeeschuimer metadata
            "thread_id": node["shortcode"],
            "parent_id": node["shortcode"],
            "body": caption,
            # NOTE: fromtimestamp() without a tz argument yields the host's local time
            "timestamp": datetime.datetime.fromtimestamp(node["taken_at_timestamp"]).strftime("%Y-%m-%d %H:%M:%S"),
            "author": user.get("username", owner.get("username", MissingMappedField(""))),
            "author_fullname": user.get("full_name", owner.get("full_name", MissingMappedField(""))),
            "author_avatar_url": user.get("profile_pic_url", owner.get("profile_pic_url", MissingMappedField(""))),
            "type": media_type,
            "url": "https://www.instagram.com/p/" + node["shortcode"],
            "image_url": node["display_url"],
            "media_url": media_url,
            "hashtags": ",".join(re.findall(r"#([^\s!@#$%ˆ&*()_+{}:\"|<>?\[\];'\,./`~']+)", caption_text)),
            # "usertags": ",".join(
            #     [u["node"]["user"]["username"] for u in node["edge_media_to_tagged_user"]["edges"]]),
            "likes_hidden": "yes" if no_likes else "no",
            "num_likes": node["edge_media_preview_like"]["count"] if not no_likes else MissingMappedField(0),
            "num_comments": node.get("edge_media_preview_comment", {}).get("count", 0),
            "num_media": num_media,
            "location_name": location["name"],
            "location_latlong": location["latlong"],
            "location_city": location["city"],
            "unix_timestamp": node["taken_at_timestamp"]
        }

        return mapped_item

    @staticmethod
    def parse_itemlist_item(node):
        """
        Parse Instagram post in 'item list' format

        :param dict node:  Data as received from Instagram
        :return dict:  Mapped item. `num_comments` is -1 when the item has
          neither a `comment_count` nor a list of `comments`.
        :raises MapItemException:  If the item has a `user` and an `owner`
          with different usernames
        """
        is_carousel = node["media_type"] == SearchInstagram.MEDIA_TYPE_CAROUSEL
        num_media = len(node["carousel_media"]) if is_carousel else 1

        if "caption" not in node:
            caption = MissingMappedField("")
        elif not node["caption"]:
            caption = ""
        else:
            caption = node["caption"]["text"]

        # a missing caption is a MissingMappedField, which (unless it is a
        # str subclass) cannot be searched with a regular expression, so
        # hashtags are extracted from an empty string in that case
        caption_text = caption if isinstance(caption, str) else ""

        # get media url
        # for carousels, get the first media item, for videos, get the video
        # url, for photos, get the first listed candidate
        media_node = node["carousel_media"][0] if is_carousel else node

        if media_node["media_type"] == SearchInstagram.MEDIA_TYPE_VIDEO:
            media_url = media_node["video_versions"][0]["url"]
            if "image_versions2" in media_node:
                display_url = media_node["image_versions2"]["candidates"][0]["url"]
            else:
                # no image links at all :-/
                # video is all we have
                display_url = media_url
        elif media_node["media_type"] == SearchInstagram.MEDIA_TYPE_PHOTO and media_node.get("image_versions2"):
            media_url = media_node["image_versions2"]["candidates"][0]["url"]
            display_url = media_url
        else:
            media_url = MissingMappedField("")
            display_url = MissingMappedField("")

        # type, 'mixed' means carousel with video and photo
        type_map = {SearchInstagram.MEDIA_TYPE_PHOTO: "photo", SearchInstagram.MEDIA_TYPE_VIDEO: "video"}
        if not is_carousel:
            media_type = type_map.get(node["media_type"], "unknown")
        else:
            child_types = {child["media_type"] for child in node["carousel_media"]}
            media_type = "mixed" if len(child_types) > 1 else type_map.get(child_types.pop(), "unknown")

        if "comment_count" in node:
            num_comments = node["comment_count"]
        elif isinstance(node.get("comments"), list):
            num_comments = len(node["comments"])
        else:
            num_comments = -1

        location = {"name": "", "latlong": "", "city": ""}
        place = node.get("location")
        if place:
            location["name"] = place.get("name")
            location["latlong"] = str(place["lat"]) + "," + str(place["lng"]) if place.get("lat") else ""
            location["city"] = place.get("city")

        # either of these may be absent or null; fall back to an empty dict
        # so the lookups below cannot fail on None
        user = node.get("user") or {}
        owner = node.get("owner") or {}
        if user and owner and user.get("username") != owner.get("username"):
            raise MapItemException("Unable to parse item: different user and owner")

        no_likes = bool(node.get("like_and_view_counts_disabled"))

        mapped_item = {
            "id": node["code"],
            "post_source_domain": node.get("__import_meta", {}).get("source_platform_url"),  # Zeeschuimer metadata
            "thread_id": node["code"],
            "parent_id": node["code"],
            "body": caption,
            "author": user.get("username", owner.get("username", MissingMappedField(""))),
            "author_fullname": user.get("full_name", owner.get("full_name", MissingMappedField(""))),
            "author_avatar_url": user.get("profile_pic_url", owner.get("profile_pic_url", MissingMappedField(""))),
            # NOTE: fromtimestamp() without a tz argument yields the host's local time
            "timestamp": datetime.datetime.fromtimestamp(node["taken_at"]).strftime("%Y-%m-%d %H:%M:%S"),
            "type": media_type,
            "url": "https://www.instagram.com/p/" + node["code"],
            "image_url": display_url,
            "media_url": media_url,
            "hashtags": ",".join(re.findall(r"#([^\s!@#$%ˆ&*()_+{}:\"|<>?\[\];'\,./`~']+)", caption_text)),
            # "usertags": ",".join(
            #     [u["node"]["user"]["username"] for u in node["edge_media_to_tagged_user"]["edges"]]),
            "likes_hidden": "yes" if no_likes else "no",
            "num_likes": node["like_count"] if not no_likes else MissingMappedField(0),
            "num_comments": num_comments,
            "num_media": num_media,
            "location_name": location["name"],
            "location_latlong": location["latlong"],
            "location_city": location["city"],
            "unix_timestamp": node["taken_at"]
        }

        return mapped_item
class SearchInstagram(Search):
    """
    Import scraped Instagram data
    """
    type = "instagram-search"  # job ID
    category = "Search"  # category
    title = "Import scraped Instagram data"  # title displayed in UI
    description = "Import Instagram data collected with an external tool such as Zeeschuimer."  # description displayed in UI
    extension = "ndjson"  # extension of result file, used internally and in UI
    is_from_zeeschuimer = True

    # not available as a processor for existing datasets
    accepts = [None]
    references = [
        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok) (also covers usage with Instagram)"
    ]

    # some magic numbers instagram uses
    MEDIA_TYPE_PHOTO = 1
    MEDIA_TYPE_VIDEO = 2
    MEDIA_TYPE_CAROUSEL = 8

    def get_items(self, query):
        """
        Run custom search

        Not available for Instagram

        :param query:  Search query parameters (unused)
        :raises NotImplementedError:  Always
        """
        raise NotImplementedError("Instagram datasets can only be created by importing data from elsewhere")

    @staticmethod
    def map_item(item):
        """
        Map Instagram item

        Instagram importing is a little bit roundabout since we can expect
        input in two separate and not completely overlapping formats - an "edge
        list" or an "item list", and posts are structured differently between
        those, and do not contain the same data. So we find a middle ground
        here... each format has its own handler function

        :param dict item:  Item to map
        :return:  Mapped item
        :raises MapItemException:  If the item appears to be an ad
        """
        link = item.get("link", "")
        if (item.get("product_type", "") == "ad") or \
                (link and link.startswith("https://www.facebook.com/ads/ig_redirect")):
            # These are ads
            raise MapItemException("appears to be Instagram ad, check raw data to confirm and ensure Zeeschuimer is up to date.")

        # items with an XDTMediaDict typename are handled by the item list parser
        is_graph_response = "__typename" in item and item["__typename"] not in ("XDTMediaDict",)

        if is_graph_response:
            return MappedItem(SearchInstagram.parse_graph_item(item))
        else:
            return MappedItem(SearchInstagram.parse_itemlist_item(item))

    @staticmethod
    def parse_graph_item(node):
        """
        Parse Instagram post in Graph format

        The passed item is only read, never modified.

        :param dict node:  Data as received from Instagram
        :return dict:  Mapped item
        :raises MapItemException:  If the item has a `user` and an `owner`
          with different usernames
        """
        try:
            caption = node["edge_media_to_caption"]["edges"][0]["node"]["text"]
        except (KeyError, IndexError):
            # no caption data in this item
            caption = MissingMappedField("")

        # a missing caption is a MissingMappedField, which (unless it is a
        # str subclass) cannot be searched with a regular expression, so
        # hashtags are extracted from an empty string in that case
        caption_text = caption if isinstance(caption, str) else ""

        is_sidecar = node["__typename"] == "GraphSidecar"
        children = [edge["node"] for edge in node["edge_sidecar_to_children"]["edges"]] if is_sidecar else []
        num_media = len(children) if is_sidecar else 1

        # get media url
        # for carousels, get the first media item, for videos, get the video
        # url, for photos, get the last listed resource
        media_node = children[0] if is_sidecar else node

        if media_node["__typename"] == "GraphVideo":
            media_url = media_node["video_url"]
        elif media_node["__typename"] == "GraphImage":
            # index instead of pop() so the caller's item is left intact
            resources = media_node.get("display_resources") or media_node.get("thumbnail_resources")
            media_url = resources[-1]["src"] if resources else media_node.get("display_url", "")
        else:
            media_url = media_node["display_url"]

        # type, 'mixed' means carousel with video and photo
        type_map = {"GraphImage": "photo", "GraphSidecar": "photo", "GraphVideo": "video"}
        if not is_sidecar:
            media_type = type_map.get(node["__typename"], "unknown")
        else:
            child_types = {child["__typename"] for child in children}
            media_type = "mixed" if len(child_types) > 1 else type_map.get(child_types.pop(), "unknown")

        location = {"name": "", "latlong": "", "city": ""}
        # location has 'id', 'has_public_page', 'name', and 'slug' keys in tested examples; no lat long or "city" though name seems
        place = node.get("location")
        if place:
            location["name"] = place.get("name")
            # Leaving this though it does not appear to be used in this type; maybe we'll be surprised in the future...
            location["latlong"] = str(place["lat"]) + "," + str(place["lng"]) if place.get("lat") else ""
            location["city"] = place.get("city")

        no_likes = bool(node.get("like_and_view_counts_disabled"))

        # either of these may be absent or null; fall back to an empty dict
        # so the lookups below cannot fail on None
        user = node.get("user") or {}
        owner = node.get("owner") or {}
        if user and owner and user.get("username") != owner.get("username"):
            raise MapItemException("Unable to parse item: different user and owner")

        mapped_item = {
            "id": node["shortcode"],
            "post_source_domain": node.get("__import_meta", {}).get("source_platform_url"),  # Zeeschuimer metadata
            "thread_id": node["shortcode"],
            "parent_id": node["shortcode"],
            "body": caption,
            # NOTE: fromtimestamp() without a tz argument yields the host's local time
            "timestamp": datetime.datetime.fromtimestamp(node["taken_at_timestamp"]).strftime("%Y-%m-%d %H:%M:%S"),
            "author": user.get("username", owner.get("username", MissingMappedField(""))),
            "author_fullname": user.get("full_name", owner.get("full_name", MissingMappedField(""))),
            "author_avatar_url": user.get("profile_pic_url", owner.get("profile_pic_url", MissingMappedField(""))),
            "type": media_type,
            "url": "https://www.instagram.com/p/" + node["shortcode"],
            "image_url": node["display_url"],
            "media_url": media_url,
            "hashtags": ",".join(re.findall(r"#([^\s!@#$%ˆ&*()_+{}:\"|<>?\[\];'\,./`~']+)", caption_text)),
            # "usertags": ",".join(
            #     [u["node"]["user"]["username"] for u in node["edge_media_to_tagged_user"]["edges"]]),
            "likes_hidden": "yes" if no_likes else "no",
            "num_likes": node["edge_media_preview_like"]["count"] if not no_likes else MissingMappedField(0),
            "num_comments": node.get("edge_media_preview_comment", {}).get("count", 0),
            "num_media": num_media,
            "location_name": location["name"],
            "location_latlong": location["latlong"],
            "location_city": location["city"],
            "unix_timestamp": node["taken_at_timestamp"]
        }

        return mapped_item

    @staticmethod
    def parse_itemlist_item(node):
        """
        Parse Instagram post in 'item list' format

        :param dict node:  Data as received from Instagram
        :return dict:  Mapped item. `num_comments` is -1 when the item has
          neither a `comment_count` nor a list of `comments`.
        :raises MapItemException:  If the item has a `user` and an `owner`
          with different usernames
        """
        is_carousel = node["media_type"] == SearchInstagram.MEDIA_TYPE_CAROUSEL
        num_media = len(node["carousel_media"]) if is_carousel else 1

        if "caption" not in node:
            caption = MissingMappedField("")
        elif not node["caption"]:
            caption = ""
        else:
            caption = node["caption"]["text"]

        # a missing caption is a MissingMappedField, which (unless it is a
        # str subclass) cannot be searched with a regular expression, so
        # hashtags are extracted from an empty string in that case
        caption_text = caption if isinstance(caption, str) else ""

        # get media url
        # for carousels, get the first media item, for videos, get the video
        # url, for photos, get the first listed candidate
        media_node = node["carousel_media"][0] if is_carousel else node

        if media_node["media_type"] == SearchInstagram.MEDIA_TYPE_VIDEO:
            media_url = media_node["video_versions"][0]["url"]
            if "image_versions2" in media_node:
                display_url = media_node["image_versions2"]["candidates"][0]["url"]
            else:
                # no image links at all :-/
                # video is all we have
                display_url = media_url
        elif media_node["media_type"] == SearchInstagram.MEDIA_TYPE_PHOTO and media_node.get("image_versions2"):
            media_url = media_node["image_versions2"]["candidates"][0]["url"]
            display_url = media_url
        else:
            media_url = MissingMappedField("")
            display_url = MissingMappedField("")

        # type, 'mixed' means carousel with video and photo
        type_map = {SearchInstagram.MEDIA_TYPE_PHOTO: "photo", SearchInstagram.MEDIA_TYPE_VIDEO: "video"}
        if not is_carousel:
            media_type = type_map.get(node["media_type"], "unknown")
        else:
            child_types = {child["media_type"] for child in node["carousel_media"]}
            media_type = "mixed" if len(child_types) > 1 else type_map.get(child_types.pop(), "unknown")

        if "comment_count" in node:
            num_comments = node["comment_count"]
        elif isinstance(node.get("comments"), list):
            num_comments = len(node["comments"])
        else:
            num_comments = -1

        location = {"name": "", "latlong": "", "city": ""}
        place = node.get("location")
        if place:
            location["name"] = place.get("name")
            location["latlong"] = str(place["lat"]) + "," + str(place["lng"]) if place.get("lat") else ""
            location["city"] = place.get("city")

        # either of these may be absent or null; fall back to an empty dict
        # so the lookups below cannot fail on None
        user = node.get("user") or {}
        owner = node.get("owner") or {}
        if user and owner and user.get("username") != owner.get("username"):
            raise MapItemException("Unable to parse item: different user and owner")

        no_likes = bool(node.get("like_and_view_counts_disabled"))

        mapped_item = {
            "id": node["code"],
            "post_source_domain": node.get("__import_meta", {}).get("source_platform_url"),  # Zeeschuimer metadata
            "thread_id": node["code"],
            "parent_id": node["code"],
            "body": caption,
            "author": user.get("username", owner.get("username", MissingMappedField(""))),
            "author_fullname": user.get("full_name", owner.get("full_name", MissingMappedField(""))),
            "author_avatar_url": user.get("profile_pic_url", owner.get("profile_pic_url", MissingMappedField(""))),
            # NOTE: fromtimestamp() without a tz argument yields the host's local time
            "timestamp": datetime.datetime.fromtimestamp(node["taken_at"]).strftime("%Y-%m-%d %H:%M:%S"),
            "type": media_type,
            "url": "https://www.instagram.com/p/" + node["code"],
            "image_url": display_url,
            "media_url": media_url,
            "hashtags": ",".join(re.findall(r"#([^\s!@#$%ˆ&*()_+{}:\"|<>?\[\];'\,./`~']+)", caption_text)),
            # "usertags": ",".join(
            #     [u["node"]["user"]["username"] for u in node["edge_media_to_tagged_user"]["edges"]]),
            "likes_hidden": "yes" if no_likes else "no",
            "num_likes": node["like_count"] if not no_likes else MissingMappedField(0),
            "num_comments": num_comments,
            "num_media": num_media,
            "location_name": location["name"],
            "location_latlong": location["latlong"],
            "location_city": location["city"],
            "unix_timestamp": node["taken_at"]
        }

        return mapped_item
Import scraped Instagram data
references =
['[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)', '[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok) (also covers usage with Instagram)']
def
get_items(self, query):
def get_items(self, query):
    """
    Refuse to run a search

    Searching is not available for Instagram, so this method never returns
    any items.

    :param query:  Search query parameters (unused)
    :raises NotImplementedError:  Always
    """
    reason = "Instagram datasets can only be created by importing data from elsewhere"
    raise NotImplementedError(reason)
Run custom search
Not available for Instagram
@staticmethod
def
map_item(item):
@staticmethod
def map_item(item):
    """
    Map a raw Instagram item to a 4CAT item

    Imported items arrive in one of two only partially overlapping formats,
    an "edge list" (Graph) node or an "item list" entry, each with its own
    parser. Items that look like ads are rejected before parsing.

    :param dict item:  Item to map
    :return:  Mapped item
    :raises MapItemException:  If the item appears to be an ad
    """
    ad_redirect_prefix = "https://www.facebook.com/ads/ig_redirect"

    link = item.get("link", "")
    looks_like_ad = item.get("product_type", "") == "ad"
    if not looks_like_ad and link:
        looks_like_ad = link.startswith(ad_redirect_prefix)

    if looks_like_ad:
        raise MapItemException("appears to be Instagram ad, check raw data to confirm and ensure Zeeschuimer is up to date.")

    # anything without a __typename, or typed XDTMediaDict, goes to the
    # item list parser; every other typed item goes to the Graph parser
    if "__typename" not in item or item["__typename"] in ("XDTMediaDict",):
        parser = SearchInstagram.parse_itemlist_item
    else:
        parser = SearchInstagram.parse_graph_item

    return MappedItem(parser(item))
Map Instagram item
Instagram importing is a little bit roundabout since we can expect input in two separate and not completely overlapping formats - an "edge list" or an "item list", and posts are structured differently between those, and do not contain the same data. So we find a middle ground here... each format has its own handler function
Parameters
- dict item: Item to map
Returns
Mapped item
@staticmethod
def
parse_graph_item(node):
@staticmethod
def parse_graph_item(node):
    """
    Parse Instagram post in Graph format

    The passed item is only read, never modified.

    :param dict node:  Data as received from Instagram
    :return dict:  Mapped item
    :raises MapItemException:  If the item has a `user` and an `owner` with
      different usernames
    """
    try:
        caption = node["edge_media_to_caption"]["edges"][0]["node"]["text"]
    except (KeyError, IndexError):
        # no caption data in this item
        caption = MissingMappedField("")

    # a missing caption is a MissingMappedField, which (unless it is a str
    # subclass) cannot be searched with a regular expression, so hashtags
    # are extracted from an empty string in that case
    caption_text = caption if isinstance(caption, str) else ""

    is_sidecar = node["__typename"] == "GraphSidecar"
    children = [edge["node"] for edge in node["edge_sidecar_to_children"]["edges"]] if is_sidecar else []
    num_media = len(children) if is_sidecar else 1

    # get media url
    # for carousels, get the first media item, for videos, get the video
    # url, for photos, get the last listed resource
    media_node = children[0] if is_sidecar else node

    if media_node["__typename"] == "GraphVideo":
        media_url = media_node["video_url"]
    elif media_node["__typename"] == "GraphImage":
        # index instead of pop() so the caller's item is left intact
        resources = media_node.get("display_resources") or media_node.get("thumbnail_resources")
        media_url = resources[-1]["src"] if resources else media_node.get("display_url", "")
    else:
        media_url = media_node["display_url"]

    # type, 'mixed' means carousel with video and photo
    type_map = {"GraphImage": "photo", "GraphSidecar": "photo", "GraphVideo": "video"}
    if not is_sidecar:
        media_type = type_map.get(node["__typename"], "unknown")
    else:
        child_types = {child["__typename"] for child in children}
        media_type = "mixed" if len(child_types) > 1 else type_map.get(child_types.pop(), "unknown")

    location = {"name": "", "latlong": "", "city": ""}
    # location has 'id', 'has_public_page', 'name', and 'slug' keys in tested examples; no lat long or "city" though name seems
    place = node.get("location")
    if place:
        location["name"] = place.get("name")
        # Leaving this though it does not appear to be used in this type; maybe we'll be surprised in the future...
        location["latlong"] = str(place["lat"]) + "," + str(place["lng"]) if place.get("lat") else ""
        location["city"] = place.get("city")

    no_likes = bool(node.get("like_and_view_counts_disabled"))

    # either of these may be absent or null; fall back to an empty dict so
    # the lookups below cannot fail on None
    user = node.get("user") or {}
    owner = node.get("owner") or {}
    if user and owner and user.get("username") != owner.get("username"):
        raise MapItemException("Unable to parse item: different user and owner")

    mapped_item = {
        "id": node["shortcode"],
        "post_source_domain": node.get("__import_meta", {}).get("source_platform_url"),  # Zeeschuimer metadata
        "thread_id": node["shortcode"],
        "parent_id": node["shortcode"],
        "body": caption,
        # NOTE: fromtimestamp() without a tz argument yields the host's local time
        "timestamp": datetime.datetime.fromtimestamp(node["taken_at_timestamp"]).strftime("%Y-%m-%d %H:%M:%S"),
        "author": user.get("username", owner.get("username", MissingMappedField(""))),
        "author_fullname": user.get("full_name", owner.get("full_name", MissingMappedField(""))),
        "author_avatar_url": user.get("profile_pic_url", owner.get("profile_pic_url", MissingMappedField(""))),
        "type": media_type,
        "url": "https://www.instagram.com/p/" + node["shortcode"],
        "image_url": node["display_url"],
        "media_url": media_url,
        "hashtags": ",".join(re.findall(r"#([^\s!@#$%ˆ&*()_+{}:\"|<>?\[\];'\,./`~']+)", caption_text)),
        # "usertags": ",".join(
        #     [u["node"]["user"]["username"] for u in node["edge_media_to_tagged_user"]["edges"]]),
        "likes_hidden": "yes" if no_likes else "no",
        "num_likes": node["edge_media_preview_like"]["count"] if not no_likes else MissingMappedField(0),
        "num_comments": node.get("edge_media_preview_comment", {}).get("count", 0),
        "num_media": num_media,
        "location_name": location["name"],
        "location_latlong": location["latlong"],
        "location_city": location["city"],
        "unix_timestamp": node["taken_at_timestamp"]
    }

    return mapped_item
Parse Instagram post in Graph format
Parameters
- node: Data as received from Instagram
Returns
Mapped item
@staticmethod
def
parse_itemlist_item(node):
@staticmethod
def parse_itemlist_item(node):
    """
    Parse Instagram post in 'item list' format

    :param dict node:  Data as received from Instagram
    :return dict:  Mapped item. `num_comments` is -1 when the item has
      neither a `comment_count` nor a list of `comments`.
    :raises MapItemException:  If the item has a `user` and an `owner` with
      different usernames
    """
    is_carousel = node["media_type"] == SearchInstagram.MEDIA_TYPE_CAROUSEL
    num_media = len(node["carousel_media"]) if is_carousel else 1

    if "caption" not in node:
        caption = MissingMappedField("")
    elif not node["caption"]:
        caption = ""
    else:
        caption = node["caption"]["text"]

    # a missing caption is a MissingMappedField, which (unless it is a str
    # subclass) cannot be searched with a regular expression, so hashtags
    # are extracted from an empty string in that case
    caption_text = caption if isinstance(caption, str) else ""

    # get media url
    # for carousels, get the first media item, for videos, get the video
    # url, for photos, get the first listed candidate
    media_node = node["carousel_media"][0] if is_carousel else node

    if media_node["media_type"] == SearchInstagram.MEDIA_TYPE_VIDEO:
        media_url = media_node["video_versions"][0]["url"]
        if "image_versions2" in media_node:
            display_url = media_node["image_versions2"]["candidates"][0]["url"]
        else:
            # no image links at all :-/
            # video is all we have
            display_url = media_url
    elif media_node["media_type"] == SearchInstagram.MEDIA_TYPE_PHOTO and media_node.get("image_versions2"):
        media_url = media_node["image_versions2"]["candidates"][0]["url"]
        display_url = media_url
    else:
        media_url = MissingMappedField("")
        display_url = MissingMappedField("")

    # type, 'mixed' means carousel with video and photo
    type_map = {SearchInstagram.MEDIA_TYPE_PHOTO: "photo", SearchInstagram.MEDIA_TYPE_VIDEO: "video"}
    if not is_carousel:
        media_type = type_map.get(node["media_type"], "unknown")
    else:
        child_types = {child["media_type"] for child in node["carousel_media"]}
        media_type = "mixed" if len(child_types) > 1 else type_map.get(child_types.pop(), "unknown")

    if "comment_count" in node:
        num_comments = node["comment_count"]
    elif isinstance(node.get("comments"), list):
        num_comments = len(node["comments"])
    else:
        num_comments = -1

    location = {"name": "", "latlong": "", "city": ""}
    place = node.get("location")
    if place:
        location["name"] = place.get("name")
        location["latlong"] = str(place["lat"]) + "," + str(place["lng"]) if place.get("lat") else ""
        location["city"] = place.get("city")

    # either of these may be absent or null; fall back to an empty dict so
    # the lookups below cannot fail on None
    user = node.get("user") or {}
    owner = node.get("owner") or {}
    if user and owner and user.get("username") != owner.get("username"):
        raise MapItemException("Unable to parse item: different user and owner")

    no_likes = bool(node.get("like_and_view_counts_disabled"))

    mapped_item = {
        "id": node["code"],
        "post_source_domain": node.get("__import_meta", {}).get("source_platform_url"),  # Zeeschuimer metadata
        "thread_id": node["code"],
        "parent_id": node["code"],
        "body": caption,
        "author": user.get("username", owner.get("username", MissingMappedField(""))),
        "author_fullname": user.get("full_name", owner.get("full_name", MissingMappedField(""))),
        "author_avatar_url": user.get("profile_pic_url", owner.get("profile_pic_url", MissingMappedField(""))),
        # NOTE: fromtimestamp() without a tz argument yields the host's local time
        "timestamp": datetime.datetime.fromtimestamp(node["taken_at"]).strftime("%Y-%m-%d %H:%M:%S"),
        "type": media_type,
        "url": "https://www.instagram.com/p/" + node["code"],
        "image_url": display_url,
        "media_url": media_url,
        "hashtags": ",".join(re.findall(r"#([^\s!@#$%ˆ&*()_+{}:\"|<>?\[\];'\,./`~']+)", caption_text)),
        # "usertags": ",".join(
        #     [u["node"]["user"]["username"] for u in node["edge_media_to_tagged_user"]["edges"]]),
        "likes_hidden": "yes" if no_likes else "no",
        "num_likes": node["like_count"] if not no_likes else MissingMappedField(0),
        "num_comments": num_comments,
        "num_media": num_media,
        "location_name": location["name"],
        "location_latlong": location["latlong"],
        "location_city": location["city"],
        "unix_timestamp": node["taken_at"]
    }

    return mapped_item
Parse Instagram post in 'item list' format
Parameters
- node: Data as received from Instagram
Returns
Mapped item
Inherited Members
- backend.lib.worker.BasicWorker
- BasicWorker
- INTERRUPT_NONE
- INTERRUPT_RETRY
- INTERRUPT_CANCEL
- queue
- log
- manager
- interrupted
- modules
- init_time
- name
- run
- clean_up
- request_interrupt
- is_4cat_class
- backend.lib.search.Search
- max_workers
- prefix
- return_cols
- import_error_count
- import_warning_count
- process
- search
- import_from_file
- items_to_csv
- items_to_ndjson
- items_to_archive
- backend.lib.processor.BasicProcessor
- db
- job
- dataset
- owner
- source_dataset
- source_file
- config
- is_running_in_preset
- filepath
- work
- after_process
- remove_files
- abort
- add_field_to_parent
- iterate_archive_contents
- unpack_archive_contents
- extract_archived_file_by_name
- write_csv_items_and_finish
- write_archive_and_finish
- create_standalone
- map_item_method_available
- get_mapped_item
- is_filter
- get_options
- get_status
- is_top_dataset
- is_from_collector
- get_extension
- is_rankable
- exclude_followup_processors
- is_4cat_processor