datasources.linkedin.search_linkedin
Import scraped LinkedIn data
It's prohibitively difficult to scrape data from LinkedIn within 4CAT itself due to its aggressive rate limiting and login wall. Instead, import data collected elsewhere.
1""" 2Import scraped LinkedIn data 3 4It's prohibitively difficult to scrape data from LinkedIn within 4CAT itself 5due to its aggressive rate limiting and login wall. Instead, import data 6collected elsewhere. 7""" 8import datetime 9import time 10import re 11 12from backend.lib.search import Search 13from common.lib.item_mapping import MappedItem 14from common.lib.helpers import normalize_url_encoding 15 16class SearchLinkedIn(Search): 17 """ 18 Import scraped LinkedIn data 19 """ 20 type = "linkedin-search" # job ID 21 category = "Search" # category 22 title = "Import scraped LinkedIn data" # title displayed in UI 23 description = "Import LinkedIn data collected with an external tool such as Zeeschuimer." # description displayed in UI 24 extension = "ndjson" # extension of result file, used internally and in UI 25 is_from_zeeschuimer = True 26 27 # not available as a processor for existing datasets 28 accepts = [None] 29 references = [ 30 "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)", 31 "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok) (also explains general usage of Zeeschuimer)" 32 ] 33 34 def get_items(self, query): 35 """ 36 Run custom search 37 38 Not available for LinkedIn 39 """ 40 raise NotImplementedError("LinkedIn datasets can only be created by importing data from elsewhere") 41 42 @staticmethod 43 def map_item(item): 44 """ 45 Parse LinkedIn post in Voyager V2 format 46 47 'Voyager V2' seems to be how the format is referred to in the data 48 itself... 49 50 :param item: Data as received from LinkedIn 51 :return dict: Mapped item 52 """ 53 54 # annoyingly, posts don't come with a timestamp 55 # approximate it by using the time of collection and the "time ago" 56 # included with the post (e.g. 
'published 18h ago') 57 if not item.get("actor"): 58 return {} 59 60 if "__import_meta" in item: 61 time_collected = int(item["__import_meta"]["timestamp_collected"] / 1000) # milliseconds 62 else: 63 # best we got 64 time_collected = int(time.time()) 65 66 time_ago = item["actor"]["subDescription"]["text"] if item["actor"].get("subDescription") else "" 67 timestamp = int(time_collected - SearchLinkedIn.parse_time_ago(time_ago)) 68 69 # images are stored in some convoluted way 70 # there are multiple URLs for various thumbnails, use the one for the 71 # largest version of the image 72 images = [] 73 if item["content"] and "images" in item["content"]: 74 for image in item["content"]["images"]: 75 image_data = image["attributes"][0]["vectorImage"] 76 artifacts = sorted(image_data["artifacts"], key=lambda x: x["width"], reverse=True) 77 url = image_data["rootUrl"] + artifacts[0]["fileIdentifyingUrlPathSegment"] 78 images.append(url) 79 80 # or alternatively they are stored here: 81 if not images and item["content"] and item["content"].get("articleComponent") and item["content"]["articleComponent"].get("largeImage"): 82 image = item["content"]["articleComponent"]["largeImage"]["attributes"][0]["detailData"]["vectorImage"] 83 if not image and item["content"]["articleComponent"]["largeImage"]["attributes"][0]["imageUrl"]: 84 images.append(item["content"]["articleComponent"]["largeImage"]["attributes"][0]["imageUrl"]["url"]) 85 elif image and image.get("artifacts"): 86 images.append(image["rootUrl"] + image["artifacts"][0]["fileIdentifyingUrlPathSegment"]) 87 88 # video thumbnails are stored similarly as image data 89 video_thumb_url = "" 90 thumb_content = None 91 if item["content"] and "*videoPlayMetadata" in item["content"]: 92 thumb_content = item["content"]["*videoPlayMetadata"]["thumbnail"] 93 elif item["content"] and "linkedInVideoComponent" in item["content"] and item["content"]["linkedInVideoComponent"]: 94 thumb_content = item["content"]["linkedInVideoComponent"]["*videoPlayMetadata"]["thumbnail"] 95 elif item["content"] and "externalVideoComponent" in item["content"] and item["content"]["externalVideoComponent"]: 96 thumb_content = item["content"]["externalVideoComponent"]["*videoPlayMetadata"]["thumbnail"] 97 if thumb_content: 98 video_thumb_url = thumb_content["rootUrl"] + thumb_content["artifacts"][0]["fileIdentifyingUrlPathSegment"] 99 100 author = SearchLinkedIn.get_author(item) 101 102 # the ID is in the format 'urn:li:activity:6960882777168695296' 103 # retain the numerical part as the item ID for 4CAT 104 # sometimes posts seem to be combined, e.g.: 105 # urn:li:aggregate:(urn:li:activity:3966023054712791616,urn:li:activity:3965915018238312449) 106 # effectively both IDs seem to refer to the same post, so just take the 107 # first one 108 meta_urn = item.get("updateMetadata", {"urn": item.get("preDashEntityUrn")})["urn"] 109 urn = "urn:li:activity:" + meta_urn.split("urn:li:activity:")[1].split(",")[0].split(")")[0] 110 item_id = urn.split(":").pop() 111 112 # the way hashtags were stored changed at some point 113 hashtags = [] 114 if item["commentary"] and "attributes" in item["commentary"]["text"]: 115 hashtags = [tag["trackingUrn"].split(":").pop() for tag in item["commentary"]["text"].get("attributes", []) if tag["type"] == "HASHTAG"] 116 elif item["commentary"] and "attributesV2" in item["commentary"]["text"]: 117 hashtags = [tag["detailData"]["*hashtag"]["trackingUrn"].split(":").pop() for tag in item["commentary"]["text"].get("attributesV2", []) if "*hashtag" in 
tag["detailData"]] 118 119 # and mentions 120 # we're storing both usernames and full names 121 author_mentions = [] 122 author_name_mentions = [] 123 if item["commentary"] and "attributes" in item["commentary"]["text"]: 124 for mention in item["commentary"]["text"].get("attributes", {}): 125 if mention["type"] == "PROFILE_MENTION": 126 mention = mention["*miniProfile"] 127 author_mentions.append(mention["publicIdentifier"]) 128 author_name_mentions.append(" ".join([mention.get("firstName", ""), mention.get("lastName", "")])) 129 elif mention["type"] == "COMPANY_NAME": 130 mention = mention["*miniCompany"] 131 author_mentions.append(mention["universalName"]) 132 author_name_mentions.append(mention.get("name", "")) 133 134 # same for metrics 135 if "*totalSocialActivityCounts" in item["*socialDetail"]: 136 metrics = { 137 "comments": item["*socialDetail"]["*totalSocialActivityCounts"]["numComments"], 138 "shares": item["*socialDetail"]["*totalSocialActivityCounts"]["numShares"], 139 "reactions": item["*socialDetail"]["*totalSocialActivityCounts"]["numLikes"], 140 "reaction_like": 0, 141 "reaction_empathy": 0, 142 "reaction_praise": 0, 143 "reaction_entertainment": 0, 144 "reaction_appreciation": 0, 145 "reaction_interest": 0 146 } 147 # There's different kind of reaction metrics 148 for reaction_type in item["*socialDetail"]["*totalSocialActivityCounts"].get("reactionTypeCounts", []): 149 metrics["reaction_" + reaction_type["reactionType"].lower()] = reaction_type["count"] 150 151 else: 152 metrics = { 153 "comments": item["*socialDetail"]["comments"]["paging"]["total"], 154 "shares": item["*socialDetail"]["totalShares"], 155 "reactions": item["*socialDetail"]["likes"]["paging"]["total"] 156 } 157 158 # and links 159 link_url = "" 160 if item.get("content") and item["content"].get("navigationContext"): 161 link_url = item["content"]["navigationContext"].get("actionTarget", "") 162 elif item.get("content") and item["content"].get("articleComponent") and "navigationContext" in item["content"]["articleComponent"]: 163 link_url = item["content"]["articleComponent"]["navigationContext"].get("actionTarget", "") 164 165 return MappedItem({ 166 "collected_from_url": normalize_url_encoding(item.get("__import_meta", {}).get("source_platform_url", "")), # Zeeschuimer metadata 167 "id": item_id, 168 "thread_id": item_id, 169 "body": item["commentary"]["text"]["text"] if item["commentary"] else "", 170 "timestamp": datetime.datetime.utcfromtimestamp(timestamp).strftime("%Y-%m-%d %H:%M:%S"), 171 "timestamp_collected": datetime.datetime.utcfromtimestamp(time_collected).strftime("%Y-%m-%d %H:%M:%S"), 172 "timestamp_ago": time_ago.split("•")[0].strip(), 173 "is_promoted": "yes" if not re.findall(r"[0-9]", time_ago) else "no", 174 **{("author_" + k).replace("_username", ""): v for k, v in author.items()}, 175 "author_mentions": ",".join(author_mentions), 176 "author_name_mentions": ",".join(author_name_mentions), 177 "hashtags": ",".join(hashtags), 178 "image_urls": ",".join(images), 179 "video_thumb_url": video_thumb_url, 180 "post_url": "https://www.linkedin.com/feed/update/" + urn, 181 "link_url": link_url, 182 **metrics, 183 "inclusion_context": item["header"]["text"]["text"] if item.get("header") else "", 184 "unix_timestamp": timestamp, 185 "unix_timestamp_collected": time_collected 186 }) 187 188 @staticmethod 189 def get_author(post): 190 """ 191 Extract author information from post 192 193 This is a bit complicated because it works differently for companies 194 and users and some fields are not 
always present. Hence, a separate 195 method. 196 197 :param dict post: Post data 198 :return dict: Author information 199 """ 200 author = { 201 "username": post["actor"]["navigationContext"]["actionTarget"].split("linkedin.com/").pop().split("?")[0], 202 "name": post["actor"]["name"]["text"], 203 "description": post["actor"].get("description", {}).get("text", "") if post["actor"].get("description") else "", 204 "pronouns": "", 205 "avatar_url": "", 206 "is_company": "no", 207 "url": post["actor"]["navigationContext"]["actionTarget"].split("?")[0], 208 } 209 210 # likewise for author avatars 211 if post["actor"]["name"].get("attributes"): 212 if "*miniProfile" in post["actor"]["name"]["attributes"][0]: 213 author_profile = post["actor"]["name"]["attributes"][0]["*miniProfile"] 214 if author_profile["picture"]: 215 avatar_artifacts = sorted(author_profile["picture"]["artifacts"], key=lambda x: x["width"], reverse=True) 216 author.update({"avatar_url": author_profile["picture"]["rootUrl"] + avatar_artifacts[0]["fileIdentifyingUrlPathSegment"]}) 217 218 if author_profile.get("customPronoun"): 219 author.update({"pronouns": author_profile.get("customPronoun")}) 220 elif author_profile.get("standardizedPronoun"): 221 author.update({"pronouns": author_profile.get("standardizedPronoun").lower()}) 222 223 elif "*miniCompany" in post["actor"]["name"]["attributes"][0]: 224 author_profile = post["actor"]["name"]["attributes"][0]["*miniCompany"] 225 avatar_artifacts = sorted(author_profile["logo"]["artifacts"], key=lambda x: x["width"], reverse=True) 226 227 author.update({"is_company": "yes"}) 228 author.update({"avatar_url": author_profile["logo"]["rootUrl"] + avatar_artifacts[0]["fileIdentifyingUrlPathSegment"]}) 229 230 if post["actor"]["name"].get("attributesV2"): 231 pronouns = post["actor"]["name"]["attributesV2"][0]["detailData"].get("*profileFullName", {}).get("pronoun") 232 if pronouns: 233 if pronouns.get("customPronoun"): 234 author.update({"pronouns": pronouns.get("customPronoun")}) 235 elif pronouns.get("standardizedPronoun"): 236 author.update({"pronouns": pronouns.get("standardizedPronoun")}) 237 238 avatar = post["actor"]["image"].get("attributes", [{}])[0].get("detailData", {}).get("nonEntityProfilePicture") 239 if avatar and avatar["vectorImage"]: 240 author.update({"avatar_url": avatar["vectorImage"]["rootUrl"] + avatar["vectorImage"]["artifacts"][0]["fileIdentifyingUrlPathSegment"]}) 241 242 return author 243 244 @staticmethod 245 def parse_time_ago(time_ago): 246 """ 247 Attempt to parse a timestamp for a post 248 249 LinkedIn doesn't give us the actual timestamp, only a relative 250 indicator like "18h ago". This is annoying because it gets more 251 imprecise the longer ago it is, and because it is language-sensitive. 252 For example, in English 18 months is displayed as "18mo" but in Dutch 253 it is "18 mnd". 254 255 Right now this will only adjust the 'collected at' timestamp if the 256 data was scraped from an English or Dutch interface, and even then the 257 timestamps will still be imprecise. 258 259 :param str time_ago: Relative timestamp, e.g. '18mo'. 
260 :return int: Estimated timestamp of post, as unix timestamp 261 """ 262 time_ago = time_ago.split("•")[0] 263 numbers = re.sub(r"[^0-9]", "", time_ago).strip() 264 letters = re.sub(r"[0-9]", "", time_ago).strip() 265 266 period_lengths = { 267 "s": 1, 268 "m": 60, 269 "h": 3600, 270 "d": 86400, 271 "w": 7 * 86400, 272 "mo": 30.4375 * 86400, # we don't know WHICH months, so use the average length of a month 273 "mnd": 30.4375 * 86400, 274 "yr": 365.25 * 86400, # likewise 275 "j": 365.25 * 86400, 276 } 277 278 numbers = int(numbers) if len(numbers) else 0 279 return period_lengths.get(letters, 0) * numbers
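Since get_items is deliberately unimplemented, the practical entry point of this module is map_item, applied to items imported from an NDJSON file. Below is a minimal sketch of that flow; the filename is hypothetical, and it assumes each line holds one Voyager V2 object as 4CAT stores it after a Zeeschuimer import, with MappedItem's get_item_data accessor exposing the flat field dictionary.

```python
import json

from datasources.linkedin.search_linkedin import SearchLinkedIn

# Hypothetical file: one Voyager V2 object per line, as stored by 4CAT
# after importing a Zeeschuimer capture.
with open("linkedin-capture.ndjson") as infile:
    for line in infile:
        mapped = SearchLinkedIn.map_item(json.loads(line))
        if mapped:  # items without an "actor" map to an empty dict
            print(mapped.get_item_data()["post_url"])
```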
```python
class SearchLinkedIn(Search):
```
Import scraped LinkedIn data
```python
def get_items(self, query):
```
Run custom search
Not available for LinkedIn
```python
@staticmethod
def map_item(item):
```
Parse LinkedIn post in Voyager V2 format
'Voyager V2' seems to be how the format is referred to in the data itself...
Parameters
- item: Data as received from LinkedIn
Returns
dict: Mapped item
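As an illustration of the mapping contract, the sketch below runs map_item on a heavily trimmed, hypothetical item. Real Voyager V2 payloads carry far more fields; the keys shown here (actor, content, commentary, *socialDetail and an activity URN) are roughly the minimum the mapping code touches, even when their values are empty.

```python
from datasources.linkedin.search_linkedin import SearchLinkedIn

# Hypothetical, heavily trimmed Voyager V2 item; real payloads are far larger.
item = {
    "actor": {
        "name": {"text": "Jane Doe"},         # display name
        "image": {},                          # no avatar data available
        "navigationContext": {"actionTarget": "https://www.linkedin.com/in/janedoe?trk=feed"},
        "subDescription": {"text": "18h •"},  # relative "time ago" label
    },
    "content": None,                          # no images, video or links
    "commentary": {"text": {"text": "Hello world"}},
    "updateMetadata": {"urn": "urn:li:activity:6960882777168695296"},
    "*socialDetail": {
        "*totalSocialActivityCounts": {"numComments": 1, "numShares": 0, "numLikes": 5},
    },
}

fields = SearchLinkedIn.map_item(item).get_item_data()
print(fields["id"])         # "6960882777168695296"
print(fields["author"])     # "in/janedoe"
print(fields["reactions"])  # 5
```

The resulting timestamp field is estimated as the collection time minus the parsed "18h" offset, which is why it should be read as approximate.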
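```python
@staticmethod
def get_author(post):
```

Extract author information from post

This is a bit complicated because it works differently for companies and users and some fields are not always present. Hence, a separate method.

Parameters
- dict post: Post data

Returns
dict: Author information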
```python
@staticmethod
def parse_time_ago(time_ago):
```
Attempt to parse a timestamp for a post
LinkedIn doesn't give us the actual timestamp, only a relative indicator like "18h ago". This is annoying because it gets more imprecise the longer ago it is, and because it is language-sensitive. For example, in English 18 months is displayed as "18mo" but in Dutch it is "18 mnd".
Right now this will only adjust the 'collected at' timestamp if the data was scraped from an English or Dutch interface, and even then the timestamps will still be imprecise.
Parameters
- str time_ago: Relative timestamp, e.g. '18mo'.
Returns
int: Estimated timestamp of post, as unix timestamp
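A few illustrative calls, with the arithmetic spelled out (return values are in seconds):

```python
from datasources.linkedin.search_linkedin import SearchLinkedIn

SearchLinkedIn.parse_time_ago("18h •")     # 18 * 3600 = 64800
SearchLinkedIn.parse_time_ago("3w")        # 3 * 7 * 86400 = 1814400
SearchLinkedIn.parse_time_ago("18mo")      # 18 * 30.4375 * 86400 = 47336400.0
SearchLinkedIn.parse_time_ago("Promoted")  # no digits, unknown unit -> 0
```

Promoted posts carry no relative timestamp at all, which is also what map_item uses to derive its is_promoted field.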
Inherited Members
- backend.lib.worker.BasicWorker
  - BasicWorker
  - INTERRUPT_NONE
  - INTERRUPT_RETRY
  - INTERRUPT_CANCEL
  - queue
  - log
  - manager
  - interrupted
  - modules
  - init_time
  - name
  - run
  - clean_up
  - request_interrupt
  - run_interruptable_process
  - get_queue_id
  - is_4cat_class
- backend.lib.search.Search
  - max_workers
  - prefix
  - return_cols
  - import_error_count
  - import_warning_count
  - process
  - search
  - import_from_file
  - items_to_csv
  - items_to_ndjson
  - items_to_archive
- backend.lib.processor.BasicProcessor
  - db
  - job
  - dataset
  - owner
  - source_dataset
  - source_file
  - config
  - is_running_in_preset
  - filepath
  - for_cleanup
  - work
  - after_process
  - clean_up_on_error
  - abort
  - iterate_proxied_requests
  - push_proxied_request
  - flush_proxied_requests
  - unpack_archive_contents
  - extract_archived_file_by_name
  - write_csv_items_and_finish
  - write_archive_and_finish
  - create_standalone
  - save_annotations
  - map_item_method_available
  - get_mapped_item
  - is_filter
  - get_options
  - get_status
  - is_top_dataset
  - is_from_collector
  - get_extension
  - is_rankable
  - exclude_followup_processors
  - is_4cat_processor