datasources.linkedin.search_linkedin
Import scraped LinkedIn data
It's prohibitively difficult to scrape data from LinkedIn within 4CAT itself due to its aggressive rate limiting and login wall. Instead, import data collected elsewhere.
1""" 2Import scraped LinkedIn data 3 4It's prohibitively difficult to scrape data from LinkedIn within 4CAT itself 5due to its aggressive rate limiting and login wall. Instead, import data 6collected elsewhere. 7""" 8import datetime 9import time 10import re 11 12from backend.lib.search import Search 13from common.lib.item_mapping import MappedItem 14 15class SearchLinkedIn(Search): 16 """ 17 Import scraped LinkedIn data 18 """ 19 type = "linkedin-search" # job ID 20 category = "Search" # category 21 title = "Import scraped LinkedIn data" # title displayed in UI 22 description = "Import LinkedIn data collected with an external tool such as Zeeschuimer." # description displayed in UI 23 extension = "ndjson" # extension of result file, used internally and in UI 24 is_from_zeeschuimer = True 25 26 # not available as a processor for existing datasets 27 accepts = [None] 28 references = [ 29 "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)", 30 "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok) (also explains general usage of Zeeschuimer)" 31 ] 32 33 def get_items(self, query): 34 """ 35 Run custom search 36 37 Not available for LinkedIn 38 """ 39 raise NotImplementedError("LinkedIn datasets can only be created by importing data from elsewhere") 40 41 @staticmethod 42 def map_item(item): 43 """ 44 Parse LinkedIn post in Voyager V2 format 45 46 'Voyager V2' seems to be how the format is referred to in the data 47 itself... 48 49 :param item: Data as received from LinkedIn 50 :return dict: Mapped item 51 """ 52 53 # annoyingly, posts don't come with a timestamp 54 # approximate it by using the time of collection and the "time ago" 55 # included with the post (e.g. 
'published 18h ago') 56 if not item.get("actor"): 57 return {} 58 59 if "__import_meta" in item: 60 time_collected = int(item["__import_meta"]["timestamp_collected"] / 1000) # milliseconds 61 else: 62 # best we got 63 time_collected = int(time.time()) 64 65 time_ago = item["actor"]["subDescription"]["text"] if item["actor"].get("subDescription") else "" 66 timestamp = int(time_collected - SearchLinkedIn.parse_time_ago(time_ago)) 67 68 # images are stored in some convoluted way 69 # there are multiple URLs for various thumbnails, use the one for the 70 # largest version of the image 71 images = [] 72 if item["content"] and "images" in item["content"]: 73 for image in item["content"]["images"]: 74 image_data = image["attributes"][0]["vectorImage"] 75 artifacts = sorted(image_data["artifacts"], key=lambda x: x["width"], reverse=True) 76 url = image_data["rootUrl"] + artifacts[0]["fileIdentifyingUrlPathSegment"] 77 images.append(url) 78 79 # or alternatively they are stored here: 80 if not images and item["content"] and item["content"].get("articleComponent") and item["content"]["articleComponent"].get("largeImage"): 81 image = item["content"]["articleComponent"]["largeImage"]["attributes"][0]["detailData"]["vectorImage"] 82 if not image and item["content"]["articleComponent"]["largeImage"]["attributes"][0]["imageUrl"]: 83 images.append(item["content"]["articleComponent"]["largeImage"]["attributes"][0]["imageUrl"]["url"]) 84 elif image and image.get("artifacts"): 85 images.append(image["rootUrl"] + image["artifacts"][0]["fileIdentifyingUrlPathSegment"]) 86 87 author = SearchLinkedIn.get_author(item) 88 89 # the ID is in the format 'urn:li:activity:6960882777168695296' 90 # retain the numerical part as the item ID for 4CAT 91 # sometimes posts seem to be combined, e.g.: 92 # urn:li:aggregate:(urn:li:activity:3966023054712791616,urn:li:activity:3965915018238312449) 93 # effectively both IDs seem to refer to the same post, so just take the 94 # first one 95 meta_urn = item.get("updateMetadata", {"urn": item.get("preDashEntityUrn")})["urn"] 96 urn = "urn:li:activity:" + meta_urn.split("urn:li:activity:")[1].split(",")[0].split(")")[0] 97 item_id = urn.split(":").pop() 98 99 # the way hashtags were stored changed at some point 100 hashtags = [] 101 if item["commentary"] and "attributes" in item["commentary"]["text"]: 102 hashtags = [tag["trackingUrn"].split(":").pop() for tag in item["commentary"]["text"].get("attributes", []) if tag["type"] == "HASHTAG"] 103 elif item["commentary"] and "attributesV2" in item["commentary"]["text"]: 104 hashtags = [tag["detailData"]["*hashtag"]["trackingUrn"].split(":").pop() for tag in item["commentary"]["text"].get("attributesV2", []) if "*hashtag" in tag["detailData"]] 105 106 # same for metrics 107 if "*totalSocialActivityCounts" in item["*socialDetail"]: 108 metrics = { 109 "likes": item["*socialDetail"]["*totalSocialActivityCounts"]["numLikes"], 110 "comments": item["*socialDetail"]["*totalSocialActivityCounts"]["numComments"], 111 "shares": item["*socialDetail"]["*totalSocialActivityCounts"]["numShares"] 112 } 113 else: 114 metrics = { 115 "likes": item["*socialDetail"]["likes"]["paging"]["total"], 116 "comments": item["*socialDetail"]["comments"]["paging"]["total"], 117 "shares": item["*socialDetail"]["totalShares"], 118 } 119 120 # and links 121 link_url = "" 122 if item.get("content") and item["content"].get("navigationContext"): 123 link_url = item["content"]["navigationContext"].get("actionTarget", "") 124 elif item.get("content") and 
item["content"].get("articleComponent") and "navigationContext" in item["content"]["articleComponent"]: 125 link_url = item["content"]["articleComponent"]["navigationContext"].get("actionTarget", "") 126 127 return MappedItem({ 128 "id": item_id, 129 "thread_id": item_id, 130 "body": item["commentary"]["text"]["text"] if item["commentary"] else "", 131 "timestamp": datetime.datetime.utcfromtimestamp(timestamp).strftime("%Y-%m-%d %H:%M:%S"), 132 "timestamp_collected": datetime.datetime.utcfromtimestamp(time_collected).strftime("%Y-%m-%d %H:%M:%S"), 133 "timestamp_ago": time_ago.split("•")[0].strip(), 134 "is_promoted": "yes" if not re.findall(r"[0-9]", time_ago) else "no", 135 **{("author_" + k).replace("_username", ""): v for k, v in author.items()}, 136 "hashtags": ",".join(hashtags), 137 "image_urls": ",".join(images), 138 "post_url": "https://www.linkedin.com/feed/update/" + urn, 139 "link_url": link_url, 140 **metrics, 141 "inclusion_context": item["header"]["text"]["text"] if item.get("header") else "", 142 "unix_timestamp": timestamp, 143 "unix_timestamp_collected": time_collected 144 }) 145 146 @staticmethod 147 def get_author(post): 148 """ 149 Extract author information from post 150 151 This is a bit complicated because it works differently for companies 152 and users and some fields are not always present. Hence, a separate 153 method. 154 155 :param dict post: Post data 156 :return dict: Author information 157 """ 158 author = { 159 "username": post["actor"]["navigationContext"]["actionTarget"].split("linkedin.com/").pop().split("?")[0], 160 "name": post["actor"]["name"]["text"], 161 "description": post["actor"].get("description", {}).get("text", "") if post["actor"].get("description") else "", 162 "pronouns": "", 163 "avatar_url": "", 164 "is_company": "no", 165 "url": post["actor"]["navigationContext"]["actionTarget"].split("?")[0], 166 } 167 168 # likewise for author avatars 169 if post["actor"]["name"].get("attributes"): 170 if "*miniProfile" in post["actor"]["name"]["attributes"][0]: 171 author_profile = post["actor"]["name"]["attributes"][0]["*miniProfile"] 172 if author_profile["picture"]: 173 avatar_artifacts = sorted(author_profile["picture"]["artifacts"], key=lambda x: x["width"], reverse=True) 174 author.update({"avatar_url": author_profile["picture"]["rootUrl"] + avatar_artifacts[0]["fileIdentifyingUrlPathSegment"]}) 175 176 if author_profile.get("customPronoun"): 177 author.update({"pronouns": author_profile.get("customPronoun")}) 178 elif author_profile.get("standardizedPronoun"): 179 author.update({"pronouns": author_profile.get("standardizedPronoun").lower()}) 180 181 elif "*miniCompany" in post["actor"]["name"]["attributes"][0]: 182 author_profile = post["actor"]["name"]["attributes"][0]["*miniCompany"] 183 avatar_artifacts = sorted(author_profile["logo"]["artifacts"], key=lambda x: x["width"], reverse=True) 184 185 author.update({"is_company": "yes"}) 186 author.update({"avatar_url": author_profile["logo"]["rootUrl"] + avatar_artifacts[0]["fileIdentifyingUrlPathSegment"]}) 187 188 if post["actor"]["name"].get("attributesV2"): 189 pronouns = post["actor"]["name"]["attributesV2"][0]["detailData"].get("*profileFullName", {}).get("pronoun") 190 if pronouns: 191 if pronouns.get("customPronoun"): 192 author.update({"pronouns": pronouns.get("customPronoun")}) 193 elif pronouns.get("standardizedPronoun"): 194 author.update({"pronouns": pronouns.get("standardizedPronoun")}) 195 196 avatar = post["actor"]["image"].get("attributes", [{}])[0].get("detailData", 
{}).get("nonEntityProfilePicture") 197 if avatar and avatar["vectorImage"]: 198 author.update({"avatar_url": avatar["vectorImage"]["rootUrl"] + avatar["vectorImage"]["artifacts"][0]["fileIdentifyingUrlPathSegment"]}) 199 200 return author 201 202 @staticmethod 203 def parse_time_ago(time_ago): 204 """ 205 Attempt to parse a timestamp for a post 206 207 LinkedIn doesn't give us the actual timestamp, only a relative 208 indicator like "18h ago". This is annoying because it gets more 209 imprecise the longer ago it is, and because it is language-sensitive. 210 For example, in English 18 months is displayed as "18mo" but in Dutch 211 it is "18 mnd". 212 213 Right now this will only adjust the 'collected at' timestamp if the 214 data was scraped from an English or Dutch interface, and even then the 215 timestamps will still be imprecise. 216 217 :param str time_ago: Relative timestamp, e.g. '18mo'. 218 :return int: Estimated timestamp of post, as unix timestamp 219 """ 220 time_ago = time_ago.split("•")[0] 221 numbers = re.sub(r"[^0-9]", "", time_ago).strip() 222 letters = re.sub(r"[0-9]", "", time_ago).strip() 223 224 period_lengths = { 225 "s": 1, 226 "m": 60, 227 "h": 3600, 228 "d": 86400, 229 "w": 7 * 86400, 230 "mo": 30.4375 * 86400, # we don't know WHICH months, so use the average length of a month 231 "mnd": 30.4375 * 86400, 232 "yr": 365.25 * 86400, # likewise 233 "j": 365.25 * 86400, 234 } 235 236 numbers = int(numbers) if len(numbers) else 0 237 return period_lengths.get(letters, 0) * numbers
class SearchLinkedIn(Search):
Import scraped LinkedIn data
def get_items(self, query):
Run custom search
Not available for LinkedIn
@staticmethod
def map_item(item):
Parse LinkedIn post in Voyager V2 format
'Voyager V2' seems to be how the format is referred to in the data itself...
Parameters
- item: Data as received from LinkedIn
Returns
Mapped item
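To make the mapping concrete, below is a hypothetical, heavily trimmed item in the shape `map_item` consumes. Every value (the name, URN, counts, the `#4cat` hashtag, the collection timestamp) is invented for illustration; real Voyager V2 payloads carry many more fields.

```python
from datasources.linkedin.search_linkedin import SearchLinkedIn

# Hypothetical, heavily trimmed post for illustration only.
item = {
    "actor": {
        "name": {"text": "Example Person"},
        "subDescription": {"text": "18h • Edited"},  # relative "time ago" label
        "navigationContext": {"actionTarget": "https://www.linkedin.com/in/example-person?trk=feed"},
        "image": {},
    },
    "commentary": {"text": {
        "text": "Hello #4cat",
        "attributes": [{"type": "HASHTAG", "trackingUrn": "urn:li:hashtag:4cat"}],
    }},
    "content": None,
    "updateMetadata": {"urn": "urn:li:activity:6960882777168695296"},
    "*socialDetail": {"*totalSocialActivityCounts": {"numLikes": 5, "numComments": 1, "numShares": 0}},
    "__import_meta": {"timestamp_collected": 1660000000000},  # collection time in milliseconds (see map_item)
}

mapped = SearchLinkedIn.map_item(item)
# The wrapped dictionary should then contain, among other fields:
#   id / thread_id: "6960882777168695296"
#   body: "Hello #4cat", hashtags: "4cat", likes: 5, comments: 1, shares: 0
#   post_url: "https://www.linkedin.com/feed/update/urn:li:activity:6960882777168695296"
#   unix_timestamp: 1660000000 - 18 * 3600 (collection time minus the "18h" offset)
```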
@staticmethod
def parse_time_ago(time_ago):
Attempt to parse a timestamp for a post
LinkedIn doesn't give us the actual timestamp, only a relative indicator like "18h ago". This is annoying because it gets more imprecise the longer ago it is, and because it is language-sensitive. For example, in English 18 months is displayed as "18mo" but in Dutch it is "18 mnd".
Right now this will only adjust the 'collected at' timestamp if the data was scraped from an English or Dutch interface, and even then the timestamps will still be imprecise.
Parameters
- str time_ago: Relative timestamp, e.g. '18mo'.
Returns
Estimated timestamp of post, as unix timestamp
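Note that in practice the method returns an offset in seconds, which `map_item` subtracts from the collection timestamp to estimate the posting time. As a rough illustration of the heuristic (the inputs below are made up):

```python
from datasources.linkedin.search_linkedin import SearchLinkedIn

SearchLinkedIn.parse_time_ago("18h • Edited")  # 18 * 3600 = 64800
SearchLinkedIn.parse_time_ago("3w")            # 3 * 7 * 86400 = 1814400
SearchLinkedIn.parse_time_ago("2mo")           # 2 * 30.4375 * 86400 = 5259600.0
SearchLinkedIn.parse_time_ago("vor 2 Jahren")  # unrecognised (e.g. German) suffix -> 0
```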
Inherited Members
- backend.lib.worker.BasicWorker
  - BasicWorker
  - INTERRUPT_NONE
  - INTERRUPT_RETRY
  - INTERRUPT_CANCEL
  - queue
  - log
  - manager
  - interrupted
  - modules
  - init_time
  - name
  - run
  - clean_up
  - request_interrupt
  - is_4cat_class
- backend.lib.search.Search
  - max_workers
  - prefix
  - return_cols
  - import_error_count
  - import_warning_count
  - process
  - search
  - import_from_file
  - items_to_csv
  - items_to_ndjson
  - items_to_archive
- backend.lib.processor.BasicProcessor
  - db
  - job
  - dataset
  - owner
  - source_dataset
  - source_file
  - config
  - is_running_in_preset
  - filepath
  - work
  - after_process
  - remove_files
  - abort
  - add_field_to_parent
  - iterate_archive_contents
  - unpack_archive_contents
  - extract_archived_file_by_name
  - write_csv_items_and_finish
  - write_archive_and_finish
  - create_standalone
  - map_item_method_available
  - get_mapped_item
  - is_filter
  - get_options
  - get_status
  - is_top_dataset
  - is_from_collector
  - get_extension
  - is_rankable
  - exclude_followup_processors
  - is_4cat_processor