Edit on GitHub

datasources.linkedin.search_linkedin

Import scraped LinkedIn data

It's prohibitively difficult to scrape data from LinkedIn within 4CAT itself due to its aggressive rate limiting and login wall. Instead, import data collected elsewhere.

  1"""
  2Import scraped LinkedIn data
  3
  4It's prohibitively difficult to scrape data from LinkedIn within 4CAT itself
  5due to its aggressive rate limiting and login wall. Instead, import data
  6collected elsewhere.
  7"""
  8import datetime
  9import time
 10import re
 11
 12from backend.lib.search import Search
 13from common.lib.item_mapping import MappedItem
 14
 15class SearchLinkedIn(Search):
 16    """
 17    Import scraped LinkedIn data
 18    """
 19    type = "linkedin-search"  # job ID
 20    category = "Search"  # category
 21    title = "Import scraped LinkedIn data"  # title displayed in UI
 22    description = "Import LinkedIn data collected with an external tool such as Zeeschuimer."  # description displayed in UI
 23    extension = "ndjson"  # extension of result file, used internally and in UI
 24    is_from_zeeschuimer = True
 25
 26    # not available as a processor for existing datasets
 27    accepts = [None]
 28    references = [
 29        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
 30        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok) (also explains general usage of Zeeschuimer)"
 31    ]
 32
 33    def get_items(self, query):
 34        """
 35        Run custom search
 36
 37        Not available for LinkedIn
 38        """
 39        raise NotImplementedError("LinkedIn datasets can only be created by importing data from elsewhere")
 40
 41    @staticmethod
 42    def map_item(item):
 43        """
 44        Parse LinkedIn post in Voyager V2 format
 45
 46        'Voyager V2' seems to be how the format is referred to in the data
 47        itself...
 48
 49        :param item:  Data as received from LinkedIn
 50        :return dict:  Mapped item
 51        """
 52
 53        # annoyingly, posts don't come with a timestamp
 54        # approximate it by using the time of collection and the "time ago"
 55        # included with the post (e.g. 'published 18h ago')
 56        if not item.get("actor"):
 57            return {}
 58
 59        if "__import_meta" in item:
 60            time_collected = int(item["__import_meta"]["timestamp_collected"] / 1000)  # milliseconds
 61        else:
 62            # best we got
 63            time_collected = int(time.time())
 64
 65        time_ago = item["actor"]["subDescription"]["text"] if item["actor"].get("subDescription") else ""
 66        timestamp = int(time_collected - SearchLinkedIn.parse_time_ago(time_ago))
 67
 68        # images are stored in some convoluted way
 69        # there are multiple URLs for various thumbnails, use the one for the
 70        # largest version of the image
 71        images = []
 72        if item["content"] and "images" in item["content"]:
 73            for image in item["content"]["images"]:
 74                image_data = image["attributes"][0]["vectorImage"]
 75                artifacts = sorted(image_data["artifacts"], key=lambda x: x["width"], reverse=True)
 76                url = image_data["rootUrl"] + artifacts[0]["fileIdentifyingUrlPathSegment"]
 77                images.append(url)
 78
 79        # or alternatively they are stored here:
 80        if not images and item["content"] and item["content"].get("articleComponent") and item["content"]["articleComponent"].get("largeImage"):
 81            image = item["content"]["articleComponent"]["largeImage"]["attributes"][0]["detailData"]["vectorImage"]
 82            if not image and item["content"]["articleComponent"]["largeImage"]["attributes"][0]["imageUrl"]:
 83                images.append(item["content"]["articleComponent"]["largeImage"]["attributes"][0]["imageUrl"]["url"])
 84            elif image and image.get("artifacts"):
 85                images.append(image["rootUrl"] + image["artifacts"][0]["fileIdentifyingUrlPathSegment"])
 86
 87        author = SearchLinkedIn.get_author(item)
 88
 89        # the ID is in the format 'urn:li:activity:6960882777168695296'
 90        # retain the numerical part as the item ID for 4CAT
 91        # sometimes posts seem to be combined, e.g.:
 92        # urn:li:aggregate:(urn:li:activity:3966023054712791616,urn:li:activity:3965915018238312449)
 93        # effectively both IDs seem to refer to the same post, so just take the
 94        # first one
 95        meta_urn = item.get("updateMetadata", {"urn": item.get("preDashEntityUrn")})["urn"]
 96        urn = "urn:li:activity:" + meta_urn.split("urn:li:activity:")[1].split(",")[0].split(")")[0]
 97        item_id = urn.split(":").pop()
 98
 99        # the way hashtags were stored changed at some point
100        hashtags = []
101        if item["commentary"] and "attributes" in item["commentary"]["text"]:
102            hashtags = [tag["trackingUrn"].split(":").pop() for tag in item["commentary"]["text"].get("attributes", []) if tag["type"] == "HASHTAG"]
103        elif item["commentary"] and "attributesV2" in item["commentary"]["text"]:
104            hashtags = [tag["detailData"]["*hashtag"]["trackingUrn"].split(":").pop() for tag in item["commentary"]["text"].get("attributesV2", []) if "*hashtag" in tag["detailData"]]
105
106        # same for metrics
107        if "*totalSocialActivityCounts" in item["*socialDetail"]:
108            metrics = {
109                "likes": item["*socialDetail"]["*totalSocialActivityCounts"]["numLikes"],
110                "comments": item["*socialDetail"]["*totalSocialActivityCounts"]["numComments"],
111                "shares": item["*socialDetail"]["*totalSocialActivityCounts"]["numShares"]
112            }
113        else:
114            metrics = {
115                "likes": item["*socialDetail"]["likes"]["paging"]["total"],
116                "comments": item["*socialDetail"]["comments"]["paging"]["total"],
117                "shares": item["*socialDetail"]["totalShares"],
118            }
119
120        # and links
121        link_url = ""
122        if item.get("content") and item["content"].get("navigationContext"):
123            link_url = item["content"]["navigationContext"].get("actionTarget", "")
124        elif item.get("content") and item["content"].get("articleComponent") and "navigationContext" in item["content"]["articleComponent"]:
125            link_url = item["content"]["articleComponent"]["navigationContext"].get("actionTarget", "")
126
127        return MappedItem({
128            "id": item_id,
129            "thread_id": item_id,
130            "body": item["commentary"]["text"]["text"] if item["commentary"] else "",
131            "timestamp": datetime.datetime.utcfromtimestamp(timestamp).strftime("%Y-%m-%d %H:%M:%S"),
132            "timestamp_collected": datetime.datetime.utcfromtimestamp(time_collected).strftime("%Y-%m-%d %H:%M:%S"),
133            "timestamp_ago": time_ago.split("•")[0].strip(),
134            "is_promoted": "yes" if not re.findall(r"[0-9]", time_ago) else "no",
135            **{("author_" + k).replace("_username", ""): v for k, v in author.items()},
136            "hashtags": ",".join(hashtags),
137            "image_urls": ",".join(images),
138            "post_url": "https://www.linkedin.com/feed/update/" + urn,
139            "link_url":  link_url,
140            **metrics,
141            "inclusion_context": item["header"]["text"]["text"] if item.get("header") else "",
142            "unix_timestamp": timestamp,
143            "unix_timestamp_collected": time_collected
144        })
145
146    @staticmethod
147    def get_author(post):
148        """
149        Extract author information from post
150
151        This is a bit complicated because it works differently for companies
152        and users and some fields are not always present. Hence, a separate
153        method.
154
155        :param dict post:  Post data
156        :return dict:  Author information
157        """
158        author = {
159            "username": post["actor"]["navigationContext"]["actionTarget"].split("linkedin.com/").pop().split("?")[0],
160            "name": post["actor"]["name"]["text"],
161            "description": post["actor"].get("description", {}).get("text", "") if post["actor"].get("description") else "",
162            "pronouns": "",
163            "avatar_url": "",
164            "is_company": "no",
165            "url": post["actor"]["navigationContext"]["actionTarget"].split("?")[0],
166        }
167
168        # likewise for author avatars
169        if post["actor"]["name"].get("attributes"):
170            if "*miniProfile" in post["actor"]["name"]["attributes"][0]:
171                author_profile = post["actor"]["name"]["attributes"][0]["*miniProfile"]
172                if author_profile["picture"]:
173                    avatar_artifacts = sorted(author_profile["picture"]["artifacts"], key=lambda x: x["width"], reverse=True)
174                    author.update({"avatar_url": author_profile["picture"]["rootUrl"] + avatar_artifacts[0]["fileIdentifyingUrlPathSegment"]})
175
176                if author_profile.get("customPronoun"):
177                    author.update({"pronouns": author_profile.get("customPronoun")})
178                elif author_profile.get("standardizedPronoun"):
179                    author.update({"pronouns": author_profile.get("standardizedPronoun").lower()})
180
181            elif "*miniCompany" in post["actor"]["name"]["attributes"][0]:
182                author_profile = post["actor"]["name"]["attributes"][0]["*miniCompany"]
183                avatar_artifacts = sorted(author_profile["logo"]["artifacts"], key=lambda x: x["width"], reverse=True)
184
185                author.update({"is_company": "yes"})
186                author.update({"avatar_url": author_profile["logo"]["rootUrl"] + avatar_artifacts[0]["fileIdentifyingUrlPathSegment"]})
187
188        if post["actor"]["name"].get("attributesV2"):
189            pronouns = post["actor"]["name"]["attributesV2"][0]["detailData"].get("*profileFullName", {}).get("pronoun")
190            if pronouns:
191                if pronouns.get("customPronoun"):
192                    author.update({"pronouns": pronouns.get("customPronoun")})
193                elif pronouns.get("standardizedPronoun"):
194                    author.update({"pronouns": pronouns.get("standardizedPronoun")})
195
196        avatar = post["actor"]["image"].get("attributes", [{}])[0].get("detailData", {}).get("nonEntityProfilePicture")
197        if avatar and avatar["vectorImage"]:
198            author.update({"avatar_url": avatar["vectorImage"]["rootUrl"] + avatar["vectorImage"]["artifacts"][0]["fileIdentifyingUrlPathSegment"]})
199
200        return author
201
202    @staticmethod
203    def parse_time_ago(time_ago):
204        """
205        Attempt to parse a timestamp for a post
206
207        LinkedIn doesn't give us the actual timestamp, only a relative
208        indicator like "18h ago". This is annoying because it gets more
209        imprecise the longer ago it is, and because it is language-sensitive.
210        For example, in English 18 months is displayed as "18mo" but in Dutch
211        it is "18 mnd".
212
213        Right now this will only adjust the 'collected at' timestamp if the
214        data was scraped from an English or Dutch interface, and even then the
215        timestamps will still be imprecise.
216
217        :param str time_ago:  Relative timestamp, e.g. '18mo'.
218        :return int:  Estimated timestamp of post, as unix timestamp
219        """
220        time_ago = time_ago.split("•")[0]
221        numbers = re.sub(r"[^0-9]", "", time_ago).strip()
222        letters = re.sub(r"[0-9]", "", time_ago).strip()
223
224        period_lengths = {
225            "s": 1,
226            "m": 60,
227            "h": 3600,
228            "d": 86400,
229            "w": 7 * 86400,
230            "mo": 30.4375 * 86400,  # we don't know WHICH months, so use the average length of a month
231            "mnd": 30.4375 * 86400,
232            "yr": 365.25 * 86400,  # likewise
233            "j": 365.25 * 86400,
234        }
235
236        numbers = int(numbers) if len(numbers) else 0
237        return period_lengths.get(letters, 0) * numbers
class SearchLinkedIn(backend.lib.search.Search):
 16class SearchLinkedIn(Search):
 17    """
 18    Import scraped LinkedIn data
 19    """
 20    type = "linkedin-search"  # job ID
 21    category = "Search"  # category
 22    title = "Import scraped LinkedIn data"  # title displayed in UI
 23    description = "Import LinkedIn data collected with an external tool such as Zeeschuimer."  # description displayed in UI
 24    extension = "ndjson"  # extension of result file, used internally and in UI
 25    is_from_zeeschuimer = True
 26
 27    # not available as a processor for existing datasets
 28    accepts = [None]
 29    references = [
 30        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
 31        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok) (also explains general usage of Zeeschuimer)"
 32    ]
 33
 34    def get_items(self, query):
 35        """
 36        Run custom search
 37
 38        Not available for LinkedIn
 39        """
 40        raise NotImplementedError("LinkedIn datasets can only be created by importing data from elsewhere")
 41
 42    @staticmethod
 43    def map_item(item):
 44        """
 45        Parse LinkedIn post in Voyager V2 format
 46
 47        'Voyager V2' seems to be how the format is referred to in the data
 48        itself...
 49
 50        :param item:  Data as received from LinkedIn
 51        :return dict:  Mapped item
 52        """
 53
 54        # annoyingly, posts don't come with a timestamp
 55        # approximate it by using the time of collection and the "time ago"
 56        # included with the post (e.g. 'published 18h ago')
 57        if not item.get("actor"):
 58            return {}
 59
 60        if "__import_meta" in item:
 61            time_collected = int(item["__import_meta"]["timestamp_collected"] / 1000)  # milliseconds
 62        else:
 63            # best we got
 64            time_collected = int(time.time())
 65
 66        time_ago = item["actor"]["subDescription"]["text"] if item["actor"].get("subDescription") else ""
 67        timestamp = int(time_collected - SearchLinkedIn.parse_time_ago(time_ago))
 68
 69        # images are stored in some convoluted way
 70        # there are multiple URLs for various thumbnails, use the one for the
 71        # largest version of the image
 72        images = []
 73        if item["content"] and "images" in item["content"]:
 74            for image in item["content"]["images"]:
 75                image_data = image["attributes"][0]["vectorImage"]
 76                artifacts = sorted(image_data["artifacts"], key=lambda x: x["width"], reverse=True)
 77                url = image_data["rootUrl"] + artifacts[0]["fileIdentifyingUrlPathSegment"]
 78                images.append(url)
 79
 80        # or alternatively they are stored here:
 81        if not images and item["content"] and item["content"].get("articleComponent") and item["content"]["articleComponent"].get("largeImage"):
 82            image = item["content"]["articleComponent"]["largeImage"]["attributes"][0]["detailData"]["vectorImage"]
 83            if not image and item["content"]["articleComponent"]["largeImage"]["attributes"][0]["imageUrl"]:
 84                images.append(item["content"]["articleComponent"]["largeImage"]["attributes"][0]["imageUrl"]["url"])
 85            elif image and image.get("artifacts"):
 86                images.append(image["rootUrl"] + image["artifacts"][0]["fileIdentifyingUrlPathSegment"])
 87
 88        author = SearchLinkedIn.get_author(item)
 89
 90        # the ID is in the format 'urn:li:activity:6960882777168695296'
 91        # retain the numerical part as the item ID for 4CAT
 92        # sometimes posts seem to be combined, e.g.:
 93        # urn:li:aggregate:(urn:li:activity:3966023054712791616,urn:li:activity:3965915018238312449)
 94        # effectively both IDs seem to refer to the same post, so just take the
 95        # first one
 96        meta_urn = item.get("updateMetadata", {"urn": item.get("preDashEntityUrn")})["urn"]
 97        urn = "urn:li:activity:" + meta_urn.split("urn:li:activity:")[1].split(",")[0].split(")")[0]
 98        item_id = urn.split(":").pop()
 99
100        # the way hashtags were stored changed at some point
101        hashtags = []
102        if item["commentary"] and "attributes" in item["commentary"]["text"]:
103            hashtags = [tag["trackingUrn"].split(":").pop() for tag in item["commentary"]["text"].get("attributes", []) if tag["type"] == "HASHTAG"]
104        elif item["commentary"] and "attributesV2" in item["commentary"]["text"]:
105            hashtags = [tag["detailData"]["*hashtag"]["trackingUrn"].split(":").pop() for tag in item["commentary"]["text"].get("attributesV2", []) if "*hashtag" in tag["detailData"]]
106
107        # same for metrics
108        if "*totalSocialActivityCounts" in item["*socialDetail"]:
109            metrics = {
110                "likes": item["*socialDetail"]["*totalSocialActivityCounts"]["numLikes"],
111                "comments": item["*socialDetail"]["*totalSocialActivityCounts"]["numComments"],
112                "shares": item["*socialDetail"]["*totalSocialActivityCounts"]["numShares"]
113            }
114        else:
115            metrics = {
116                "likes": item["*socialDetail"]["likes"]["paging"]["total"],
117                "comments": item["*socialDetail"]["comments"]["paging"]["total"],
118                "shares": item["*socialDetail"]["totalShares"],
119            }
120
121        # and links
122        link_url = ""
123        if item.get("content") and item["content"].get("navigationContext"):
124            link_url = item["content"]["navigationContext"].get("actionTarget", "")
125        elif item.get("content") and item["content"].get("articleComponent") and "navigationContext" in item["content"]["articleComponent"]:
126            link_url = item["content"]["articleComponent"]["navigationContext"].get("actionTarget", "")
127
128        return MappedItem({
129            "id": item_id,
130            "thread_id": item_id,
131            "body": item["commentary"]["text"]["text"] if item["commentary"] else "",
132            "timestamp": datetime.datetime.utcfromtimestamp(timestamp).strftime("%Y-%m-%d %H:%M:%S"),
133            "timestamp_collected": datetime.datetime.utcfromtimestamp(time_collected).strftime("%Y-%m-%d %H:%M:%S"),
134            "timestamp_ago": time_ago.split("•")[0].strip(),
135            "is_promoted": "yes" if not re.findall(r"[0-9]", time_ago) else "no",
136            **{("author_" + k).replace("_username", ""): v for k, v in author.items()},
137            "hashtags": ",".join(hashtags),
138            "image_urls": ",".join(images),
139            "post_url": "https://www.linkedin.com/feed/update/" + urn,
140            "link_url":  link_url,
141            **metrics,
142            "inclusion_context": item["header"]["text"]["text"] if item.get("header") else "",
143            "unix_timestamp": timestamp,
144            "unix_timestamp_collected": time_collected
145        })
146
147    @staticmethod
148    def get_author(post):
149        """
150        Extract author information from post
151
152        This is a bit complicated because it works differently for companies
153        and users and some fields are not always present. Hence, a separate
154        method.
155
156        :param dict post:  Post data
157        :return dict:  Author information
158        """
159        author = {
160            "username": post["actor"]["navigationContext"]["actionTarget"].split("linkedin.com/").pop().split("?")[0],
161            "name": post["actor"]["name"]["text"],
162            "description": post["actor"].get("description", {}).get("text", "") if post["actor"].get("description") else "",
163            "pronouns": "",
164            "avatar_url": "",
165            "is_company": "no",
166            "url": post["actor"]["navigationContext"]["actionTarget"].split("?")[0],
167        }
168
169        # likewise for author avatars
170        if post["actor"]["name"].get("attributes"):
171            if "*miniProfile" in post["actor"]["name"]["attributes"][0]:
172                author_profile = post["actor"]["name"]["attributes"][0]["*miniProfile"]
173                if author_profile["picture"]:
174                    avatar_artifacts = sorted(author_profile["picture"]["artifacts"], key=lambda x: x["width"], reverse=True)
175                    author.update({"avatar_url": author_profile["picture"]["rootUrl"] + avatar_artifacts[0]["fileIdentifyingUrlPathSegment"]})
176
177                if author_profile.get("customPronoun"):
178                    author.update({"pronouns": author_profile.get("customPronoun")})
179                elif author_profile.get("standardizedPronoun"):
180                    author.update({"pronouns": author_profile.get("standardizedPronoun").lower()})
181
182            elif "*miniCompany" in post["actor"]["name"]["attributes"][0]:
183                author_profile = post["actor"]["name"]["attributes"][0]["*miniCompany"]
184                avatar_artifacts = sorted(author_profile["logo"]["artifacts"], key=lambda x: x["width"], reverse=True)
185
186                author.update({"is_company": "yes"})
187                author.update({"avatar_url": author_profile["logo"]["rootUrl"] + avatar_artifacts[0]["fileIdentifyingUrlPathSegment"]})
188
189        if post["actor"]["name"].get("attributesV2"):
190            pronouns = post["actor"]["name"]["attributesV2"][0]["detailData"].get("*profileFullName", {}).get("pronoun")
191            if pronouns:
192                if pronouns.get("customPronoun"):
193                    author.update({"pronouns": pronouns.get("customPronoun")})
194                elif pronouns.get("standardizedPronoun"):
195                    author.update({"pronouns": pronouns.get("standardizedPronoun")})
196
197        avatar = post["actor"]["image"].get("attributes", [{}])[0].get("detailData", {}).get("nonEntityProfilePicture")
198        if avatar and avatar["vectorImage"]:
199            author.update({"avatar_url": avatar["vectorImage"]["rootUrl"] + avatar["vectorImage"]["artifacts"][0]["fileIdentifyingUrlPathSegment"]})
200
201        return author
202
203    @staticmethod
204    def parse_time_ago(time_ago):
205        """
206        Attempt to parse a timestamp for a post
207
208        LinkedIn doesn't give us the actual timestamp, only a relative
209        indicator like "18h ago". This is annoying because it gets more
210        imprecise the longer ago it is, and because it is language-sensitive.
211        For example, in English 18 months is displayed as "18mo" but in Dutch
212        it is "18 mnd".
213
214        Right now this will only adjust the 'collected at' timestamp if the
215        data was scraped from an English or Dutch interface, and even then the
216        timestamps will still be imprecise.
217
218        :param str time_ago:  Relative timestamp, e.g. '18mo'.
219        :return int:  Estimated timestamp of post, as unix timestamp
220        """
221        time_ago = time_ago.split("•")[0]
222        numbers = re.sub(r"[^0-9]", "", time_ago).strip()
223        letters = re.sub(r"[0-9]", "", time_ago).strip()
224
225        period_lengths = {
226            "s": 1,
227            "m": 60,
228            "h": 3600,
229            "d": 86400,
230            "w": 7 * 86400,
231            "mo": 30.4375 * 86400,  # we don't know WHICH months, so use the average length of a month
232            "mnd": 30.4375 * 86400,
233            "yr": 365.25 * 86400,  # likewise
234            "j": 365.25 * 86400,
235        }
236
237        numbers = int(numbers) if len(numbers) else 0
238        return period_lengths.get(letters, 0) * numbers

Import scraped LinkedIn data

type = 'linkedin-search'
category = 'Search'
title = 'Import scraped LinkedIn data'
description = 'Import LinkedIn data collected with an external tool such as Zeeschuimer.'
extension = 'ndjson'
is_from_zeeschuimer = True
accepts = [None]
references = ['[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)', '[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok) (also explains general usage of Zeeschuimer)']
def get_items(self, query):
    """
    Stub for the Search interface's query method.

    LinkedIn data cannot be searched for directly; datasets are created by
    importing externally collected data instead.
    """
    raise NotImplementedError("LinkedIn datasets can only be created by importing data from elsewhere")

Run custom search

Not available for LinkedIn

@staticmethod
def map_item(item):
    """
    Parse LinkedIn post in Voyager V2 format

    'Voyager V2' seems to be how the format is referred to in the data
    itself...

    :param dict item:  Data as received from LinkedIn
    :return:  Mapped item, or an empty dict when the post has no actor
    """
    # posts without an actor cannot be mapped meaningfully
    if not item.get("actor"):
        return {}

    # annoyingly, posts don't come with a timestamp
    # approximate it by using the time of collection and the "time ago"
    # included with the post (e.g. 'published 18h ago')
    if "__import_meta" in item:
        # Zeeschuimer records the collection time in milliseconds
        time_collected = int(item["__import_meta"]["timestamp_collected"] / 1000)
    else:
        # best we got
        time_collected = int(time.time())

    time_ago = item["actor"]["subDescription"]["text"] if item["actor"].get("subDescription") else ""
    timestamp = int(time_collected - SearchLinkedIn.parse_time_ago(time_ago))

    # images are stored in some convoluted way
    # there are multiple URLs for various thumbnails, use the one for the
    # largest version of the image
    images = []
    if item["content"] and "images" in item["content"]:
        for image in item["content"]["images"]:
            image_data = image["attributes"][0]["vectorImage"]
            artifacts = sorted(image_data["artifacts"], key=lambda x: x["width"], reverse=True)
            url = image_data["rootUrl"] + artifacts[0]["fileIdentifyingUrlPathSegment"]
            images.append(url)

    # or alternatively they are stored here (article-style posts):
    if not images and item["content"] and item["content"].get("articleComponent") and item["content"]["articleComponent"].get("largeImage"):
        image = item["content"]["articleComponent"]["largeImage"]["attributes"][0]["detailData"]["vectorImage"]
        if not image and item["content"]["articleComponent"]["largeImage"]["attributes"][0]["imageUrl"]:
            images.append(item["content"]["articleComponent"]["largeImage"]["attributes"][0]["imageUrl"]["url"])
        elif image and image.get("artifacts"):
            images.append(image["rootUrl"] + image["artifacts"][0]["fileIdentifyingUrlPathSegment"])

    author = SearchLinkedIn.get_author(item)

    # the ID is in the format 'urn:li:activity:6960882777168695296'
    # retain the numerical part as the item ID for 4CAT
    # sometimes posts seem to be combined, e.g.:
    # urn:li:aggregate:(urn:li:activity:3966023054712791616,urn:li:activity:3965915018238312449)
    # effectively both IDs seem to refer to the same post, so just take the
    # first one
    meta_urn = item.get("updateMetadata", {"urn": item.get("preDashEntityUrn")})["urn"]
    urn = "urn:li:activity:" + meta_urn.split("urn:li:activity:")[1].split(",")[0].split(")")[0]
    item_id = urn.split(":").pop()

    # the way hashtags were stored changed at some point; support both
    hashtags = []
    if item["commentary"] and "attributes" in item["commentary"]["text"]:
        hashtags = [tag["trackingUrn"].split(":").pop() for tag in item["commentary"]["text"].get("attributes", []) if tag["type"] == "HASHTAG"]
    elif item["commentary"] and "attributesV2" in item["commentary"]["text"]:
        hashtags = [tag["detailData"]["*hashtag"]["trackingUrn"].split(":").pop() for tag in item["commentary"]["text"].get("attributesV2", []) if "*hashtag" in tag["detailData"]]

    # same for metrics
    if "*totalSocialActivityCounts" in item["*socialDetail"]:
        metrics = {
            "likes": item["*socialDetail"]["*totalSocialActivityCounts"]["numLikes"],
            "comments": item["*socialDetail"]["*totalSocialActivityCounts"]["numComments"],
            "shares": item["*socialDetail"]["*totalSocialActivityCounts"]["numShares"]
        }
    else:
        metrics = {
            "likes": item["*socialDetail"]["likes"]["paging"]["total"],
            "comments": item["*socialDetail"]["comments"]["paging"]["total"],
            "shares": item["*socialDetail"]["totalShares"],
        }

    # and links
    link_url = ""
    if item.get("content") and item["content"].get("navigationContext"):
        link_url = item["content"]["navigationContext"].get("actionTarget", "")
    elif item.get("content") and item["content"].get("articleComponent") and "navigationContext" in item["content"]["articleComponent"]:
        link_url = item["content"]["articleComponent"]["navigationContext"].get("actionTarget", "")

    return MappedItem({
        "id": item_id,
        "thread_id": item_id,
        "body": item["commentary"]["text"]["text"] if item["commentary"] else "",
        # datetime.utcfromtimestamp() is deprecated since Python 3.12;
        # fromtimestamp() with an explicit UTC timezone formats identically
        "timestamp": datetime.datetime.fromtimestamp(timestamp, tz=datetime.timezone.utc).strftime("%Y-%m-%d %H:%M:%S"),
        "timestamp_collected": datetime.datetime.fromtimestamp(time_collected, tz=datetime.timezone.utc).strftime("%Y-%m-%d %H:%M:%S"),
        "timestamp_ago": time_ago.split("•")[0].strip(),
        # promoted posts carry a label like 'Promoted' instead of a relative
        # time, i.e. no digits
        "is_promoted": "yes" if not re.findall(r"[0-9]", time_ago) else "no",
        **{("author_" + k).replace("_username", ""): v for k, v in author.items()},
        "hashtags": ",".join(hashtags),
        "image_urls": ",".join(images),
        "post_url": "https://www.linkedin.com/feed/update/" + urn,
        "link_url":  link_url,
        **metrics,
        "inclusion_context": item["header"]["text"]["text"] if item.get("header") else "",
        "unix_timestamp": timestamp,
        "unix_timestamp_collected": time_collected
    })

Parse LinkedIn post in Voyager V2 format

'Voyager V2' seems to be how the format is referred to in the data itself...

Parameters
  • item: Data as received from LinkedIn
Returns
  • dict: the mapped item

@staticmethod
def get_author(post):
147    @staticmethod
148    def get_author(post):
149        """
150        Extract author information from post
151
152        This is a bit complicated because it works differently for companies
153        and users and some fields are not always present. Hence, a separate
154        method.
155
156        :param dict post:  Post data
157        :return dict:  Author information
158        """
159        author = {
160            "username": post["actor"]["navigationContext"]["actionTarget"].split("linkedin.com/").pop().split("?")[0],
161            "name": post["actor"]["name"]["text"],
162            "description": post["actor"].get("description", {}).get("text", "") if post["actor"].get("description") else "",
163            "pronouns": "",
164            "avatar_url": "",
165            "is_company": "no",
166            "url": post["actor"]["navigationContext"]["actionTarget"].split("?")[0],
167        }
168
169        # likewise for author avatars
170        if post["actor"]["name"].get("attributes"):
171            if "*miniProfile" in post["actor"]["name"]["attributes"][0]:
172                author_profile = post["actor"]["name"]["attributes"][0]["*miniProfile"]
173                if author_profile["picture"]:
174                    avatar_artifacts = sorted(author_profile["picture"]["artifacts"], key=lambda x: x["width"], reverse=True)
175                    author.update({"avatar_url": author_profile["picture"]["rootUrl"] + avatar_artifacts[0]["fileIdentifyingUrlPathSegment"]})
176
177                if author_profile.get("customPronoun"):
178                    author.update({"pronouns": author_profile.get("customPronoun")})
179                elif author_profile.get("standardizedPronoun"):
180                    author.update({"pronouns": author_profile.get("standardizedPronoun").lower()})
181
182            elif "*miniCompany" in post["actor"]["name"]["attributes"][0]:
183                author_profile = post["actor"]["name"]["attributes"][0]["*miniCompany"]
184                avatar_artifacts = sorted(author_profile["logo"]["artifacts"], key=lambda x: x["width"], reverse=True)
185
186                author.update({"is_company": "yes"})
187                author.update({"avatar_url": author_profile["logo"]["rootUrl"] + avatar_artifacts[0]["fileIdentifyingUrlPathSegment"]})
188
189        if post["actor"]["name"].get("attributesV2"):
190            pronouns = post["actor"]["name"]["attributesV2"][0]["detailData"].get("*profileFullName", {}).get("pronoun")
191            if pronouns:
192                if pronouns.get("customPronoun"):
193                    author.update({"pronouns": pronouns.get("customPronoun")})
194                elif pronouns.get("standardizedPronoun"):
195                    author.update({"pronouns": pronouns.get("standardizedPronoun")})
196
197        avatar = post["actor"]["image"].get("attributes", [{}])[0].get("detailData", {}).get("nonEntityProfilePicture")
198        if avatar and avatar["vectorImage"]:
199            author.update({"avatar_url": avatar["vectorImage"]["rootUrl"] + avatar["vectorImage"]["artifacts"][0]["fileIdentifyingUrlPathSegment"]})
200
201        return author

Extract author information from post

This is a bit complicated because it works differently for companies and users and some fields are not always present. Hence, a separate method.

Parameters
  • dict post: Post data
Returns

Author information

@staticmethod
def parse_time_ago(time_ago):
203    @staticmethod
204    def parse_time_ago(time_ago):
205        """
206        Attempt to parse a timestamp for a post
207
208        LinkedIn doesn't give us the actual timestamp, only a relative
209        indicator like "18h ago". This is annoying because it gets more
210        imprecise the longer ago it is, and because it is language-sensitive.
211        For example, in English 18 months is displayed as "18mo" but in Dutch
212        it is "18 mnd".
213
214        Right now this will only adjust the 'collected at' timestamp if the
215        data was scraped from an English or Dutch interface, and even then the
216        timestamps will still be imprecise.
217
218        :param str time_ago:  Relative timestamp, e.g. '18mo'.
219        :return int:  Estimated timestamp of post, as unix timestamp
220        """
221        time_ago = time_ago.split("•")[0]
222        numbers = re.sub(r"[^0-9]", "", time_ago).strip()
223        letters = re.sub(r"[0-9]", "", time_ago).strip()
224
225        period_lengths = {
226            "s": 1,
227            "m": 60,
228            "h": 3600,
229            "d": 86400,
230            "w": 7 * 86400,
231            "mo": 30.4375 * 86400,  # we don't know WHICH months, so use the average length of a month
232            "mnd": 30.4375 * 86400,
233            "yr": 365.25 * 86400,  # likewise
234            "j": 365.25 * 86400,
235        }
236
237        numbers = int(numbers) if len(numbers) else 0
238        return period_lengths.get(letters, 0) * numbers

Attempt to parse a timestamp for a post

LinkedIn doesn't give us the actual timestamp, only a relative indicator like "18h ago". This is annoying because it gets more imprecise the longer ago it is, and because it is language-sensitive. For example, in English 18 months is displayed as "18mo" but in Dutch it is "18 mnd".

Right now this will only adjust the 'collected at' timestamp if the data was scraped from an English or Dutch interface, and even then the timestamps will still be imprecise.

Parameters
  • str time_ago: Relative timestamp, e.g. '18mo'.
Returns

Estimated age of the post in seconds — a relative duration, not an absolute unix timestamp; 0 if the indicator could not be parsed (e.g. promoted posts or an unrecognised language)