Edit on GitHub

datasources.linkedin.search_linkedin

Import scraped LinkedIn data

It's prohibitively difficult to scrape data from LinkedIn within 4CAT itself due to its aggressive rate limiting and login wall. Instead, import data collected elsewhere.

  1"""
  2Import scraped LinkedIn data
  3
  4It's prohibitively difficult to scrape data from LinkedIn within 4CAT itself
  5due to its aggressive rate limiting and login wall. Instead, import data
  6collected elsewhere.
  7"""
  8import datetime
  9import time
 10import re
 11
 12from backend.lib.search import Search
 13from common.lib.item_mapping import MappedItem
 14from common.lib.helpers import normalize_url_encoding
 15
 16class SearchLinkedIn(Search):
 17    """
 18    Import scraped LinkedIn data
 19    """
 20    type = "linkedin-search"  # job ID
 21    category = "Search"  # category
 22    title = "Import scraped LinkedIn data"  # title displayed in UI
 23    description = "Import LinkedIn data collected with an external tool such as Zeeschuimer."  # description displayed in UI
 24    extension = "ndjson"  # extension of result file, used internally and in UI
 25    is_from_zeeschuimer = True
 26
 27    # not available as a processor for existing datasets
 28    accepts = [None]
 29    references = [
 30        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
 31        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok) (also explains general usage of Zeeschuimer)"
 32    ]
 33
 34    def get_items(self, query):
 35        """
 36        Run custom search
 37
 38        Not available for LinkedIn
 39        """
 40        raise NotImplementedError("LinkedIn datasets can only be created by importing data from elsewhere")
 41
 42    @staticmethod
 43    def map_item(item):
 44        """
 45        Parse LinkedIn post in Voyager V2 format
 46
 47        'Voyager V2' seems to be how the format is referred to in the data
 48        itself...
 49
 50        :param item:  Data as received from LinkedIn
 51        :return dict:  Mapped item
 52        """
 53
 54        # annoyingly, posts don't come with a timestamp
 55        # approximate it by using the time of collection and the "time ago"
 56        # included with the post (e.g. 'published 18h ago')
 57        if not item.get("actor"):
 58            return {}
 59
 60        if "__import_meta" in item:
 61            time_collected = int(item["__import_meta"]["timestamp_collected"] / 1000)  # milliseconds
 62        else:
 63            # best we got
 64            time_collected = int(time.time())
 65
 66        time_ago = item["actor"]["subDescription"]["text"] if item["actor"].get("subDescription") else ""
 67        timestamp = int(time_collected - SearchLinkedIn.parse_time_ago(time_ago))
 68
 69        # images are stored in some convoluted way
 70        # there are multiple URLs for various thumbnails, use the one for the
 71        # largest version of the image
 72        images = []
 73        if item["content"] and "images" in item["content"]:
 74            for image in item["content"]["images"]:
 75                image_data = image["attributes"][0]["vectorImage"]
 76                artifacts = sorted(image_data["artifacts"], key=lambda x: x["width"], reverse=True)
 77                url = image_data["rootUrl"] + artifacts[0]["fileIdentifyingUrlPathSegment"]
 78                images.append(url)
 79
 80        # or alternatively they are stored here:
 81        if not images and item["content"] and item["content"].get("articleComponent") and item["content"]["articleComponent"].get("largeImage"):
 82            image = item["content"]["articleComponent"]["largeImage"]["attributes"][0]["detailData"]["vectorImage"]
 83            if not image and item["content"]["articleComponent"]["largeImage"]["attributes"][0]["imageUrl"]:
 84                images.append(item["content"]["articleComponent"]["largeImage"]["attributes"][0]["imageUrl"]["url"])
 85            elif image and image.get("artifacts"):
 86                images.append(image["rootUrl"] + image["artifacts"][0]["fileIdentifyingUrlPathSegment"])
 87
 88        # video thumbnails are stored similarly as image data
 89        video_thumb_url = ""
 90        thumb_content = None
 91        if item["content"] and "*videoPlayMetadata" in item["content"]:
 92            thumb_content = item["content"]["*videoPlayMetadata"]["thumbnail"]
 93        elif item["content"] and "linkedInVideoComponent" in item["content"] and item["content"]["linkedInVideoComponent"]:
 94            thumb_content = item["content"]["linkedInVideoComponent"]["*videoPlayMetadata"]["thumbnail"]
 95        elif item["content"] and "externalVideoComponent" in item["content"] and item["content"]["externalVideoComponent"]:
 96            thumb_content = item["content"]["externalVideoComponent"]["*videoPlayMetadata"]["thumbnail"]
 97        if thumb_content:
 98            video_thumb_url = thumb_content["rootUrl"] + thumb_content["artifacts"][0]["fileIdentifyingUrlPathSegment"]
 99
100        author = SearchLinkedIn.get_author(item)
101
102        # the ID is in the format 'urn:li:activity:6960882777168695296'
103        # retain the numerical part as the item ID for 4CAT
104        # sometimes posts seem to be combined, e.g.:
105        # urn:li:aggregate:(urn:li:activity:3966023054712791616,urn:li:activity:3965915018238312449)
106        # effectively both IDs seem to refer to the same post, so just take the
107        # first one
108        meta_urn = item.get("updateMetadata", {"urn": item.get("preDashEntityUrn")})["urn"]
109        urn = "urn:li:activity:" + meta_urn.split("urn:li:activity:")[1].split(",")[0].split(")")[0]
110        item_id = urn.split(":").pop()
111
112        # the way hashtags were stored changed at some point
113        hashtags = []
114        if item["commentary"] and "attributes" in item["commentary"]["text"]:
115            hashtags = [tag["trackingUrn"].split(":").pop() for tag in item["commentary"]["text"].get("attributes", []) if tag["type"] == "HASHTAG"]
116        elif item["commentary"] and "attributesV2" in item["commentary"]["text"]:
117            hashtags = [tag["detailData"]["*hashtag"]["trackingUrn"].split(":").pop() for tag in item["commentary"]["text"].get("attributesV2", []) if "*hashtag" in tag["detailData"]]
118
119        # and mentions
120        # we're storing both usernames and full names
121        author_mentions = []
122        author_name_mentions = []
123        if item["commentary"] and "attributes" in item["commentary"]["text"]:
124            for mention in item["commentary"]["text"].get("attributes", {}):
125                if mention["type"] == "PROFILE_MENTION":
126                    mention = mention["*miniProfile"]
127                    author_mentions.append(mention["publicIdentifier"])         
128                    author_name_mentions.append(" ".join([mention.get("firstName", ""), mention.get("lastName", "")]))
129                elif mention["type"] == "COMPANY_NAME":
130                    mention = mention["*miniCompany"]
131                    author_mentions.append(mention["universalName"])         
132                    author_name_mentions.append(mention.get("name", ""))
133
134        # same for metrics
135        if "*totalSocialActivityCounts" in item["*socialDetail"]:
136            metrics = {
137                "comments": item["*socialDetail"]["*totalSocialActivityCounts"]["numComments"],
138                "shares": item["*socialDetail"]["*totalSocialActivityCounts"]["numShares"],
139                "reactions": item["*socialDetail"]["*totalSocialActivityCounts"]["numLikes"],
140                "reaction_like": 0,
141                "reaction_empathy": 0,
142                "reaction_praise": 0,
143                "reaction_entertainment": 0,
144                "reaction_appreciation": 0,
145                "reaction_interest": 0
146                }
147            # There's different kind of reaction metrics
148            for reaction_type in item["*socialDetail"]["*totalSocialActivityCounts"].get("reactionTypeCounts", []):
149                metrics["reaction_" + reaction_type["reactionType"].lower()] = reaction_type["count"]
150
151        else:
152            metrics = {
153                "comments": item["*socialDetail"]["comments"]["paging"]["total"],
154                "shares": item["*socialDetail"]["totalShares"],
155                "reactions": item["*socialDetail"]["likes"]["paging"]["total"]
156            }
157
158        # and links
159        link_url = ""
160        if item.get("content") and item["content"].get("navigationContext"):
161            link_url = item["content"]["navigationContext"].get("actionTarget", "")
162        elif item.get("content") and item["content"].get("articleComponent") and "navigationContext" in item["content"]["articleComponent"]:
163            link_url = item["content"]["articleComponent"]["navigationContext"].get("actionTarget", "")
164
165        return MappedItem({
166            "collected_from_url": normalize_url_encoding(item.get("__import_meta", {}).get("source_platform_url", "")),  # Zeeschuimer metadata
167            "id": item_id,
168            "thread_id": item_id,
169            "body": item["commentary"]["text"]["text"] if item["commentary"] else "",
170            "timestamp": datetime.datetime.utcfromtimestamp(timestamp).strftime("%Y-%m-%d %H:%M:%S"),
171            "timestamp_collected": datetime.datetime.utcfromtimestamp(time_collected).strftime("%Y-%m-%d %H:%M:%S"),
172            "timestamp_ago": time_ago.split("•")[0].strip(),
173            "is_promoted": "yes" if not re.findall(r"[0-9]", time_ago) else "no",
174            **{("author_" + k).replace("_username", ""): v for k, v in author.items()},
175            "author_mentions": ",".join(author_mentions),
176            "author_name_mentions": ",".join(author_name_mentions),
177            "hashtags": ",".join(hashtags),
178            "image_urls": ",".join(images),
179            "video_thumb_url": video_thumb_url,
180            "post_url": "https://www.linkedin.com/feed/update/" + urn,
181            "link_url":  link_url,
182            **metrics,
183            "inclusion_context": item["header"]["text"]["text"] if item.get("header") else "",
184            "unix_timestamp": timestamp,
185            "unix_timestamp_collected": time_collected
186        })
187
188    @staticmethod
189    def get_author(post):
190        """
191        Extract author information from post
192
193        This is a bit complicated because it works differently for companies
194        and users and some fields are not always present. Hence, a separate
195        method.
196
197        :param dict post:  Post data
198        :return dict:  Author information
199        """
200        author = {
201            "username": post["actor"]["navigationContext"]["actionTarget"].split("linkedin.com/").pop().split("?")[0],
202            "name": post["actor"]["name"]["text"],
203            "description": post["actor"].get("description", {}).get("text", "") if post["actor"].get("description") else "",
204            "pronouns": "",
205            "avatar_url": "",
206            "is_company": "no",
207            "url": post["actor"]["navigationContext"]["actionTarget"].split("?")[0],
208        }
209
210        # likewise for author avatars
211        if post["actor"]["name"].get("attributes"):
212            if "*miniProfile" in post["actor"]["name"]["attributes"][0]:
213                author_profile = post["actor"]["name"]["attributes"][0]["*miniProfile"]
214                if author_profile["picture"]:
215                    avatar_artifacts = sorted(author_profile["picture"]["artifacts"], key=lambda x: x["width"], reverse=True)
216                    author.update({"avatar_url": author_profile["picture"]["rootUrl"] + avatar_artifacts[0]["fileIdentifyingUrlPathSegment"]})
217
218                if author_profile.get("customPronoun"):
219                    author.update({"pronouns": author_profile.get("customPronoun")})
220                elif author_profile.get("standardizedPronoun"):
221                    author.update({"pronouns": author_profile.get("standardizedPronoun").lower()})
222
223            elif "*miniCompany" in post["actor"]["name"]["attributes"][0]:
224                author_profile = post["actor"]["name"]["attributes"][0]["*miniCompany"]
225                avatar_artifacts = sorted(author_profile["logo"]["artifacts"], key=lambda x: x["width"], reverse=True)
226
227                author.update({"is_company": "yes"})
228                author.update({"avatar_url": author_profile["logo"]["rootUrl"] + avatar_artifacts[0]["fileIdentifyingUrlPathSegment"]})
229
230        if post["actor"]["name"].get("attributesV2"):
231            pronouns = post["actor"]["name"]["attributesV2"][0]["detailData"].get("*profileFullName", {}).get("pronoun")
232            if pronouns:
233                if pronouns.get("customPronoun"):
234                    author.update({"pronouns": pronouns.get("customPronoun")})
235                elif pronouns.get("standardizedPronoun"):
236                    author.update({"pronouns": pronouns.get("standardizedPronoun")})
237
238        avatar = post["actor"]["image"].get("attributes", [{}])[0].get("detailData", {}).get("nonEntityProfilePicture")
239        if avatar and avatar["vectorImage"]:
240            author.update({"avatar_url": avatar["vectorImage"]["rootUrl"] + avatar["vectorImage"]["artifacts"][0]["fileIdentifyingUrlPathSegment"]})
241
242        return author
243
244    @staticmethod
245    def parse_time_ago(time_ago):
246        """
247        Attempt to parse a timestamp for a post
248
249        LinkedIn doesn't give us the actual timestamp, only a relative
250        indicator like "18h ago". This is annoying because it gets more
251        imprecise the longer ago it is, and because it is language-sensitive.
252        For example, in English 18 months is displayed as "18mo" but in Dutch
253        it is "18 mnd".
254
255        Right now this will only adjust the 'collected at' timestamp if the
256        data was scraped from an English or Dutch interface, and even then the
257        timestamps will still be imprecise.
258
259        :param str time_ago:  Relative timestamp, e.g. '18mo'.
260        :return int:  Estimated timestamp of post, as unix timestamp
261        """
262        time_ago = time_ago.split("•")[0]
263        numbers = re.sub(r"[^0-9]", "", time_ago).strip()
264        letters = re.sub(r"[0-9]", "", time_ago).strip()
265
266        period_lengths = {
267            "s": 1,
268            "m": 60,
269            "h": 3600,
270            "d": 86400,
271            "w": 7 * 86400,
272            "mo": 30.4375 * 86400,  # we don't know WHICH months, so use the average length of a month
273            "mnd": 30.4375 * 86400,
274            "yr": 365.25 * 86400,  # likewise
275            "j": 365.25 * 86400,
276        }
277
278        numbers = int(numbers) if len(numbers) else 0
279        return period_lengths.get(letters, 0) * numbers
class SearchLinkedIn(Search):
    """
    Import scraped LinkedIn data

    Datasets of this type are not searched from within 4CAT; they are created
    by importing NDJSON data captured externally (e.g. with Zeeschuimer).
    """
    type = "linkedin-search"  # job ID
    category = "Search"  # category
    title = "Import scraped LinkedIn data"  # title displayed in UI
    description = "Import LinkedIn data collected with an external tool such as Zeeschuimer."  # description displayed in UI
    extension = "ndjson"  # extension of result file, used internally and in UI
    is_from_zeeschuimer = True  # data arrives via Zeeschuimer uploads

    # not available as a processor for existing datasets
    accepts = [None]
    references = [
        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok) (also explains general usage of Zeeschuimer)"
    ]

    def get_items(self, query):
        """
        Run custom search

        Not available for LinkedIn; datasets can only be created by importing
        externally collected data.

        :param query:  Query parameters (unused)
        :raises NotImplementedError:  Always, by design
        """
        raise NotImplementedError("LinkedIn datasets can only be created by importing data from elsewhere")

    @staticmethod
    def map_item(item):
        """
        Parse LinkedIn post in Voyager V2 format

        'Voyager V2' seems to be how the format is referred to in the data
        itself...

        :param dict item:  Data as received from LinkedIn
        :return MappedItem:  Mapped item, or an empty dict when the post has
        no actor and cannot be mapped
        """
        # posts without an actor cannot be mapped meaningfully
        if not item.get("actor"):
            return {}

        # annoyingly, posts don't come with a timestamp
        # approximate it by using the time of collection and the "time ago"
        # included with the post (e.g. 'published 18h ago')
        if "__import_meta" in item:
            # Zeeschuimer records the collection time in milliseconds
            time_collected = int(item["__import_meta"]["timestamp_collected"] / 1000)
        else:
            # best we got
            time_collected = int(time.time())

        time_ago = item["actor"]["subDescription"]["text"] if item["actor"].get("subDescription") else ""
        timestamp = int(time_collected - SearchLinkedIn.parse_time_ago(time_ago))

        # images are stored in some convoluted way
        # there are multiple URLs for various thumbnails, use the one for the
        # largest version of the image
        images = []
        if item["content"] and "images" in item["content"]:
            for image in item["content"]["images"]:
                image_data = image["attributes"][0]["vectorImage"]
                artifacts = sorted(image_data["artifacts"], key=lambda x: x["width"], reverse=True)
                url = image_data["rootUrl"] + artifacts[0]["fileIdentifyingUrlPathSegment"]
                images.append(url)

        # or alternatively they are stored here:
        if not images and item["content"] and item["content"].get("articleComponent") and item["content"]["articleComponent"].get("largeImage"):
            image = item["content"]["articleComponent"]["largeImage"]["attributes"][0]["detailData"]["vectorImage"]
            if not image and item["content"]["articleComponent"]["largeImage"]["attributes"][0]["imageUrl"]:
                images.append(item["content"]["articleComponent"]["largeImage"]["attributes"][0]["imageUrl"]["url"])
            elif image and image.get("artifacts"):
                images.append(image["rootUrl"] + image["artifacts"][0]["fileIdentifyingUrlPathSegment"])

        # video thumbnails are stored similarly as image data, but the
        # metadata may live in one of several components
        video_thumb_url = ""
        thumb_content = None
        if item["content"] and "*videoPlayMetadata" in item["content"]:
            thumb_content = item["content"]["*videoPlayMetadata"]["thumbnail"]
        elif item["content"] and "linkedInVideoComponent" in item["content"] and item["content"]["linkedInVideoComponent"]:
            thumb_content = item["content"]["linkedInVideoComponent"]["*videoPlayMetadata"]["thumbnail"]
        elif item["content"] and "externalVideoComponent" in item["content"] and item["content"]["externalVideoComponent"]:
            thumb_content = item["content"]["externalVideoComponent"]["*videoPlayMetadata"]["thumbnail"]
        if thumb_content:
            video_thumb_url = thumb_content["rootUrl"] + thumb_content["artifacts"][0]["fileIdentifyingUrlPathSegment"]

        author = SearchLinkedIn.get_author(item)

        # the ID is in the format 'urn:li:activity:6960882777168695296'
        # retain the numerical part as the item ID for 4CAT
        # sometimes posts seem to be combined, e.g.:
        # urn:li:aggregate:(urn:li:activity:3966023054712791616,urn:li:activity:3965915018238312449)
        # effectively both IDs seem to refer to the same post, so just take the
        # first one
        meta_urn = item.get("updateMetadata", {"urn": item.get("preDashEntityUrn")})["urn"]
        urn = "urn:li:activity:" + meta_urn.split("urn:li:activity:")[1].split(",")[0].split(")")[0]
        item_id = urn.split(":").pop()

        # the way hashtags were stored changed at some point
        hashtags = []
        if item["commentary"] and "attributes" in item["commentary"]["text"]:
            hashtags = [tag["trackingUrn"].split(":").pop() for tag in item["commentary"]["text"].get("attributes", []) if tag["type"] == "HASHTAG"]
        elif item["commentary"] and "attributesV2" in item["commentary"]["text"]:
            hashtags = [tag["detailData"]["*hashtag"]["trackingUrn"].split(":").pop() for tag in item["commentary"]["text"].get("attributesV2", []) if "*hashtag" in tag["detailData"]]

        # and mentions
        # we're storing both usernames and full names
        author_mentions = []
        author_name_mentions = []
        if item["commentary"] and "attributes" in item["commentary"]["text"]:
            for mention in item["commentary"]["text"].get("attributes", []):
                if mention["type"] == "PROFILE_MENTION":
                    mention = mention["*miniProfile"]
                    author_mentions.append(mention["publicIdentifier"])
                    author_name_mentions.append(" ".join([mention.get("firstName", ""), mention.get("lastName", "")]))
                elif mention["type"] == "COMPANY_NAME":
                    mention = mention["*miniCompany"]
                    author_mentions.append(mention["universalName"])
                    author_name_mentions.append(mention.get("name", ""))

        # same for metrics
        if "*totalSocialActivityCounts" in item["*socialDetail"]:
            metrics = {
                "comments": item["*socialDetail"]["*totalSocialActivityCounts"]["numComments"],
                "shares": item["*socialDetail"]["*totalSocialActivityCounts"]["numShares"],
                "reactions": item["*socialDetail"]["*totalSocialActivityCounts"]["numLikes"],
                "reaction_like": 0,
                "reaction_empathy": 0,
                "reaction_praise": 0,
                "reaction_entertainment": 0,
                "reaction_appreciation": 0,
                "reaction_interest": 0
                }
            # There's different kind of reaction metrics
            for reaction_type in item["*socialDetail"]["*totalSocialActivityCounts"].get("reactionTypeCounts", []):
                metrics["reaction_" + reaction_type["reactionType"].lower()] = reaction_type["count"]

        else:
            metrics = {
                "comments": item["*socialDetail"]["comments"]["paging"]["total"],
                "shares": item["*socialDetail"]["totalShares"],
                "reactions": item["*socialDetail"]["likes"]["paging"]["total"]
            }

        # and links
        link_url = ""
        if item.get("content") and item["content"].get("navigationContext"):
            link_url = item["content"]["navigationContext"].get("actionTarget", "")
        elif item.get("content") and item["content"].get("articleComponent") and "navigationContext" in item["content"]["articleComponent"]:
            link_url = item["content"]["articleComponent"]["navigationContext"].get("actionTarget", "")

        return MappedItem({
            "collected_from_url": normalize_url_encoding(item.get("__import_meta", {}).get("source_platform_url", "")),  # Zeeschuimer metadata
            "id": item_id,
            "thread_id": item_id,
            "body": item["commentary"]["text"]["text"] if item["commentary"] else "",
            # utcfromtimestamp() is deprecated since Python 3.12; an aware UTC
            # datetime renders to the exact same string here
            "timestamp": datetime.datetime.fromtimestamp(timestamp, tz=datetime.timezone.utc).strftime("%Y-%m-%d %H:%M:%S"),
            "timestamp_collected": datetime.datetime.fromtimestamp(time_collected, tz=datetime.timezone.utc).strftime("%Y-%m-%d %H:%M:%S"),
            "timestamp_ago": time_ago.split("•")[0].strip(),
            # no digits in the relative timestamp presumably means a promoted
            # post (which shows e.g. 'Promoted' instead of '3d')
            "is_promoted": "yes" if not re.findall(r"[0-9]", time_ago) else "no",
            # flatten author dict; 'author_username' becomes plain 'author'
            **{("author_" + k).replace("_username", ""): v for k, v in author.items()},
            "author_mentions": ",".join(author_mentions),
            "author_name_mentions": ",".join(author_name_mentions),
            "hashtags": ",".join(hashtags),
            "image_urls": ",".join(images),
            "video_thumb_url": video_thumb_url,
            "post_url": "https://www.linkedin.com/feed/update/" + urn,
            "link_url": link_url,
            **metrics,
            "inclusion_context": item["header"]["text"]["text"] if item.get("header") else "",
            "unix_timestamp": timestamp,
            "unix_timestamp_collected": time_collected
        })

    @staticmethod
    def get_author(post):
        """
        Extract author information from post

        This is a bit complicated because it works differently for companies
        and users and some fields are not always present. Hence, a separate
        method.

        :param dict post:  Post data
        :return dict:  Author information
        """
        author = {
            # path after the domain, minus any query string
            "username": post["actor"]["navigationContext"]["actionTarget"].split("linkedin.com/").pop().split("?")[0],
            "name": post["actor"]["name"]["text"],
            "description": post["actor"].get("description", {}).get("text", "") if post["actor"].get("description") else "",
            "pronouns": "",
            "avatar_url": "",
            "is_company": "no",
            "url": post["actor"]["navigationContext"]["actionTarget"].split("?")[0],
        }

        # likewise for author avatars: pick the widest available artifact
        if post["actor"]["name"].get("attributes"):
            if "*miniProfile" in post["actor"]["name"]["attributes"][0]:
                # personal profile
                author_profile = post["actor"]["name"]["attributes"][0]["*miniProfile"]
                if author_profile["picture"]:
                    avatar_artifacts = sorted(author_profile["picture"]["artifacts"], key=lambda x: x["width"], reverse=True)
                    author.update({"avatar_url": author_profile["picture"]["rootUrl"] + avatar_artifacts[0]["fileIdentifyingUrlPathSegment"]})

                if author_profile.get("customPronoun"):
                    author.update({"pronouns": author_profile.get("customPronoun")})
                elif author_profile.get("standardizedPronoun"):
                    author.update({"pronouns": author_profile.get("standardizedPronoun").lower()})

            elif "*miniCompany" in post["actor"]["name"]["attributes"][0]:
                # company page
                author_profile = post["actor"]["name"]["attributes"][0]["*miniCompany"]
                avatar_artifacts = sorted(author_profile["logo"]["artifacts"], key=lambda x: x["width"], reverse=True)

                author.update({"is_company": "yes"})
                author.update({"avatar_url": author_profile["logo"]["rootUrl"] + avatar_artifacts[0]["fileIdentifyingUrlPathSegment"]})

        # newer payloads store pronouns in attributesV2 instead
        if post["actor"]["name"].get("attributesV2"):
            pronouns = post["actor"]["name"]["attributesV2"][0]["detailData"].get("*profileFullName", {}).get("pronoun")
            if pronouns:
                if pronouns.get("customPronoun"):
                    author.update({"pronouns": pronouns.get("customPronoun")})
                elif pronouns.get("standardizedPronoun"):
                    author.update({"pronouns": pronouns.get("standardizedPronoun")})

        # yet another place an avatar may be stored in
        avatar = post["actor"]["image"].get("attributes", [{}])[0].get("detailData", {}).get("nonEntityProfilePicture")
        if avatar and avatar["vectorImage"]:
            author.update({"avatar_url": avatar["vectorImage"]["rootUrl"] + avatar["vectorImage"]["artifacts"][0]["fileIdentifyingUrlPathSegment"]})

        return author

    @staticmethod
    def parse_time_ago(time_ago):
        """
        Attempt to parse a relative timestamp for a post

        LinkedIn doesn't give us the actual timestamp, only a relative
        indicator like "18h ago". This is annoying because it gets more
        imprecise the longer ago it is, and because it is language-sensitive.
        For example, in English 18 months is displayed as "18mo" but in Dutch
        it is "18 mnd".

        Right now this will only adjust the 'collected at' timestamp if the
        data was scraped from an English or Dutch interface, and even then the
        timestamps will still be imprecise.

        :param str time_ago:  Relative timestamp, e.g. '18mo'.
        :return float:  Estimated age of the post in seconds; 0 if the unit
        is not recognised. Note: NOT a unix timestamp - callers subtract this
        from the collection time.
        """
        # anything after '•' is extra context (e.g. 'Edited'), not the age
        time_ago = time_ago.split("•")[0]
        numbers = re.sub(r"[^0-9]", "", time_ago).strip()
        letters = re.sub(r"[0-9]", "", time_ago).strip()

        period_lengths = {
            "s": 1,
            "m": 60,
            "h": 3600,
            "d": 86400,
            "w": 7 * 86400,
            "mo": 30.4375 * 86400,  # we don't know WHICH months, so use the average length of a month
            "mnd": 30.4375 * 86400,  # Dutch interface months
            "yr": 365.25 * 86400,  # likewise, average year length
            "j": 365.25 * 86400,  # Dutch interface years ('jaar')
        }

        numbers = int(numbers) if len(numbers) else 0
        # unknown units contribute nothing, leaving the timestamp unadjusted
        return period_lengths.get(letters, 0) * numbers

Import scraped LinkedIn data

type = 'linkedin-search'
category = 'Search'
title = 'Import scraped LinkedIn data'
description = 'Import LinkedIn data collected with an external tool such as Zeeschuimer.'
extension = 'ndjson'
is_from_zeeschuimer = True
accepts = [None]
references = ['[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)', '[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok) (also explains general usage of Zeeschuimer)']
def get_items(self, query):
    """
    Run custom search

    Not available for LinkedIn; datasets can only be created by importing
    externally collected data.

    :param query:  Query parameters (unused)
    :raises NotImplementedError:  Always, by design
    """
    raise NotImplementedError("LinkedIn datasets can only be created by importing data from elsewhere")

Run custom search

Not available for LinkedIn

@staticmethod
def map_item(item):
 43    @staticmethod
 44    def map_item(item):
 45        """
 46        Parse LinkedIn post in Voyager V2 format
 47
 48        'Voyager V2' seems to be how the format is referred to in the data
 49        itself...
 50
 51        :param item:  Data as received from LinkedIn
 52        :return dict:  Mapped item
 53        """
 54
 55        # annoyingly, posts don't come with a timestamp
 56        # approximate it by using the time of collection and the "time ago"
 57        # included with the post (e.g. 'published 18h ago')
 58        if not item.get("actor"):
 59            return {}
 60
 61        if "__import_meta" in item:
 62            time_collected = int(item["__import_meta"]["timestamp_collected"] / 1000)  # milliseconds
 63        else:
 64            # best we got
 65            time_collected = int(time.time())
 66
 67        time_ago = item["actor"]["subDescription"]["text"] if item["actor"].get("subDescription") else ""
 68        timestamp = int(time_collected - SearchLinkedIn.parse_time_ago(time_ago))
 69
 70        # images are stored in some convoluted way
 71        # there are multiple URLs for various thumbnails, use the one for the
 72        # largest version of the image
 73        images = []
 74        if item["content"] and "images" in item["content"]:
 75            for image in item["content"]["images"]:
 76                image_data = image["attributes"][0]["vectorImage"]
 77                artifacts = sorted(image_data["artifacts"], key=lambda x: x["width"], reverse=True)
 78                url = image_data["rootUrl"] + artifacts[0]["fileIdentifyingUrlPathSegment"]
 79                images.append(url)
 80
 81        # or alternatively they are stored here:
 82        if not images and item["content"] and item["content"].get("articleComponent") and item["content"]["articleComponent"].get("largeImage"):
 83            image = item["content"]["articleComponent"]["largeImage"]["attributes"][0]["detailData"]["vectorImage"]
 84            if not image and item["content"]["articleComponent"]["largeImage"]["attributes"][0]["imageUrl"]:
 85                images.append(item["content"]["articleComponent"]["largeImage"]["attributes"][0]["imageUrl"]["url"])
 86            elif image and image.get("artifacts"):
 87                images.append(image["rootUrl"] + image["artifacts"][0]["fileIdentifyingUrlPathSegment"])
 88
 89        # video thumbnails are stored similarly as image data
 90        video_thumb_url = ""
 91        thumb_content = None
 92        if item["content"] and "*videoPlayMetadata" in item["content"]:
 93            thumb_content = item["content"]["*videoPlayMetadata"]["thumbnail"]
 94        elif item["content"] and "linkedInVideoComponent" in item["content"] and item["content"]["linkedInVideoComponent"]:
 95            thumb_content = item["content"]["linkedInVideoComponent"]["*videoPlayMetadata"]["thumbnail"]
 96        elif item["content"] and "externalVideoComponent" in item["content"] and item["content"]["externalVideoComponent"]:
 97            thumb_content = item["content"]["externalVideoComponent"]["*videoPlayMetadata"]["thumbnail"]
 98        if thumb_content:
 99            video_thumb_url = thumb_content["rootUrl"] + thumb_content["artifacts"][0]["fileIdentifyingUrlPathSegment"]
100
101        author = SearchLinkedIn.get_author(item)
102
103        # the ID is in the format 'urn:li:activity:6960882777168695296'
104        # retain the numerical part as the item ID for 4CAT
105        # sometimes posts seem to be combined, e.g.:
106        # urn:li:aggregate:(urn:li:activity:3966023054712791616,urn:li:activity:3965915018238312449)
107        # effectively both IDs seem to refer to the same post, so just take the
108        # first one
109        meta_urn = item.get("updateMetadata", {"urn": item.get("preDashEntityUrn")})["urn"]
110        urn = "urn:li:activity:" + meta_urn.split("urn:li:activity:")[1].split(",")[0].split(")")[0]
111        item_id = urn.split(":").pop()
112
113        # the way hashtags were stored changed at some point
114        hashtags = []
115        if item["commentary"] and "attributes" in item["commentary"]["text"]:
116            hashtags = [tag["trackingUrn"].split(":").pop() for tag in item["commentary"]["text"].get("attributes", []) if tag["type"] == "HASHTAG"]
117        elif item["commentary"] and "attributesV2" in item["commentary"]["text"]:
118            hashtags = [tag["detailData"]["*hashtag"]["trackingUrn"].split(":").pop() for tag in item["commentary"]["text"].get("attributesV2", []) if "*hashtag" in tag["detailData"]]
119
120        # and mentions
121        # we're storing both usernames and full names
122        author_mentions = []
123        author_name_mentions = []
124        if item["commentary"] and "attributes" in item["commentary"]["text"]:
125            for mention in item["commentary"]["text"].get("attributes", {}):
126                if mention["type"] == "PROFILE_MENTION":
127                    mention = mention["*miniProfile"]
128                    author_mentions.append(mention["publicIdentifier"])         
129                    author_name_mentions.append(" ".join([mention.get("firstName", ""), mention.get("lastName", "")]))
130                elif mention["type"] == "COMPANY_NAME":
131                    mention = mention["*miniCompany"]
132                    author_mentions.append(mention["universalName"])         
133                    author_name_mentions.append(mention.get("name", ""))
134
135        # same for metrics
136        if "*totalSocialActivityCounts" in item["*socialDetail"]:
137            metrics = {
138                "comments": item["*socialDetail"]["*totalSocialActivityCounts"]["numComments"],
139                "shares": item["*socialDetail"]["*totalSocialActivityCounts"]["numShares"],
140                "reactions": item["*socialDetail"]["*totalSocialActivityCounts"]["numLikes"],
141                "reaction_like": 0,
142                "reaction_empathy": 0,
143                "reaction_praise": 0,
144                "reaction_entertainment": 0,
145                "reaction_appreciation": 0,
146                "reaction_interest": 0
147                }
148            # There's different kind of reaction metrics
149            for reaction_type in item["*socialDetail"]["*totalSocialActivityCounts"].get("reactionTypeCounts", []):
150                metrics["reaction_" + reaction_type["reactionType"].lower()] = reaction_type["count"]
151
152        else:
153            metrics = {
154                "comments": item["*socialDetail"]["comments"]["paging"]["total"],
155                "shares": item["*socialDetail"]["totalShares"],
156                "reactions": item["*socialDetail"]["likes"]["paging"]["total"]
157            }
158
159        # and links
160        link_url = ""
161        if item.get("content") and item["content"].get("navigationContext"):
162            link_url = item["content"]["navigationContext"].get("actionTarget", "")
163        elif item.get("content") and item["content"].get("articleComponent") and "navigationContext" in item["content"]["articleComponent"]:
164            link_url = item["content"]["articleComponent"]["navigationContext"].get("actionTarget", "")
165
166        return MappedItem({
167            "collected_from_url": normalize_url_encoding(item.get("__import_meta", {}).get("source_platform_url", "")),  # Zeeschuimer metadata
168            "id": item_id,
169            "thread_id": item_id,
170            "body": item["commentary"]["text"]["text"] if item["commentary"] else "",
171            "timestamp": datetime.datetime.utcfromtimestamp(timestamp).strftime("%Y-%m-%d %H:%M:%S"),
172            "timestamp_collected": datetime.datetime.utcfromtimestamp(time_collected).strftime("%Y-%m-%d %H:%M:%S"),
173            "timestamp_ago": time_ago.split("•")[0].strip(),
174            "is_promoted": "yes" if not re.findall(r"[0-9]", time_ago) else "no",
175            **{("author_" + k).replace("_username", ""): v for k, v in author.items()},
176            "author_mentions": ",".join(author_mentions),
177            "author_name_mentions": ",".join(author_name_mentions),
178            "hashtags": ",".join(hashtags),
179            "image_urls": ",".join(images),
180            "video_thumb_url": video_thumb_url,
181            "post_url": "https://www.linkedin.com/feed/update/" + urn,
182            "link_url":  link_url,
183            **metrics,
184            "inclusion_context": item["header"]["text"]["text"] if item.get("header") else "",
185            "unix_timestamp": timestamp,
186            "unix_timestamp_collected": time_collected
187        })

Parse LinkedIn post in Voyager V2 format

'Voyager V2' seems to be how the format is referred to in the data itself...

Parameters
  • item: Data as received from LinkedIn
Returns

Mapped item

@staticmethod
def get_author(post):
189    @staticmethod
190    def get_author(post):
191        """
192        Extract author information from post
193
194        This is a bit complicated because it works differently for companies
195        and users and some fields are not always present. Hence, a separate
196        method.
197
198        :param dict post:  Post data
199        :return dict:  Author information
200        """
201        author = {
202            "username": post["actor"]["navigationContext"]["actionTarget"].split("linkedin.com/").pop().split("?")[0],
203            "name": post["actor"]["name"]["text"],
204            "description": post["actor"].get("description", {}).get("text", "") if post["actor"].get("description") else "",
205            "pronouns": "",
206            "avatar_url": "",
207            "is_company": "no",
208            "url": post["actor"]["navigationContext"]["actionTarget"].split("?")[0],
209        }
210
211        # likewise for author avatars
212        if post["actor"]["name"].get("attributes"):
213            if "*miniProfile" in post["actor"]["name"]["attributes"][0]:
214                author_profile = post["actor"]["name"]["attributes"][0]["*miniProfile"]
215                if author_profile["picture"]:
216                    avatar_artifacts = sorted(author_profile["picture"]["artifacts"], key=lambda x: x["width"], reverse=True)
217                    author.update({"avatar_url": author_profile["picture"]["rootUrl"] + avatar_artifacts[0]["fileIdentifyingUrlPathSegment"]})
218
219                if author_profile.get("customPronoun"):
220                    author.update({"pronouns": author_profile.get("customPronoun")})
221                elif author_profile.get("standardizedPronoun"):
222                    author.update({"pronouns": author_profile.get("standardizedPronoun").lower()})
223
224            elif "*miniCompany" in post["actor"]["name"]["attributes"][0]:
225                author_profile = post["actor"]["name"]["attributes"][0]["*miniCompany"]
226                avatar_artifacts = sorted(author_profile["logo"]["artifacts"], key=lambda x: x["width"], reverse=True)
227
228                author.update({"is_company": "yes"})
229                author.update({"avatar_url": author_profile["logo"]["rootUrl"] + avatar_artifacts[0]["fileIdentifyingUrlPathSegment"]})
230
231        if post["actor"]["name"].get("attributesV2"):
232            pronouns = post["actor"]["name"]["attributesV2"][0]["detailData"].get("*profileFullName", {}).get("pronoun")
233            if pronouns:
234                if pronouns.get("customPronoun"):
235                    author.update({"pronouns": pronouns.get("customPronoun")})
236                elif pronouns.get("standardizedPronoun"):
237                    author.update({"pronouns": pronouns.get("standardizedPronoun")})
238
239        avatar = post["actor"]["image"].get("attributes", [{}])[0].get("detailData", {}).get("nonEntityProfilePicture")
240        if avatar and avatar["vectorImage"]:
241            author.update({"avatar_url": avatar["vectorImage"]["rootUrl"] + avatar["vectorImage"]["artifacts"][0]["fileIdentifyingUrlPathSegment"]})
242
243        return author

Extract author information from post

This is a bit complicated because it works differently for companies and users and some fields are not always present. Hence, a separate method.

Parameters
  • dict post: Post data
Returns

Author information

@staticmethod
def parse_time_ago(time_ago):
245    @staticmethod
246    def parse_time_ago(time_ago):
247        """
248        Attempt to parse a timestamp for a post
249
250        LinkedIn doesn't give us the actual timestamp, only a relative
251        indicator like "18h ago". This is annoying because it gets more
252        imprecise the longer ago it is, and because it is language-sensitive.
253        For example, in English 18 months is displayed as "18mo" but in Dutch
254        it is "18 mnd".
255
256        Right now this will only adjust the 'collected at' timestamp if the
257        data was scraped from an English or Dutch interface, and even then the
258        timestamps will still be imprecise.
259
260        :param str time_ago:  Relative timestamp, e.g. '18mo'.
261        :return int:  Estimated timestamp of post, as unix timestamp
262        """
263        time_ago = time_ago.split("•")[0]
264        numbers = re.sub(r"[^0-9]", "", time_ago).strip()
265        letters = re.sub(r"[0-9]", "", time_ago).strip()
266
267        period_lengths = {
268            "s": 1,
269            "m": 60,
270            "h": 3600,
271            "d": 86400,
272            "w": 7 * 86400,
273            "mo": 30.4375 * 86400,  # we don't know WHICH months, so use the average length of a month
274            "mnd": 30.4375 * 86400,
275            "yr": 365.25 * 86400,  # likewise
276            "j": 365.25 * 86400,
277        }
278
279        numbers = int(numbers) if len(numbers) else 0
280        return period_lengths.get(letters, 0) * numbers

Attempt to parse a timestamp for a post

LinkedIn doesn't give us the actual timestamp, only a relative indicator like "18h ago". This is annoying because it gets more imprecise the longer ago it is, and because it is language-sensitive. For example, in English 18 months is displayed as "18mo" but in Dutch it is "18 mnd".

Right now this will only adjust the 'collected at' timestamp if the data was scraped from an English or Dutch interface, and even then the timestamps will still be imprecise.

Parameters
  • str time_ago: Relative timestamp, e.g. '18mo'.
Returns

Estimated age of the post, in seconds (subtracted from the collection time to approximate the post's unix timestamp)