Edit on GitHub

datasources.pinterest.search_pinterest

Import scraped Pinterest data

It's prohibitively difficult to scrape data from Pinterest within 4CAT itself due to its aggressive rate limiting. Instead, import data collected elsewhere.

  1"""
  2Import scraped Pinterest data
  3
  4It's prohibitively difficult to scrape data from Pinterest within 4CAT itself due
  5to its aggressive rate limiting. Instead, import data collected elsewhere.
  6"""
  7from datetime import datetime
  8
  9from backend.lib.search import Search
 10from common.lib.item_mapping import MappedItem, MissingMappedField
 11
 12
 13class SearchPinterest(Search):
 14    """
 15    Import scraped Pinterest data
 16    """
 17    type = "pinterest-search"  # job ID
 18    category = "Search"  # category
 19    title = "Import scraped Pinterest data"  # title displayed in UI
 20    description = "Import Pinterest data collected with an external tool such as Zeeschuimer."  # description displayed in UI
 21    extension = "ndjson"  # extension of result file, used internally and in UI
 22    is_from_zeeschuimer = True
 23
 24    # not available as a processor for existing datasets
 25    accepts = [None]
 26    references = [
 27        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
 28        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)"
 29    ]
 30
 31    def get_items(self, query):
 32        """
 33        Run custom search
 34
 35        Not available for Pinterest
 36        """
 37        raise NotImplementedError("Pinterest datasets can only be created by importing data from elsewhere")
 38
 39    @staticmethod
 40    def map_item(post):
 41        """
 42        Map Pinterest object to 4CAT item
 43
 44        Depending on whether the object was captured from JSON or HTML, treat it
 45        differently. A lot of data is missing from HTML objects.
 46
 47        :param post:
 48        :return:
 49        """
 50        if post.get("_zs-origin") == "html":
 51            return SearchPinterest.map_item_from_html(post)
 52        else:
 53            return SearchPinterest.map_item_from_json(post)
 54
 55    @staticmethod
 56    def map_item_from_json(post):
 57        """
 58        Map Pinterest object to 4CAT item
 59
 60        Pretty simple, except posts sometimes don't have timestamps :| but at
 61        least these objects are more complete than the HTML data usually
 62
 63        :param dict post:  Pinterest object
 64        :return MappedItem:  Mapped item
 65        """
 66        try:
 67            # there are often no timestamps :'(
 68            timestamp = datetime.strptime(post.get("created_at", post.get("createdAt")), "%a, %d %b %Y %H:%M:%S %z")
 69            unix_timestamp = int(timestamp.timestamp())
 70            str_timestamp = timestamp.strftime("%Y-%m-%d %H:%M:%S")
 71        except (ValueError, TypeError):
 72            unix_timestamp = str_timestamp = MissingMappedField("")
 73
 74        post_id = post.get("entityId", post["id"])
 75
 76        if "imageSpec_orig" in post:
 77            image_url = post["imageSpec_orig"]["url"]
 78        else:
 79            image_url = post["images"]["orig"]["url"]
 80
 81        return MappedItem({
 82            "id": post_id,
 83            "thread_id": post_id,
 84            "author": post["pinner"]["username"],
 85            "author_fullname": post["pinner"].get("fullName", post["pinner"].get("full_name", "")),
 86            "author_original": post["nativeCreator"]["username"] if post.get("nativeCreator") else post["pinner"]["username"],
 87            "body": post["description"].strip(),
 88            "subject": post["title"].strip(),
 89            "ai_description": post.get("auto_alt_text", ""),
 90            "pinner_original": post["originPinner"]["fullName"] if post.get("originPinner") else "",
 91            "pinner_via": post["viaPinner"]["fullName"] if post.get("viaPinner") else "",
 92            "board": post["board"]["name"],
 93            "board_pins": post["board"].get("pinCount", post["board"].get("pin_count")),
 94            "board_url": f"https://www.pinterest.com{post['board']['url']}",
 95            "timestamp": str_timestamp,
 96            "idea_tags": ",".join(post["pinJoin"]["visualAnnotation"]) if post.get("pinJoin") else "",
 97            "url": f"https://www.pinterest.com/pin/{post_id}",
 98            # these are not always available (shame)
 99            # "is_repin": "yes" if post["isRepin"] else "no",
100            # "is_unsafe": "yes" if post["isUnsafe"] else "no",
101            # "total_saves": post["aggregatedPinData"]["aggregatedStats"]["saves"],
102            "is_video": "yes" if post.get("isVideo", post.get("videos")) else "no",
103            "image_url": image_url,
104            "dominant_colour": post.get("dominantColor", post.get("dominant_color")),
105            "unix_timestamp": unix_timestamp
106        })
107
108    @staticmethod
109    def map_item_from_html(post):
110        """
111        Map Pinterest object to 4CAT item
112
113        These are from the HTML and have even less data than JSON objects...
114        but enough to be useful in some cases.
115
116        :param dict post:  Pinterest object
117        :return MappedItem:  Mapped item
118        """
119        return MappedItem({
120            "id": int(post["id"]),
121            "thread_id": int(post["id"]),
122            "author": MissingMappedField(""),
123            "author_fullname": MissingMappedField(""),
124            "author_original": MissingMappedField(""),
125            "body": post["body"].strip(),
126            "subject": post["title"].strip(),
127            "ai_description": MissingMappedField(""),
128            "pinner_original": MissingMappedField(""),
129            "pinner_via": MissingMappedField(""),
130            "board": MissingMappedField(""),
131            "board_pins": MissingMappedField(""),
132            "board_url": MissingMappedField(""),
133            "timestamp": MissingMappedField(""),  # there are no timestamps :(
134            "idea_tags": ",".join(post["tags"]),
135            "url": f"https://www.pinterest.com/pin/{post['id']}",
136            # these are not always available (shame)
137            # "is_repin": "yes" if post["isRepin"] else "no",
138            # "is_unsafe": "yes" if post["isUnsafe"] else "no",
139            # "total_saves": post["aggregatedPinData"]["aggregatedStats"]["saves"],
140            "is_video": MissingMappedField(""),
141            "image_url": post["image"],
142            "dominant_colour": MissingMappedField(""),
143            "unix_timestamp": MissingMappedField("")
144        })
class SearchPinterest(backend.lib.search.Search):
 14class SearchPinterest(Search):
 15    """
 16    Import scraped Pinterest data
 17    """
 18    type = "pinterest-search"  # job ID
 19    category = "Search"  # category
 20    title = "Import scraped Pinterest data"  # title displayed in UI
 21    description = "Import Pinterest data collected with an external tool such as Zeeschuimer."  # description displayed in UI
 22    extension = "ndjson"  # extension of result file, used internally and in UI
 23    is_from_zeeschuimer = True
 24
 25    # not available as a processor for existing datasets
 26    accepts = [None]
 27    references = [
 28        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
 29        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)"
 30    ]
 31
 32    def get_items(self, query):
 33        """
 34        Run custom search
 35
 36        Not available for Pinterest
 37        """
 38        raise NotImplementedError("Pinterest datasets can only be created by importing data from elsewhere")
 39
 40    @staticmethod
 41    def map_item(post):
 42        """
 43        Map Pinterest object to 4CAT item
 44
 45        Depending on whether the object was captured from JSON or HTML, treat it
 46        differently. A lot of data is missing from HTML objects.
 47
 48        :param post:
 49        :return:
 50        """
 51        if post.get("_zs-origin") == "html":
 52            return SearchPinterest.map_item_from_html(post)
 53        else:
 54            return SearchPinterest.map_item_from_json(post)
 55
 56    @staticmethod
 57    def map_item_from_json(post):
 58        """
 59        Map Pinterest object to 4CAT item
 60
 61        Pretty simple, except posts sometimes don't have timestamps :| but at
 62        least these objects are more complete than the HTML data usually
 63
 64        :param dict post:  Pinterest object
 65        :return MappedItem:  Mapped item
 66        """
 67        try:
 68            # there are often no timestamps :'(
 69            timestamp = datetime.strptime(post.get("created_at", post.get("createdAt")), "%a, %d %b %Y %H:%M:%S %z")
 70            unix_timestamp = int(timestamp.timestamp())
 71            str_timestamp = timestamp.strftime("%Y-%m-%d %H:%M:%S")
 72        except (ValueError, TypeError):
 73            unix_timestamp = str_timestamp = MissingMappedField("")
 74
 75        post_id = post.get("entityId", post["id"])
 76
 77        if "imageSpec_orig" in post:
 78            image_url = post["imageSpec_orig"]["url"]
 79        else:
 80            image_url = post["images"]["orig"]["url"]
 81
 82        return MappedItem({
 83            "id": post_id,
 84            "thread_id": post_id,
 85            "author": post["pinner"]["username"],
 86            "author_fullname": post["pinner"].get("fullName", post["pinner"].get("full_name", "")),
 87            "author_original": post["nativeCreator"]["username"] if post.get("nativeCreator") else post["pinner"]["username"],
 88            "body": post["description"].strip(),
 89            "subject": post["title"].strip(),
 90            "ai_description": post.get("auto_alt_text", ""),
 91            "pinner_original": post["originPinner"]["fullName"] if post.get("originPinner") else "",
 92            "pinner_via": post["viaPinner"]["fullName"] if post.get("viaPinner") else "",
 93            "board": post["board"]["name"],
 94            "board_pins": post["board"].get("pinCount", post["board"].get("pin_count")),
 95            "board_url": f"https://www.pinterest.com{post['board']['url']}",
 96            "timestamp": str_timestamp,
 97            "idea_tags": ",".join(post["pinJoin"]["visualAnnotation"]) if post.get("pinJoin") else "",
 98            "url": f"https://www.pinterest.com/pin/{post_id}",
 99            # these are not always available (shame)
100            # "is_repin": "yes" if post["isRepin"] else "no",
101            # "is_unsafe": "yes" if post["isUnsafe"] else "no",
102            # "total_saves": post["aggregatedPinData"]["aggregatedStats"]["saves"],
103            "is_video": "yes" if post.get("isVideo", post.get("videos")) else "no",
104            "image_url": image_url,
105            "dominant_colour": post.get("dominantColor", post.get("dominant_color")),
106            "unix_timestamp": unix_timestamp
107        })
108
109    @staticmethod
110    def map_item_from_html(post):
111        """
112        Map Pinterest object to 4CAT item
113
114        These are from the HTML and have even less data than JSON objects...
115        but enough to be useful in some cases.
116
117        :param dict post:  Pinterest object
118        :return MappedItem:  Mapped item
119        """
120        return MappedItem({
121            "id": int(post["id"]),
122            "thread_id": int(post["id"]),
123            "author": MissingMappedField(""),
124            "author_fullname": MissingMappedField(""),
125            "author_original": MissingMappedField(""),
126            "body": post["body"].strip(),
127            "subject": post["title"].strip(),
128            "ai_description": MissingMappedField(""),
129            "pinner_original": MissingMappedField(""),
130            "pinner_via": MissingMappedField(""),
131            "board": MissingMappedField(""),
132            "board_pins": MissingMappedField(""),
133            "board_url": MissingMappedField(""),
134            "timestamp": MissingMappedField(""),  # there are no timestamps :(
135            "idea_tags": ",".join(post["tags"]),
136            "url": f"https://www.pinterest.com/pin/{post['id']}",
137            # these are not always available (shame)
138            # "is_repin": "yes" if post["isRepin"] else "no",
139            # "is_unsafe": "yes" if post["isUnsafe"] else "no",
140            # "total_saves": post["aggregatedPinData"]["aggregatedStats"]["saves"],
141            "is_video": MissingMappedField(""),
142            "image_url": post["image"],
143            "dominant_colour": MissingMappedField(""),
144            "unix_timestamp": MissingMappedField("")
145        })

Import scraped Pinterest data

type = 'pinterest-search'
category = 'Search'
title = 'Import scraped Pinterest data'
description = 'Import Pinterest data collected with an external tool such as Zeeschuimer.'
extension = 'ndjson'
is_from_zeeschuimer = True
accepts = [None]
references = ['[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)', '[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)']
def get_items(self, query):
32    def get_items(self, query):
33        """
34        Run custom search
35
36        Not available for Pinterest
37        """
38        raise NotImplementedError("Pinterest datasets can only be created by importing data from elsewhere")

Run custom search

Not available for Pinterest

@staticmethod
def map_item(post):
40    @staticmethod
41    def map_item(post):
42        """
43        Map Pinterest object to 4CAT item
44
45        Depending on whether the object was captured from JSON or HTML, treat it
46        differently. A lot of data is missing from HTML objects.
47
48        :param post:
49        :return:
50        """
51        if post.get("_zs-origin") == "html":
52            return SearchPinterest.map_item_from_html(post)
53        else:
54            return SearchPinterest.map_item_from_json(post)

Map Pinterest object to 4CAT item

Depending on whether the object was captured from JSON or HTML, treat it differently. A lot of data is missing from HTML objects.

Parameters
  • post: Pinterest object
Returns
@staticmethod
def map_item_from_json(post):
 56    @staticmethod
 57    def map_item_from_json(post):
 58        """
 59        Map Pinterest object to 4CAT item
 60
 61        Pretty simple, except posts sometimes don't have timestamps :| but at
 62        least these objects are more complete than the HTML data usually
 63
 64        :param dict post:  Pinterest object
 65        :return MappedItem:  Mapped item
 66        """
 67        try:
 68            # there are often no timestamps :'(
 69            timestamp = datetime.strptime(post.get("created_at", post.get("createdAt")), "%a, %d %b %Y %H:%M:%S %z")
 70            unix_timestamp = int(timestamp.timestamp())
 71            str_timestamp = timestamp.strftime("%Y-%m-%d %H:%M:%S")
 72        except (ValueError, TypeError):
 73            unix_timestamp = str_timestamp = MissingMappedField("")
 74
 75        post_id = post.get("entityId", post["id"])
 76
 77        if "imageSpec_orig" in post:
 78            image_url = post["imageSpec_orig"]["url"]
 79        else:
 80            image_url = post["images"]["orig"]["url"]
 81
 82        return MappedItem({
 83            "id": post_id,
 84            "thread_id": post_id,
 85            "author": post["pinner"]["username"],
 86            "author_fullname": post["pinner"].get("fullName", post["pinner"].get("full_name", "")),
 87            "author_original": post["nativeCreator"]["username"] if post.get("nativeCreator") else post["pinner"]["username"],
 88            "body": post["description"].strip(),
 89            "subject": post["title"].strip(),
 90            "ai_description": post.get("auto_alt_text", ""),
 91            "pinner_original": post["originPinner"]["fullName"] if post.get("originPinner") else "",
 92            "pinner_via": post["viaPinner"]["fullName"] if post.get("viaPinner") else "",
 93            "board": post["board"]["name"],
 94            "board_pins": post["board"].get("pinCount", post["board"].get("pin_count")),
 95            "board_url": f"https://www.pinterest.com{post['board']['url']}",
 96            "timestamp": str_timestamp,
 97            "idea_tags": ",".join(post["pinJoin"]["visualAnnotation"]) if post.get("pinJoin") else "",
 98            "url": f"https://www.pinterest.com/pin/{post_id}",
 99            # these are not always available (shame)
100            # "is_repin": "yes" if post["isRepin"] else "no",
101            # "is_unsafe": "yes" if post["isUnsafe"] else "no",
102            # "total_saves": post["aggregatedPinData"]["aggregatedStats"]["saves"],
103            "is_video": "yes" if post.get("isVideo", post.get("videos")) else "no",
104            "image_url": image_url,
105            "dominant_colour": post.get("dominantColor", post.get("dominant_color")),
106            "unix_timestamp": unix_timestamp
107        })

Map Pinterest object to 4CAT item

Pretty simple, except that posts sometimes don't have timestamps :| At least these objects are usually more complete than the HTML data.

Parameters
  • dict post: Pinterest object
Returns

Mapped item

@staticmethod
def map_item_from_html(post):
109    @staticmethod
110    def map_item_from_html(post):
111        """
112        Map Pinterest object to 4CAT item
113
114        These are from the HTML and have even less data than JSON objects...
115        but enough to be useful in some cases.
116
117        :param dict post:  Pinterest object
118        :return MappedItem:  Mapped item
119        """
120        return MappedItem({
121            "id": int(post["id"]),
122            "thread_id": int(post["id"]),
123            "author": MissingMappedField(""),
124            "author_fullname": MissingMappedField(""),
125            "author_original": MissingMappedField(""),
126            "body": post["body"].strip(),
127            "subject": post["title"].strip(),
128            "ai_description": MissingMappedField(""),
129            "pinner_original": MissingMappedField(""),
130            "pinner_via": MissingMappedField(""),
131            "board": MissingMappedField(""),
132            "board_pins": MissingMappedField(""),
133            "board_url": MissingMappedField(""),
134            "timestamp": MissingMappedField(""),  # there are no timestamps :(
135            "idea_tags": ",".join(post["tags"]),
136            "url": f"https://www.pinterest.com/pin/{post['id']}",
137            # these are not always available (shame)
138            # "is_repin": "yes" if post["isRepin"] else "no",
139            # "is_unsafe": "yes" if post["isUnsafe"] else "no",
140            # "total_saves": post["aggregatedPinData"]["aggregatedStats"]["saves"],
141            "is_video": MissingMappedField(""),
142            "image_url": post["image"],
143            "dominant_colour": MissingMappedField(""),
144            "unix_timestamp": MissingMappedField("")
145        })

Map Pinterest object to 4CAT item

These are from the HTML and have even less data than JSON objects... but enough to be useful in some cases.

Parameters
  • dict post: Pinterest object
Returns

Mapped item