datasources.pinterest.search_pinterest
Import scraped Pinterest data
It's prohibitively difficult to scrape data from Pinterest within 4CAT itself due to its aggressive rate limiting. Instead, import data collected elsewhere.
"""
Import scraped Pinterest data

It's prohibitively difficult to scrape data from Pinterest within 4CAT itself due
to its aggressive rate limiting. Instead, import data collected elsewhere.
"""
from datetime import datetime

from backend.lib.search import Search
from common.lib.item_mapping import MappedItem, MissingMappedField


class SearchPinterest(Search):
    """
    Import scraped Pinterest data
    """
    type = "pinterest-search"  # job ID
    category = "Search"  # category
    title = "Import scraped Pinterest data"  # title displayed in UI
    description = "Import Pinterest data collected with an external tool such as Zeeschuimer."  # description displayed in UI
    extension = "ndjson"  # extension of result file, used internally and in UI
    is_from_zeeschuimer = True

    # not available as a processor for existing datasets
    accepts = [None]
    references = [
        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)"
    ]

    def get_items(self, query):
        """
        Run custom search

        Not available for Pinterest

        :param query:  Search query; not used
        :raises NotImplementedError:  Always
        """
        raise NotImplementedError("Pinterest datasets can only be created by importing data from elsewhere")

    @staticmethod
    def map_item(post):
        """
        Map Pinterest object to 4CAT item

        Depending on whether the object was captured from JSON or HTML, treat it
        differently. A lot of data is missing from HTML objects. Objects whose
        `_zs-origin` value is `html` are mapped as HTML objects; everything
        else, including objects without that key, is mapped as JSON.

        :param dict post:  Pinterest object
        :return MappedItem:  Mapped item
        """
        if post.get("_zs-origin") == "html":
            return SearchPinterest.map_item_from_html(post)
        else:
            return SearchPinterest.map_item_from_json(post)

    @staticmethod
    def map_item_from_json(post):
        """
        Map Pinterest object to 4CAT item

        Pretty simple, except posts sometimes don't have timestamps :| but at
        least these objects are more complete than the HTML data usually.
        Several keys are looked up under both a camelCase and a snake_case
        name.

        :param dict post:  Pinterest object
        :return MappedItem:  Mapped item
        """
        try:
            # there are often no timestamps :'( - a missing value makes
            # strptime raise a TypeError, an unparseable one a ValueError
            timestamp = datetime.strptime(post.get("created_at", post.get("createdAt")), "%a, %d %b %Y %H:%M:%S %z")
            unix_timestamp = int(timestamp.timestamp())
            str_timestamp = timestamp.strftime("%Y-%m-%d %H:%M:%S")
        except (ValueError, TypeError):
            unix_timestamp = str_timestamp = MissingMappedField("")

        # prefer `entityId` and only require `id` when it is absent; passing
        # `post["id"]` as the default of `.get()` would evaluate it eagerly
        # and raise a KeyError for objects that only have `entityId`
        post_id = post["entityId"] if "entityId" in post else post["id"]

        if "imageSpec_orig" in post:
            image_url = post["imageSpec_orig"]["url"]
        else:
            image_url = post["images"]["orig"]["url"]

        return MappedItem({
            "id": post_id,
            "thread_id": post_id,
            "author": post["pinner"]["username"],
            "author_fullname": post["pinner"].get("fullName", post["pinner"].get("full_name", "")),
            "author_original": post["nativeCreator"]["username"] if post.get("nativeCreator") else post["pinner"]["username"],
            "body": post["description"].strip(),
            "subject": post["title"].strip(),
            "ai_description": post.get("auto_alt_text", ""),
            "pinner_original": post["originPinner"]["fullName"] if post.get("originPinner") else "",
            "pinner_via": post["viaPinner"]["fullName"] if post.get("viaPinner") else "",
            "board": post["board"]["name"],
            "board_pins": post["board"].get("pinCount", post["board"].get("pin_count")),
            "board_url": f"https://www.pinterest.com{post['board']['url']}",
            "timestamp": str_timestamp,
            "idea_tags": ",".join(post["pinJoin"]["visualAnnotation"]) if post.get("pinJoin") else "",
            "url": f"https://www.pinterest.com/pin/{post_id}",
            # these are not always available (shame)
            # "is_repin": "yes" if post["isRepin"] else "no",
            # "is_unsafe": "yes" if post["isUnsafe"] else "no",
            # "total_saves": post["aggregatedPinData"]["aggregatedStats"]["saves"],
            "is_video": "yes" if post.get("isVideo", post.get("videos")) else "no",
            "image_url": image_url,
            "dominant_colour": post.get("dominantColor", post.get("dominant_color")),
            "unix_timestamp": unix_timestamp
        })

    @staticmethod
    def map_item_from_html(post):
        """
        Map Pinterest object to 4CAT item

        These are from the HTML and have even less data than JSON objects...
        but enough to be useful in some cases. Fields that cannot be filled
        from the HTML object are marked as missing.

        :param dict post:  Pinterest object
        :return MappedItem:  Mapped item
        """
        return MappedItem({
            "id": int(post["id"]),
            "thread_id": int(post["id"]),
            "author": MissingMappedField(""),
            "author_fullname": MissingMappedField(""),
            "author_original": MissingMappedField(""),
            "body": post["body"].strip(),
            "subject": post["title"].strip(),
            "ai_description": MissingMappedField(""),
            "pinner_original": MissingMappedField(""),
            "pinner_via": MissingMappedField(""),
            "board": MissingMappedField(""),
            "board_pins": MissingMappedField(""),
            "board_url": MissingMappedField(""),
            "timestamp": MissingMappedField(""),  # there are no timestamps :(
            "idea_tags": ",".join(post["tags"]),
            "url": f"https://www.pinterest.com/pin/{post['id']}",
            # these are not always available (shame)
            # "is_repin": "yes" if post["isRepin"] else "no",
            # "is_unsafe": "yes" if post["isUnsafe"] else "no",
            # "total_saves": post["aggregatedPinData"]["aggregatedStats"]["saves"],
            "is_video": MissingMappedField(""),
            "image_url": post["image"],
            "dominant_colour": MissingMappedField(""),
            "unix_timestamp": MissingMappedField("")
        })
class SearchPinterest(Search):
    """
    Import scraped Pinterest data
    """
    type = "pinterest-search"  # job ID
    category = "Search"  # category
    title = "Import scraped Pinterest data"  # title displayed in UI
    description = "Import Pinterest data collected with an external tool such as Zeeschuimer."  # description displayed in UI
    extension = "ndjson"  # extension of result file, used internally and in UI
    is_from_zeeschuimer = True

    # not available as a processor for existing datasets
    accepts = [None]
    references = [
        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)"
    ]

    def get_items(self, query):
        """
        Run custom search

        Not available for Pinterest

        :param query:  Search query; not used
        :raises NotImplementedError:  Always
        """
        raise NotImplementedError("Pinterest datasets can only be created by importing data from elsewhere")

    @staticmethod
    def map_item(post):
        """
        Map Pinterest object to 4CAT item

        Depending on whether the object was captured from JSON or HTML, treat it
        differently. A lot of data is missing from HTML objects. Objects whose
        `_zs-origin` value is `html` are mapped as HTML objects; everything
        else, including objects without that key, is mapped as JSON.

        :param dict post:  Pinterest object
        :return MappedItem:  Mapped item
        """
        if post.get("_zs-origin") == "html":
            return SearchPinterest.map_item_from_html(post)
        else:
            return SearchPinterest.map_item_from_json(post)

    @staticmethod
    def map_item_from_json(post):
        """
        Map Pinterest object to 4CAT item

        Pretty simple, except posts sometimes don't have timestamps :| but at
        least these objects are more complete than the HTML data usually.
        Several keys are looked up under both a camelCase and a snake_case
        name.

        :param dict post:  Pinterest object
        :return MappedItem:  Mapped item
        """
        try:
            # there are often no timestamps :'( - a missing value makes
            # strptime raise a TypeError, an unparseable one a ValueError
            timestamp = datetime.strptime(post.get("created_at", post.get("createdAt")), "%a, %d %b %Y %H:%M:%S %z")
            unix_timestamp = int(timestamp.timestamp())
            str_timestamp = timestamp.strftime("%Y-%m-%d %H:%M:%S")
        except (ValueError, TypeError):
            unix_timestamp = str_timestamp = MissingMappedField("")

        # prefer `entityId` and only require `id` when it is absent; passing
        # `post["id"]` as the default of `.get()` would evaluate it eagerly
        # and raise a KeyError for objects that only have `entityId`
        post_id = post["entityId"] if "entityId" in post else post["id"]

        if "imageSpec_orig" in post:
            image_url = post["imageSpec_orig"]["url"]
        else:
            image_url = post["images"]["orig"]["url"]

        return MappedItem({
            "id": post_id,
            "thread_id": post_id,
            "author": post["pinner"]["username"],
            "author_fullname": post["pinner"].get("fullName", post["pinner"].get("full_name", "")),
            "author_original": post["nativeCreator"]["username"] if post.get("nativeCreator") else post["pinner"]["username"],
            "body": post["description"].strip(),
            "subject": post["title"].strip(),
            "ai_description": post.get("auto_alt_text", ""),
            "pinner_original": post["originPinner"]["fullName"] if post.get("originPinner") else "",
            "pinner_via": post["viaPinner"]["fullName"] if post.get("viaPinner") else "",
            "board": post["board"]["name"],
            "board_pins": post["board"].get("pinCount", post["board"].get("pin_count")),
            "board_url": f"https://www.pinterest.com{post['board']['url']}",
            "timestamp": str_timestamp,
            "idea_tags": ",".join(post["pinJoin"]["visualAnnotation"]) if post.get("pinJoin") else "",
            "url": f"https://www.pinterest.com/pin/{post_id}",
            # these are not always available (shame)
            # "is_repin": "yes" if post["isRepin"] else "no",
            # "is_unsafe": "yes" if post["isUnsafe"] else "no",
            # "total_saves": post["aggregatedPinData"]["aggregatedStats"]["saves"],
            "is_video": "yes" if post.get("isVideo", post.get("videos")) else "no",
            "image_url": image_url,
            "dominant_colour": post.get("dominantColor", post.get("dominant_color")),
            "unix_timestamp": unix_timestamp
        })

    @staticmethod
    def map_item_from_html(post):
        """
        Map Pinterest object to 4CAT item

        These are from the HTML and have even less data than JSON objects...
        but enough to be useful in some cases. Fields that cannot be filled
        from the HTML object are marked as missing.

        :param dict post:  Pinterest object
        :return MappedItem:  Mapped item
        """
        return MappedItem({
            "id": int(post["id"]),
            "thread_id": int(post["id"]),
            "author": MissingMappedField(""),
            "author_fullname": MissingMappedField(""),
            "author_original": MissingMappedField(""),
            "body": post["body"].strip(),
            "subject": post["title"].strip(),
            "ai_description": MissingMappedField(""),
            "pinner_original": MissingMappedField(""),
            "pinner_via": MissingMappedField(""),
            "board": MissingMappedField(""),
            "board_pins": MissingMappedField(""),
            "board_url": MissingMappedField(""),
            "timestamp": MissingMappedField(""),  # there are no timestamps :(
            "idea_tags": ",".join(post["tags"]),
            "url": f"https://www.pinterest.com/pin/{post['id']}",
            # these are not always available (shame)
            # "is_repin": "yes" if post["isRepin"] else "no",
            # "is_unsafe": "yes" if post["isUnsafe"] else "no",
            # "total_saves": post["aggregatedPinData"]["aggregatedStats"]["saves"],
            "is_video": MissingMappedField(""),
            "image_url": post["image"],
            "dominant_colour": MissingMappedField(""),
            "unix_timestamp": MissingMappedField("")
        })
Import scraped Pinterest data
references =
['[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)', '[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)']
def
get_items(self, query):
def get_items(self, query):
    """
    Run custom search

    Pinterest data cannot be collected from within 4CAT, so this always
    fails; datasets for this data source are created by importing data.

    :param query:  Search query; not used
    :raises NotImplementedError:  Always
    """
    reason = "Pinterest datasets can only be created by importing data from elsewhere"
    raise NotImplementedError(reason)
Run custom search
Not available for Pinterest
@staticmethod
def
map_item(post):
@staticmethod
def map_item(post):
    """
    Map Pinterest object to 4CAT item

    Picks a mapper based on the `_zs-origin` key of the object: objects
    marked as `html` go to the HTML mapper, anything else (including
    objects without that key) goes to the JSON mapper. A lot of data is
    missing from HTML objects.

    :param dict post:  Pinterest object
    :return MappedItem:  Mapped item
    """
    captured_from_html = post.get("_zs-origin") == "html"
    if captured_from_html:
        mapper = SearchPinterest.map_item_from_html
    else:
        mapper = SearchPinterest.map_item_from_json

    return mapper(post)
Map Pinterest object to 4CAT item
Depending on whether the object was captured from JSON or HTML, treat it differently. A lot of data is missing from HTML objects.
Parameters
- post:
Returns
@staticmethod
def
map_item_from_json(post):
@staticmethod
def map_item_from_json(post):
    """
    Map Pinterest object to 4CAT item

    Pretty simple, except posts sometimes don't have timestamps :| but at
    least these objects are more complete than the HTML data usually.
    Several keys are looked up under both a camelCase and a snake_case
    name.

    :param dict post:  Pinterest object
    :return MappedItem:  Mapped item
    """
    try:
        # there are often no timestamps :'( - a missing value makes
        # strptime raise a TypeError, an unparseable one a ValueError
        timestamp = datetime.strptime(post.get("created_at", post.get("createdAt")), "%a, %d %b %Y %H:%M:%S %z")
        unix_timestamp = int(timestamp.timestamp())
        str_timestamp = timestamp.strftime("%Y-%m-%d %H:%M:%S")
    except (ValueError, TypeError):
        unix_timestamp = str_timestamp = MissingMappedField("")

    # prefer `entityId` and only require `id` when it is absent; passing
    # `post["id"]` as the default of `.get()` would evaluate it eagerly
    # and raise a KeyError for objects that only have `entityId`
    post_id = post["entityId"] if "entityId" in post else post["id"]

    if "imageSpec_orig" in post:
        image_url = post["imageSpec_orig"]["url"]
    else:
        image_url = post["images"]["orig"]["url"]

    return MappedItem({
        "id": post_id,
        "thread_id": post_id,
        "author": post["pinner"]["username"],
        "author_fullname": post["pinner"].get("fullName", post["pinner"].get("full_name", "")),
        "author_original": post["nativeCreator"]["username"] if post.get("nativeCreator") else post["pinner"]["username"],
        "body": post["description"].strip(),
        "subject": post["title"].strip(),
        "ai_description": post.get("auto_alt_text", ""),
        "pinner_original": post["originPinner"]["fullName"] if post.get("originPinner") else "",
        "pinner_via": post["viaPinner"]["fullName"] if post.get("viaPinner") else "",
        "board": post["board"]["name"],
        "board_pins": post["board"].get("pinCount", post["board"].get("pin_count")),
        "board_url": f"https://www.pinterest.com{post['board']['url']}",
        "timestamp": str_timestamp,
        "idea_tags": ",".join(post["pinJoin"]["visualAnnotation"]) if post.get("pinJoin") else "",
        "url": f"https://www.pinterest.com/pin/{post_id}",
        # these are not always available (shame)
        # "is_repin": "yes" if post["isRepin"] else "no",
        # "is_unsafe": "yes" if post["isUnsafe"] else "no",
        # "total_saves": post["aggregatedPinData"]["aggregatedStats"]["saves"],
        "is_video": "yes" if post.get("isVideo", post.get("videos")) else "no",
        "image_url": image_url,
        "dominant_colour": post.get("dominantColor", post.get("dominant_color")),
        "unix_timestamp": unix_timestamp
    })
Map Pinterest object to 4CAT item
Pretty simple, except posts sometimes don't have timestamps :| but at least these objects are more complete than the HTML data usually
Parameters
- dict post: Pinterest object
Returns
Mapped item
@staticmethod
def
map_item_from_html(post):
@staticmethod
def map_item_from_html(post):
    """
    Map Pinterest object to 4CAT item

    Objects captured from the HTML carry less data than JSON objects, so
    only the ID, body, title, tags, URL and image are filled in; every
    other field is marked as missing.

    :param dict post:  Pinterest object
    :return MappedItem:  Mapped item
    """
    numeric_id = int(post["id"])

    fields = {
        "id": numeric_id,
        "thread_id": numeric_id,
        "author": MissingMappedField(""),
        "author_fullname": MissingMappedField(""),
        "author_original": MissingMappedField(""),
        "body": post["body"].strip(),
        "subject": post["title"].strip(),
        "ai_description": MissingMappedField(""),
        "pinner_original": MissingMappedField(""),
        "pinner_via": MissingMappedField(""),
        "board": MissingMappedField(""),
        "board_pins": MissingMappedField(""),
        "board_url": MissingMappedField(""),
        # the HTML objects never include a timestamp
        "timestamp": MissingMappedField(""),
        "idea_tags": ",".join(post["tags"]),
        "url": f"https://www.pinterest.com/pin/{post['id']}",
        # these are not always available (shame)
        # "is_repin": "yes" if post["isRepin"] else "no",
        # "is_unsafe": "yes" if post["isUnsafe"] else "no",
        # "total_saves": post["aggregatedPinData"]["aggregatedStats"]["saves"],
        "is_video": MissingMappedField(""),
        "image_url": post["image"],
        "dominant_colour": MissingMappedField(""),
        "unix_timestamp": MissingMappedField("")
    }

    return MappedItem(fields)
Map Pinterest object to 4CAT item
These are from the HTML and have even less data than JSON objects... but enough to be useful in some cases.
Parameters
- dict post: Pinterest object
Returns
Mapped item
Inherited Members
- backend.lib.worker.BasicWorker
- BasicWorker
- INTERRUPT_NONE
- INTERRUPT_RETRY
- INTERRUPT_CANCEL
- queue
- log
- manager
- interrupted
- modules
- init_time
- name
- run
- clean_up
- request_interrupt
- is_4cat_class
- backend.lib.search.Search
- max_workers
- prefix
- return_cols
- import_error_count
- import_warning_count
- process
- search
- import_from_file
- items_to_csv
- items_to_ndjson
- items_to_archive
- backend.lib.processor.BasicProcessor
- db
- job
- dataset
- owner
- source_dataset
- source_file
- config
- is_running_in_preset
- filepath
- work
- after_process
- remove_files
- abort
- add_field_to_parent
- iterate_archive_contents
- unpack_archive_contents
- extract_archived_file_by_name
- write_csv_items_and_finish
- write_archive_and_finish
- create_standalone
- map_item_method_available
- get_mapped_item
- is_filter
- get_options
- get_status
- is_top_dataset
- is_from_collector
- get_extension
- is_rankable
- exclude_followup_processors
- is_4cat_processor