Edit on GitHub

datasources.facebook.search_facebook

Import scraped Facebook data

It's prohibitively difficult to scrape data from Facebook within 4CAT itself due to its aggressive rate limiting. Instead, import data collected elsewhere.

 1"""
 2Import scraped Facebook data
 3
 4It's prohibitively difficult to scrape data from Facebook within 4CAT itself due
 5to its aggressive rate limiting. Instead, import data collected elsewhere.
 6"""
 7from datetime import datetime
 8import json
 9
10from backend.lib.search import Search
11from common.lib.item_mapping import MappedItem
12
13
class SearchFacebook(Search):
    """
    Import scraped Facebook data
    """
    type = "facebook-search"  # job ID
    category = "Search"  # category
    title = "Import scraped Facebook data"  # title displayed in UI
    description = "Import Facebook data collected with an external tool such as Zeeschuimer."  # description displayed in UI
    extension = "ndjson"  # extension of result file, used internally and in UI
    is_from_zeeschuimer = True

    # not available as a processor for existing datasets
    accepts = [None]
    references = [
        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)"
    ]

    def get_items(self, query):
        """
        Run custom search

        Not available for Facebook; datasets can only be created by importing
        data captured externally (e.g. with Zeeschuimer).

        :param query:  Query parameters (unused)
        :raises NotImplementedError:  Always, since direct search is unsupported
        """
        raise NotImplementedError("Facebook datasets can only be created by importing data from elsewhere")

    @staticmethod
    def map_item(post):
        """
        Map an imported Facebook post to a flat 4CAT item

        Missing required keys (e.g. an unexpected post structure) will raise
        a KeyError, which the caller is expected to handle.

        :param dict post:  Raw post object as captured by Zeeschuimer
        :return MappedItem:  Flattened post data
        """
        main_data = post["comet_sections"]["content"]["story"]

        # lol, get a load of this
        # pick the metadata section that carries the post's creation timestamp;
        # assumes at least one such section is present in the capture
        metadata = [m for m in post["comet_sections"]["context_layout"]["story"]["comet_sections"]["metadata"] if m["__typename"] == "CometFeedStoryMinimizedTimestampStrategy"].pop(0)["story"]
        post_timestamp = datetime.fromtimestamp(int(metadata["creation_time"]))

        # group posts have /groups/<name-or-id>/ in their URL
        in_group = "/groups/" in metadata["url"]
        group = ""
        if in_group:
            group = metadata["url"].split("/groups/")[1].split("/")[0]

        author = main_data["actors"][0]

        image_urls = []
        video_urls = []  # NOTE(review): never populated — video attachments are not extracted yet
        for attachment in main_data["attachments"]:
            if attachment["target"]["__typename"] == "Photo":
                image_urls.append(f"https://www.facebook.com/photo/?fbid={attachment['target']['id']}")

        return MappedItem({
            "id": main_data["post_id"],
            "url": main_data["wwwURL"],
            "body": main_data.get("message", {}).get("text", ""),
            "timestamp": post_timestamp.strftime("%Y-%m-%d %H:%M:%S"),
            # last URL path component is the author's vanity name or numeric ID;
            # default to "" so a missing URL does not raise AttributeError
            "author": author.get("url", "").split("/")[-1],
            "author_name": author.get("name", ""),
            "image_url": ",".join(image_urls),
            "video_url": ",".join(video_urls),
            "is_in_group": "yes" if in_group else "no",
            "group_name": group,
            "unix_timestamp": int(post_timestamp.timestamp()),
        })
class SearchFacebook(backend.lib.search.Search):
15class SearchFacebook(Search):
16    """
17    Import scraped 9gag data
18    """
19    type = "facebook-search"  # job ID
20    category = "Search"  # category
21    title = "Import scraped Facebook data"  # title displayed in UI
22    description = "Import Facebook data collected with an external tool such as Zeeschuimer."  # description displayed in UI
23    extension = "ndjson"  # extension of result file, used internally and in UI
24    is_from_zeeschuimer = True
25
26    # not available as a processor for existing datasets
27    accepts = [None]
28    references = [
29        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
30        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)"
31    ]
32
33    def get_items(self, query):
34        """
35        Run custom search
36
37        Not available for 9gag
38        """
39        raise NotImplementedError("Facebook datasets can only be created by importing data from elsewhere")
40
41    @staticmethod
42    def map_item(post):
43        try:
44            main_data = post["comet_sections"]["content"]["story"]
45        except Exception as e:
46            print(json.dumps(post, indent=2))
47            raise e
48
49        # lol, get a load of this
50        metadata = [m for m in post["comet_sections"]["context_layout"]["story"]["comet_sections"]["metadata"] if m["__typename"] == "CometFeedStoryMinimizedTimestampStrategy"].pop(0)["story"]
51        post_timestamp = datetime.fromtimestamp(int(metadata["creation_time"]))
52
53        in_group = "/groups/" in metadata["url"]
54        group = ""
55        if in_group:
56            group = metadata["url"].split("/groups/")[1].split("/")[0]
57
58        author = main_data["actors"][0]
59
60        image_urls = []
61        video_urls = []
62        for attachment in main_data["attachments"]:
63            if attachment["target"]["__typename"] == "Photo":
64                image_urls.append(f"https://www.facebook.com/photo/?fbid={attachment['target']['id']}")
65
66
67        return MappedItem({
68            "id": main_data["post_id"],
69            "url": main_data["wwwURL"],
70            "body": main_data.get("message", {}).get("text", ""),
71            "timestamp": post_timestamp.strftime("%Y-%m-%d %H:%M:%S"),
72            "author": author.get("url").split("/")[-1],
73            "author_name": author.get("name", ""),
74            "image_url": ",".join(image_urls),
75            "video_url": ",".join(video_urls),
76            "is_in_group": "yes" if in_group else "no",
77            "group_name": group,
78            "unix_timestamp": int(post_timestamp.timestamp()),
79        })

Import scraped Facebook data

type = 'facebook-search'
category = 'Search'
title = 'Import scraped Facebook data'
description = 'Import Facebook data collected with an external tool such as Zeeschuimer.'
extension = 'ndjson'
is_from_zeeschuimer = True
accepts = [None]
references = ['[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)', '[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)']
def get_items(self, query):
33    def get_items(self, query):
34        """
35        Run custom search
36
37        Not available for 9gag
38        """
39        raise NotImplementedError("Facebook datasets can only be created by importing data from elsewhere")

Run custom search

Not available for Facebook

@staticmethod
def map_item(post):
41    @staticmethod
42    def map_item(post):
43        try:
44            main_data = post["comet_sections"]["content"]["story"]
45        except Exception as e:
46            print(json.dumps(post, indent=2))
47            raise e
48
49        # lol, get a load of this
50        metadata = [m for m in post["comet_sections"]["context_layout"]["story"]["comet_sections"]["metadata"] if m["__typename"] == "CometFeedStoryMinimizedTimestampStrategy"].pop(0)["story"]
51        post_timestamp = datetime.fromtimestamp(int(metadata["creation_time"]))
52
53        in_group = "/groups/" in metadata["url"]
54        group = ""
55        if in_group:
56            group = metadata["url"].split("/groups/")[1].split("/")[0]
57
58        author = main_data["actors"][0]
59
60        image_urls = []
61        video_urls = []
62        for attachment in main_data["attachments"]:
63            if attachment["target"]["__typename"] == "Photo":
64                image_urls.append(f"https://www.facebook.com/photo/?fbid={attachment['target']['id']}")
65
66
67        return MappedItem({
68            "id": main_data["post_id"],
69            "url": main_data["wwwURL"],
70            "body": main_data.get("message", {}).get("text", ""),
71            "timestamp": post_timestamp.strftime("%Y-%m-%d %H:%M:%S"),
72            "author": author.get("url").split("/")[-1],
73            "author_name": author.get("name", ""),
74            "image_url": ",".join(image_urls),
75            "video_url": ",".join(video_urls),
76            "is_in_group": "yes" if in_group else "no",
77            "group_name": group,
78            "unix_timestamp": int(post_timestamp.timestamp()),
79        })