Edit on GitHub

backend.workers.expire_items

Delete old items

  1"""
  2Delete old items
  3"""
  4
  5import datetime
  6import time
  7import re
  8
  9from backend.lib.worker import BasicWorker
 10from common.lib.dataset import DataSet
 11from common.lib.exceptions import DataSetNotFoundException, WorkerInterruptedException
 12
 13from common.lib.user import User
 14from common.config_manager import ConfigWrapper
 15from email.mime.multipart import MIMEMultipart
 16from email.mime.text import MIMEText
 17from common.lib.helpers import send_email
 18
 19
 20class ThingExpirer(BasicWorker):
 21    """
 22    Delete old items
 23
 24    Deletes expired datasets. This may be useful for two reasons: to conserve
 25    disk space and if the user agreement of a particular data source does not
 26    allow storing scraped or extracted data for longer than a given amount of
 27    time, as is the case for e.g. Tumblr.
 28
 29    Also deletes users that have an expiration date that is not zero. Users
 30    with a close expiration date get a notification.
 31
 32    Also deletes expired notifications.
 33    """
 34
 35    type = "expire-datasets"
 36    max_workers = 1
 37
 38    expiry_notification_after_days = 7    
 39
 40    @classmethod
 41    def ensure_job(cls, config=None):
 42        """
 43        Ensure that the expirer is always running
 44
 45        This is used to ensure that the expirer is always running, and if it is
 46        not, it will be started by the WorkerManager.
 47
 48        :return:  Job parameters for the worker
 49        """
 50        return {"remote_id": "localhost", "interval": 1800}
 51
 52    def work(self):
 53        """
 54        Delete datasets, users and notifications
 55        """
 56
 57        self.expire_datasets()
 58        self.expire_users()
 59        self.expire_notifications()
 60
 61        self.job.finish()
 62
 63    def expire_datasets(self):
 64        """
 65        Delete expired datasets
 66        """
 67        # find candidates
 68        # todo: make this better - this can be a lot of datasets!
 69        datasets = self.db.fetchall("""
 70                                    SELECT *
 71                                    FROM datasets
 72                                    WHERE parameters::json->>'keep' IS NULL
 73                                    AND key_parent = ''
 74                                    """)
 75
 76        for dataset in datasets:
 77            # we only check datasets with no parent, because child datasets
 78            # inherit the ownership of the parent, and child datasets are
 79            # deleted when the parent is deleted
 80            if self.interrupted:
 81                raise WorkerInterruptedException("Interrupted while expiring datasets")
 82
 83            # the dataset creator's configuration context determines expiration
 84            try:
 85                dataset = DataSet(data=dataset, db=self.db, modules=self.modules)
 86                wrapper = ConfigWrapper(
 87                    self.config, user=User.get_by_name(self.db, dataset.creator)
 88                )
 89                if dataset.is_expired(config=wrapper):
 90                    self.log.info(f"Deleting dataset {dataset.key} (expired)")
 91                    dataset.delete(commit=False)
 92
 93            except DataSetNotFoundException:
 94                # dataset already deleted I guess?
 95                pass
 96
 97            finally:
 98                self.db.commit()
 99
100    def expire_users(self):
101        """
102        Delete expired users
103
104        Users can have a `delete-after` parameter in their user data which
105        indicates a date or time after which the account should be deleted.
106
107        The date can be in YYYY-MM-DD format or a unix (UTC) timestamp. If
108        the current date is after the given date the account is deleted. If the
109        expiration date is within 7 days a notification is added for the user
110        to warn them.
111        """
112        expiring_users = self.db.fetchall(
113            "SELECT * FROM users WHERE userdata::json->>'delete-after' IS NOT NULL;"
114        )
115        now = datetime.datetime.now()
116
117        for expiring_user in expiring_users:
118            if self.interrupted:
119                raise WorkerInterruptedException("Interrupted while expiring users")
120
121            user = User(db=self.db, data=expiring_user, config=self.config)
122            username = user.data["name"]
123
124            # parse expiration date if available
125            delete_after = user.get_value("delete-after")
126            if not delete_after:
127                continue
128
129            if re.match(r"^[0-9]{4}-[0-9]{2}-[0-9]{2}$", str(delete_after)):
130                expires_at = datetime.datetime.strptime(delete_after, "%Y-%m-%d")
131            elif re.match(r"^[0-9]+$", str(delete_after)):
132                expires_at = datetime.datetime.fromtimestamp(int(delete_after))
133            else:
134                self.log.warning(
135                    f"User {username} has invalid expiration date {delete_after}"
136                )
137                continue
138
139            # check if expired...
140            if expires_at < now:
141                self.log.info(f"User {username} expired - deleting user and datasets")
142                user.delete(modules=self.modules)
143            else:
144                warning_notification = f'WARNING: This account will be deleted at <time datetime="{expires_at.strftime("%C")}">{expires_at.strftime("%-d %B %Y %H:%M")}</time>. Make sure to back up your data before then.'
145                user.add_notification(warning_notification)
146
147                # If the account will be deleted within 7 days, try sending an email
148                try:
149                    delta = expires_at - now
150                    if datetime.timedelta(0) <= delta <= datetime.timedelta(days=self.expiry_notification_after_days):
151                        if user.get_value("expiry-email-sent", default=False):
152                            # already sent
153                            continue
154
155                        # Ensure mail is configured on this server and username looks like an email
156                        if self.config.get('mail.server') and re.match(r"[^@]+@[^@]+\.[^@]+", username):
157                            msg = MIMEMultipart("alternative")
158                            msg["From"] = self.config.get('mail.noreply')
159                            msg["To"] = username
160                            msg["Subject"] = "4CAT account expiration warning"
161
162                            plain = (
163                                f"Your 4CAT account '{username}' is scheduled for deletion on {expires_at.strftime('%C')}.\n"
164                                "Please back up your data before then."
165                            )
166
167                            html = (
168                                f"<p>Your 4CAT account <strong>{username}</strong> is scheduled for deletion at "
169                                f"<time datetime=\"{expires_at.strftime('%C')}\">{expires_at.strftime('%-d %B %Y %H:%M')}</time>.</p>"
170                                "<p>Please back up your data before then.</p>"
171                            )
172
173                            msg.attach(MIMEText(plain, "plain"))
174                            msg.attach(MIMEText(html, "html"))
175
176                            # send_email expects (recipient, message, mail_config)
177                            send_email([username], msg, self.config)
178                            # mark as sent
179                            user.set_value("expiry-email-sent", int(time.time()))
180                except Exception:
181                    # Don't let email failures interrupt the worker; just log
182                    self.log.warning(f"Failed to send expiration email to {username}")
183
184    def expire_notifications(self):
185        """
186        Delete expired notifications
187
188        Pretty simple!
189        """
190        self.db.execute(
191            f"DELETE FROM users_notifications WHERE timestamp_expires IS NOT NULL AND timestamp_expires < {time.time()}"
192        )
class ThingExpirer(backend.lib.worker.BasicWorker):
 21class ThingExpirer(BasicWorker):
 22    """
 23    Delete old items
 24
 25    Deletes expired datasets. This may be useful for two reasons: to conserve
 26    disk space and if the user agreement of a particular data source does not
 27    allow storing scraped or extracted data for longer than a given amount of
 28    time, as is the case for e.g. Tumblr.
 29
 30    Also deletes users that have an expiration date that is not zero. Users
 31    with a close expiration date get a notification.
 32
 33    Also deletes expired notifications.
 34    """
 35
 36    type = "expire-datasets"
 37    max_workers = 1
 38
 39    expiry_notification_after_days = 7    
 40
 41    @classmethod
 42    def ensure_job(cls, config=None):
 43        """
 44        Ensure that the expirer is always running
 45
 46        This is used to ensure that the expirer is always running, and if it is
 47        not, it will be started by the WorkerManager.
 48
 49        :return:  Job parameters for the worker
 50        """
 51        return {"remote_id": "localhost", "interval": 1800}
 52
 53    def work(self):
 54        """
 55        Delete datasets, users and notifications
 56        """
 57
 58        self.expire_datasets()
 59        self.expire_users()
 60        self.expire_notifications()
 61
 62        self.job.finish()
 63
 64    def expire_datasets(self):
 65        """
 66        Delete expired datasets
 67        """
 68        # find candidates
 69        # todo: make this better - this can be a lot of datasets!
 70        datasets = self.db.fetchall("""
 71                                    SELECT *
 72                                    FROM datasets
 73                                    WHERE parameters::json->>'keep' IS NULL
 74                                    AND key_parent = ''
 75                                    """)
 76
 77        for dataset in datasets:
 78            # we only check datasets with no parent, because child datasets
 79            # inherit the ownership of the parent, and child datasets are
 80            # deleted when the parent is deleted
 81            if self.interrupted:
 82                raise WorkerInterruptedException("Interrupted while expiring datasets")
 83
 84            # the dataset creator's configuration context determines expiration
 85            try:
 86                dataset = DataSet(data=dataset, db=self.db, modules=self.modules)
 87                wrapper = ConfigWrapper(
 88                    self.config, user=User.get_by_name(self.db, dataset.creator)
 89                )
 90                if dataset.is_expired(config=wrapper):
 91                    self.log.info(f"Deleting dataset {dataset.key} (expired)")
 92                    dataset.delete(commit=False)
 93
 94            except DataSetNotFoundException:
 95                # dataset already deleted I guess?
 96                pass
 97
 98            finally:
 99                self.db.commit()
100
101    def expire_users(self):
102        """
103        Delete expired users
104
105        Users can have a `delete-after` parameter in their user data which
106        indicates a date or time after which the account should be deleted.
107
108        The date can be in YYYY-MM-DD format or a unix (UTC) timestamp. If
109        the current date is after the given date the account is deleted. If the
110        expiration date is within 7 days a notification is added for the user
111        to warn them.
112        """
113        expiring_users = self.db.fetchall(
114            "SELECT * FROM users WHERE userdata::json->>'delete-after' IS NOT NULL;"
115        )
116        now = datetime.datetime.now()
117
118        for expiring_user in expiring_users:
119            if self.interrupted:
120                raise WorkerInterruptedException("Interrupted while expiring users")
121
122            user = User(db=self.db, data=expiring_user, config=self.config)
123            username = user.data["name"]
124
125            # parse expiration date if available
126            delete_after = user.get_value("delete-after")
127            if not delete_after:
128                continue
129
130            if re.match(r"^[0-9]{4}-[0-9]{2}-[0-9]{2}$", str(delete_after)):
131                expires_at = datetime.datetime.strptime(delete_after, "%Y-%m-%d")
132            elif re.match(r"^[0-9]+$", str(delete_after)):
133                expires_at = datetime.datetime.fromtimestamp(int(delete_after))
134            else:
135                self.log.warning(
136                    f"User {username} has invalid expiration date {delete_after}"
137                )
138                continue
139
140            # check if expired...
141            if expires_at < now:
142                self.log.info(f"User {username} expired - deleting user and datasets")
143                user.delete(modules=self.modules)
144            else:
145                warning_notification = f'WARNING: This account will be deleted at <time datetime="{expires_at.strftime("%C")}">{expires_at.strftime("%-d %B %Y %H:%M")}</time>. Make sure to back up your data before then.'
146                user.add_notification(warning_notification)
147
148                # If the account will be deleted within 7 days, try sending an email
149                try:
150                    delta = expires_at - now
151                    if datetime.timedelta(0) <= delta <= datetime.timedelta(days=self.expiry_notification_after_days):
152                        if user.get_value("expiry-email-sent", default=False):
153                            # already sent
154                            continue
155
156                        # Ensure mail is configured on this server and username looks like an email
157                        if self.config.get('mail.server') and re.match(r"[^@]+@[^@]+\.[^@]+", username):
158                            msg = MIMEMultipart("alternative")
159                            msg["From"] = self.config.get('mail.noreply')
160                            msg["To"] = username
161                            msg["Subject"] = "4CAT account expiration warning"
162
163                            plain = (
164                                f"Your 4CAT account '{username}' is scheduled for deletion on {expires_at.strftime('%C')}.\n"
165                                "Please back up your data before then."
166                            )
167
168                            html = (
169                                f"<p>Your 4CAT account <strong>{username}</strong> is scheduled for deletion at "
170                                f"<time datetime=\"{expires_at.strftime('%C')}\">{expires_at.strftime('%-d %B %Y %H:%M')}</time>.</p>"
171                                "<p>Please back up your data before then.</p>"
172                            )
173
174                            msg.attach(MIMEText(plain, "plain"))
175                            msg.attach(MIMEText(html, "html"))
176
177                            # send_email expects (recipient, message, mail_config)
178                            send_email([username], msg, self.config)
179                            # mark as sent
180                            user.set_value("expiry-email-sent", int(time.time()))
181                except Exception:
182                    # Don't let email failures interrupt the worker; just log
183                    self.log.warning(f"Failed to send expiration email to {username}")
184
185    def expire_notifications(self):
186        """
187        Delete expired notifications
188
189        Pretty simple!
190        """
191        self.db.execute(
192            f"DELETE FROM users_notifications WHERE timestamp_expires IS NOT NULL AND timestamp_expires < {time.time()}"
193        )

Delete old items

Deletes expired datasets. This may be useful for two reasons: to conserve disk space and if the user agreement of a particular data source does not allow storing scraped or extracted data for longer than a given amount of time, as is the case for e.g. Tumblr.

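Also deletes users that have a non-zero expiration date once that date has passed. Users that are scheduled for deletion but have not expired yet get a notification, and additionally a warning e-mail once the expiration date is at most 7 days away.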

Also deletes expired notifications.

type = 'expire-datasets'
max_workers = 1
expiry_notification_after_days = 7
@classmethod
def ensure_job(cls, config=None):
41    @classmethod
42    def ensure_job(cls, config=None):
43        """
44        Ensure that the expirer is always running
45
46        This is used to ensure that the expirer is always running, and if it is
47        not, it will be started by the WorkerManager.
48
49        :return:  Job parameters for the worker
50        """
51        return {"remote_id": "localhost", "interval": 1800}

Ensure that the expirer is always running

This is used to ensure that the expirer is always running, and if it is not, it will be started by the WorkerManager.

Returns

Job parameters for the worker

def work(self):
53    def work(self):
54        """
55        Delete datasets, users and notifications
56        """
57
58        self.expire_datasets()
59        self.expire_users()
60        self.expire_notifications()
61
62        self.job.finish()

Delete datasets, users and notifications

def expire_datasets(self):
64    def expire_datasets(self):
65        """
66        Delete expired datasets
67        """
68        # find candidates
69        # todo: make this better - this can be a lot of datasets!
70        datasets = self.db.fetchall("""
71                                    SELECT *
72                                    FROM datasets
73                                    WHERE parameters::json->>'keep' IS NULL
74                                    AND key_parent = ''
75                                    """)
76
77        for dataset in datasets:
78            # we only check datasets with no parent, because child datasets
79            # inherit the ownership of the parent, and child datasets are
80            # deleted when the parent is deleted
81            if self.interrupted:
82                raise WorkerInterruptedException("Interrupted while expiring datasets")
83
84            # the dataset creator's configuration context determines expiration
85            try:
86                dataset = DataSet(data=dataset, db=self.db, modules=self.modules)
87                wrapper = ConfigWrapper(
88                    self.config, user=User.get_by_name(self.db, dataset.creator)
89                )
90                if dataset.is_expired(config=wrapper):
91                    self.log.info(f"Deleting dataset {dataset.key} (expired)")
92                    dataset.delete(commit=False)
93
94            except DataSetNotFoundException:
95                # dataset already deleted I guess?
96                pass
97
98            finally:
99                self.db.commit()

Delete expired datasets

def expire_users(self):
101    def expire_users(self):
102        """
103        Delete expired users
104
105        Users can have a `delete-after` parameter in their user data which
106        indicates a date or time after which the account should be deleted.
107
108        The date can be in YYYY-MM-DD format or a unix (UTC) timestamp. If
109        the current date is after the given date the account is deleted. If the
110        expiration date is within 7 days a notification is added for the user
111        to warn them.
112        """
113        expiring_users = self.db.fetchall(
114            "SELECT * FROM users WHERE userdata::json->>'delete-after' IS NOT NULL;"
115        )
116        now = datetime.datetime.now()
117
118        for expiring_user in expiring_users:
119            if self.interrupted:
120                raise WorkerInterruptedException("Interrupted while expiring users")
121
122            user = User(db=self.db, data=expiring_user, config=self.config)
123            username = user.data["name"]
124
125            # parse expiration date if available
126            delete_after = user.get_value("delete-after")
127            if not delete_after:
128                continue
129
130            if re.match(r"^[0-9]{4}-[0-9]{2}-[0-9]{2}$", str(delete_after)):
131                expires_at = datetime.datetime.strptime(delete_after, "%Y-%m-%d")
132            elif re.match(r"^[0-9]+$", str(delete_after)):
133                expires_at = datetime.datetime.fromtimestamp(int(delete_after))
134            else:
135                self.log.warning(
136                    f"User {username} has invalid expiration date {delete_after}"
137                )
138                continue
139
140            # check if expired...
141            if expires_at < now:
142                self.log.info(f"User {username} expired - deleting user and datasets")
143                user.delete(modules=self.modules)
144            else:
145                warning_notification = f'WARNING: This account will be deleted at <time datetime="{expires_at.strftime("%C")}">{expires_at.strftime("%-d %B %Y %H:%M")}</time>. Make sure to back up your data before then.'
146                user.add_notification(warning_notification)
147
148                # If the account will be deleted within 7 days, try sending an email
149                try:
150                    delta = expires_at - now
151                    if datetime.timedelta(0) <= delta <= datetime.timedelta(days=self.expiry_notification_after_days):
152                        if user.get_value("expiry-email-sent", default=False):
153                            # already sent
154                            continue
155
156                        # Ensure mail is configured on this server and username looks like an email
157                        if self.config.get('mail.server') and re.match(r"[^@]+@[^@]+\.[^@]+", username):
158                            msg = MIMEMultipart("alternative")
159                            msg["From"] = self.config.get('mail.noreply')
160                            msg["To"] = username
161                            msg["Subject"] = "4CAT account expiration warning"
162
163                            plain = (
164                                f"Your 4CAT account '{username}' is scheduled for deletion on {expires_at.strftime('%C')}.\n"
165                                "Please back up your data before then."
166                            )
167
168                            html = (
169                                f"<p>Your 4CAT account <strong>{username}</strong> is scheduled for deletion at "
170                                f"<time datetime=\"{expires_at.strftime('%C')}\">{expires_at.strftime('%-d %B %Y %H:%M')}</time>.</p>"
171                                "<p>Please back up your data before then.</p>"
172                            )
173
174                            msg.attach(MIMEText(plain, "plain"))
175                            msg.attach(MIMEText(html, "html"))
176
177                            # send_email expects (recipient, message, mail_config)
178                            send_email([username], msg, self.config)
179                            # mark as sent
180                            user.set_value("expiry-email-sent", int(time.time()))
181                except Exception:
182                    # Don't let email failures interrupt the worker; just log
183                    self.log.warning(f"Failed to send expiration email to {username}")

Delete expired users

Users can have a delete-after parameter in their user data which indicates a date or time after which the account should be deleted.

The date can be in YYYY-MM-DD format or a unix (UTC) timestamp. If the current date is after the given date the account is deleted. Otherwise a notification is added for the user to warn them, and if the expiration date is within 7 days a warning e-mail is sent as well.

def expire_notifications(self):
185    def expire_notifications(self):
186        """
187        Delete expired notifications
188
189        Pretty simple!
190        """
191        self.db.execute(
192            f"DELETE FROM users_notifications WHERE timestamp_expires IS NOT NULL AND timestamp_expires < {time.time()}"
193        )

Delete expired notifications

Pretty simple!