backend.workers.expire_items
Delete old items
"""
Delete old items
"""

import datetime
import re
import time

from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

from backend.lib.worker import BasicWorker
from common.config_manager import ConfigWrapper
from common.lib.dataset import DataSet
from common.lib.exceptions import DataSetNotFoundException, WorkerInterruptedException
from common.lib.helpers import send_email
from common.lib.user import User


class ThingExpirer(BasicWorker):
    """
    Delete old items

    Deletes expired datasets. This may be useful for two reasons: to conserve
    disk space and if the user agreement of a particular data source does not
    allow storing scraped or extracted data for longer than a given amount of
    time, as is the case for e.g. Tumblr.

    Also deletes users that have an expiration date that is not zero. Users
    that have not expired yet get a notification, and an e-mail once their
    expiration date is close.

    Also deletes expired notifications.
    """

    type = "expire-datasets"
    max_workers = 1

    # size, in days, of the window before expiration in which a warning
    # e-mail is sent to the user
    expiry_notification_after_days = 7

    @classmethod
    def ensure_job(cls, config=None):
        """
        Ensure that the expirer is always running

        This is used to ensure that the expirer is always running, and if it is
        not, it will be started by the WorkerManager.

        :param config:  Not used by this worker
        :return:  Job parameters for the worker
        """
        job_parameters = {"remote_id": "localhost", "interval": 1800}
        return job_parameters

    def work(self):
        """
        Delete datasets, users and notifications

        Runs the three expiration passes in a fixed order and then marks the
        job as finished.
        """
        passes = (self.expire_datasets, self.expire_users, self.expire_notifications)
        for run_pass in passes:
            run_pass()

        self.job.finish()

    def expire_datasets(self):
        """
        Delete expired datasets

        Considers every dataset without a parent and without a `keep`
        parameter, and deletes those that report themselves as expired under
        the configuration context of their creator.

        :raises WorkerInterruptedException:  If the worker is interrupted
        while looping through the datasets
        """
        # find candidates
        # todo: make this better - this can be a lot of datasets!
        candidates = self.db.fetchall("""
            SELECT *
              FROM datasets
             WHERE parameters::json->>'keep' IS NULL
               AND key_parent = ''
        """)

        for row in candidates:
            # we only check datasets with no parent, because child datasets
            # inherit the ownership of the parent, and child datasets are
            # deleted when the parent is deleted
            if self.interrupted:
                raise WorkerInterruptedException("Interrupted while expiring datasets")

            try:
                dataset = DataSet(data=row, db=self.db, modules=self.modules)
                # the dataset creator's configuration context determines expiration
                creator = User.get_by_name(self.db, dataset.creator)
                if dataset.is_expired(config=ConfigWrapper(self.config, user=creator)):
                    self.log.info(f"Deleting dataset {dataset.key} (expired)")
                    dataset.delete(commit=False)

            except DataSetNotFoundException:
                # dataset already deleted I guess?
                pass

            finally:
                # commit after every candidate, whether or not it was deleted
                self.db.commit()

    def expire_users(self):
        """
        Delete expired users

        Users can have a `delete-after` parameter in their user data which
        indicates a date or time after which the account should be deleted.

        The date can be in YYYY-MM-DD format or a unix timestamp. If the
        current date is after the given date the account is deleted. Otherwise
        a notification is added for the user to warn them. If the expiration
        date is additionally within `expiry_notification_after_days` days,
        mail is configured and the username looks like an e-mail address, a
        warning e-mail is sent once.

        Values that cannot be parsed as a date are logged and skipped.

        :raises WorkerInterruptedException:  If the worker is interrupted
        while looping through the users
        """
        # local import so this method does not depend on the file's import block
        from html import escape

        expiring_users = self.db.fetchall(
            "SELECT * FROM users WHERE userdata::json->>'delete-after' IS NOT NULL;"
        )
        now = datetime.datetime.now()
        email_window = datetime.timedelta(days=self.expiry_notification_after_days)

        for expiring_user in expiring_users:
            if self.interrupted:
                raise WorkerInterruptedException("Interrupted while expiring users")

            user = User(db=self.db, data=expiring_user, config=self.config)
            username = user.data["name"]

            # parse expiration date if available
            delete_after = user.get_value("delete-after")
            if not delete_after:
                continue

            # the stored value may be a string or a number; normalise once so
            # the pattern checks and the parsers all see the same text
            delete_after = str(delete_after)
            try:
                if re.match(r"^[0-9]{4}-[0-9]{2}-[0-9]{2}$", delete_after):
                    expires_at = datetime.datetime.strptime(delete_after, "%Y-%m-%d")
                elif re.match(r"^[0-9]+$", delete_after):
                    expires_at = datetime.datetime.fromtimestamp(int(delete_after))
                else:
                    expires_at = None
            except (ValueError, OverflowError, OSError):
                # matches the pattern but is not a real date (e.g. month 13)
                # or is a timestamp outside the supported range
                expires_at = None

            if expires_at is None:
                self.log.warning(
                    f"User {username} has invalid expiration date {delete_after}"
                )
                continue

            # check if expired...
            if expires_at < now:
                self.log.info(f"User {username} expired - deleting user and datasets")
                user.delete(modules=self.modules)
                continue

            # machine-readable value for the <time> element
            expires_iso = expires_at.isoformat()
            # `.day` instead of the glibc-only "%-d" directive
            expires_human = f"{expires_at.day} {expires_at.strftime('%B %Y %H:%M')}"

            warning_notification = (
                f'WARNING: This account will be deleted at <time datetime="{expires_iso}">'
                f"{expires_human}</time>. Make sure to back up your data before then."
            )
            user.add_notification(warning_notification)

            # If the account will be deleted within the warning window, try
            # sending an email
            try:
                if expires_at - now > email_window:
                    continue

                if user.get_value("expiry-email-sent", default=False):
                    # already sent
                    continue

                # Ensure mail is configured on this server and username looks like an email
                if not (self.config.get("mail.server") and re.match(r"[^@]+@[^@]+\.[^@]+", username)):
                    continue

                msg = MIMEMultipart("alternative")
                msg["From"] = self.config.get("mail.noreply")
                msg["To"] = username
                msg["Subject"] = "4CAT account expiration warning"

                plain = (
                    f"Your 4CAT account '{username}' is scheduled for deletion on {expires_human}.\n"
                    "Please back up your data before then."
                )

                # the username is arbitrary text, so escape it for HTML
                html_body = (
                    f"<p>Your 4CAT account <strong>{escape(username)}</strong> is scheduled for deletion at "
                    f'<time datetime="{expires_iso}">{expires_human}</time>.</p>'
                    "<p>Please back up your data before then.</p>"
                )

                msg.attach(MIMEText(plain, "plain"))
                msg.attach(MIMEText(html_body, "html"))

                # send_email expects (recipient, message, mail_config)
                send_email([username], msg, self.config)
                # mark as sent
                user.set_value("expiry-email-sent", int(time.time()))
            except Exception as e:
                # Don't let email failures interrupt the worker; just log
                self.log.warning(f"Failed to send expiration email to {username}: {e}")

    def expire_notifications(self):
        """
        Delete expired notifications

        Removes every notification whose `timestamp_expires` is set and lies
        before the current time.
        """
        cutoff = time.time()
        query = f"DELETE FROM users_notifications WHERE timestamp_expires IS NOT NULL AND timestamp_expires < {cutoff}"
        self.db.execute(query)
class ThingExpirer(BasicWorker):
    """
    Delete old items

    Deletes expired datasets. This may be useful for two reasons: to conserve
    disk space and if the user agreement of a particular data source does not
    allow storing scraped or extracted data for longer than a given amount of
    time, as is the case for e.g. Tumblr.

    Also deletes users that have an expiration date that is not zero. Users
    that have not expired yet get a notification, and an e-mail once their
    expiration date is close.

    Also deletes expired notifications.
    """

    type = "expire-datasets"
    max_workers = 1

    # size, in days, of the window before expiration in which a warning
    # e-mail is sent to the user
    expiry_notification_after_days = 7

    @classmethod
    def ensure_job(cls, config=None):
        """
        Ensure that the expirer is always running

        This is used to ensure that the expirer is always running, and if it is
        not, it will be started by the WorkerManager.

        :param config:  Not used by this worker
        :return:  Job parameters for the worker
        """
        job_parameters = {"remote_id": "localhost", "interval": 1800}
        return job_parameters

    def work(self):
        """
        Delete datasets, users and notifications

        Runs the three expiration passes in a fixed order and then marks the
        job as finished.
        """
        passes = (self.expire_datasets, self.expire_users, self.expire_notifications)
        for run_pass in passes:
            run_pass()

        self.job.finish()

    def expire_datasets(self):
        """
        Delete expired datasets

        Considers every dataset without a parent and without a `keep`
        parameter, and deletes those that report themselves as expired under
        the configuration context of their creator.

        :raises WorkerInterruptedException:  If the worker is interrupted
        while looping through the datasets
        """
        # find candidates
        # todo: make this better - this can be a lot of datasets!
        candidates = self.db.fetchall("""
            SELECT *
              FROM datasets
             WHERE parameters::json->>'keep' IS NULL
               AND key_parent = ''
        """)

        for row in candidates:
            # we only check datasets with no parent, because child datasets
            # inherit the ownership of the parent, and child datasets are
            # deleted when the parent is deleted
            if self.interrupted:
                raise WorkerInterruptedException("Interrupted while expiring datasets")

            try:
                dataset = DataSet(data=row, db=self.db, modules=self.modules)
                # the dataset creator's configuration context determines expiration
                creator = User.get_by_name(self.db, dataset.creator)
                if dataset.is_expired(config=ConfigWrapper(self.config, user=creator)):
                    self.log.info(f"Deleting dataset {dataset.key} (expired)")
                    dataset.delete(commit=False)

            except DataSetNotFoundException:
                # dataset already deleted I guess?
                pass

            finally:
                # commit after every candidate, whether or not it was deleted
                self.db.commit()

    def expire_users(self):
        """
        Delete expired users

        Users can have a `delete-after` parameter in their user data which
        indicates a date or time after which the account should be deleted.

        The date can be in YYYY-MM-DD format or a unix timestamp. If the
        current date is after the given date the account is deleted. Otherwise
        a notification is added for the user to warn them. If the expiration
        date is additionally within `expiry_notification_after_days` days,
        mail is configured and the username looks like an e-mail address, a
        warning e-mail is sent once.

        Values that cannot be parsed as a date are logged and skipped.

        :raises WorkerInterruptedException:  If the worker is interrupted
        while looping through the users
        """
        # local import so this method does not depend on the file's import block
        from html import escape

        expiring_users = self.db.fetchall(
            "SELECT * FROM users WHERE userdata::json->>'delete-after' IS NOT NULL;"
        )
        now = datetime.datetime.now()
        email_window = datetime.timedelta(days=self.expiry_notification_after_days)

        for expiring_user in expiring_users:
            if self.interrupted:
                raise WorkerInterruptedException("Interrupted while expiring users")

            user = User(db=self.db, data=expiring_user, config=self.config)
            username = user.data["name"]

            # parse expiration date if available
            delete_after = user.get_value("delete-after")
            if not delete_after:
                continue

            # the stored value may be a string or a number; normalise once so
            # the pattern checks and the parsers all see the same text
            delete_after = str(delete_after)
            try:
                if re.match(r"^[0-9]{4}-[0-9]{2}-[0-9]{2}$", delete_after):
                    expires_at = datetime.datetime.strptime(delete_after, "%Y-%m-%d")
                elif re.match(r"^[0-9]+$", delete_after):
                    expires_at = datetime.datetime.fromtimestamp(int(delete_after))
                else:
                    expires_at = None
            except (ValueError, OverflowError, OSError):
                # matches the pattern but is not a real date (e.g. month 13)
                # or is a timestamp outside the supported range
                expires_at = None

            if expires_at is None:
                self.log.warning(
                    f"User {username} has invalid expiration date {delete_after}"
                )
                continue

            # check if expired...
            if expires_at < now:
                self.log.info(f"User {username} expired - deleting user and datasets")
                user.delete(modules=self.modules)
                continue

            # machine-readable value for the <time> element
            expires_iso = expires_at.isoformat()
            # `.day` instead of the glibc-only "%-d" directive
            expires_human = f"{expires_at.day} {expires_at.strftime('%B %Y %H:%M')}"

            warning_notification = (
                f'WARNING: This account will be deleted at <time datetime="{expires_iso}">'
                f"{expires_human}</time>. Make sure to back up your data before then."
            )
            user.add_notification(warning_notification)

            # If the account will be deleted within the warning window, try
            # sending an email
            try:
                if expires_at - now > email_window:
                    continue

                if user.get_value("expiry-email-sent", default=False):
                    # already sent
                    continue

                # Ensure mail is configured on this server and username looks like an email
                if not (self.config.get("mail.server") and re.match(r"[^@]+@[^@]+\.[^@]+", username)):
                    continue

                msg = MIMEMultipart("alternative")
                msg["From"] = self.config.get("mail.noreply")
                msg["To"] = username
                msg["Subject"] = "4CAT account expiration warning"

                plain = (
                    f"Your 4CAT account '{username}' is scheduled for deletion on {expires_human}.\n"
                    "Please back up your data before then."
                )

                # the username is arbitrary text, so escape it for HTML
                html_body = (
                    f"<p>Your 4CAT account <strong>{escape(username)}</strong> is scheduled for deletion at "
                    f'<time datetime="{expires_iso}">{expires_human}</time>.</p>'
                    "<p>Please back up your data before then.</p>"
                )

                msg.attach(MIMEText(plain, "plain"))
                msg.attach(MIMEText(html_body, "html"))

                # send_email expects (recipient, message, mail_config)
                send_email([username], msg, self.config)
                # mark as sent
                user.set_value("expiry-email-sent", int(time.time()))
            except Exception as e:
                # Don't let email failures interrupt the worker; just log
                self.log.warning(f"Failed to send expiration email to {username}: {e}")

    def expire_notifications(self):
        """
        Delete expired notifications

        Removes every notification whose `timestamp_expires` is set and lies
        before the current time.
        """
        cutoff = time.time()
        query = f"DELETE FROM users_notifications WHERE timestamp_expires IS NOT NULL AND timestamp_expires < {cutoff}"
        self.db.execute(query)
Delete old items
Deletes expired datasets. This may be useful for two reasons: to conserve disk space and if the user agreement of a particular data source does not allow storing scraped or extracted data for longer than a given amount of time, as is the case for e.g. Tumblr.
Also deletes users that have an expiration date that is not zero. Users with a close expiration date get a notification.
Also deletes expired notifications.
@classmethod
def ensure_job(cls, config=None):
    """
    Ensure that the expirer is always running

    This is used to ensure that the expirer is always running, and if it is
    not, it will be started by the WorkerManager.

    :param config:  Not used by this worker
    :return:  Job parameters for the worker
    """
    job_parameters = {"remote_id": "localhost", "interval": 1800}
    return job_parameters
Ensure that the expirer is always running
This is used to ensure that the expirer is always running, and if it is not, it will be started by the WorkerManager.
Returns
Job parameters for the worker
def work(self):
    """
    Delete datasets, users and notifications

    Runs the three expiration passes in a fixed order and then marks the
    job as finished.
    """
    passes = (self.expire_datasets, self.expire_users, self.expire_notifications)
    for run_pass in passes:
        run_pass()

    self.job.finish()
Delete datasets, users and notifications
def expire_datasets(self):
    """
    Delete expired datasets

    Considers every dataset without a parent and without a `keep`
    parameter, and deletes those that report themselves as expired under
    the configuration context of their creator.

    :raises WorkerInterruptedException:  If the worker is interrupted
    while looping through the datasets
    """
    # find candidates
    # todo: make this better - this can be a lot of datasets!
    candidates = self.db.fetchall("""
        SELECT *
          FROM datasets
         WHERE parameters::json->>'keep' IS NULL
           AND key_parent = ''
    """)

    for row in candidates:
        # we only check datasets with no parent, because child datasets
        # inherit the ownership of the parent, and child datasets are
        # deleted when the parent is deleted
        if self.interrupted:
            raise WorkerInterruptedException("Interrupted while expiring datasets")

        try:
            dataset = DataSet(data=row, db=self.db, modules=self.modules)
            # the dataset creator's configuration context determines expiration
            creator = User.get_by_name(self.db, dataset.creator)
            if dataset.is_expired(config=ConfigWrapper(self.config, user=creator)):
                self.log.info(f"Deleting dataset {dataset.key} (expired)")
                dataset.delete(commit=False)

        except DataSetNotFoundException:
            # dataset already deleted I guess?
            pass

        finally:
            # commit after every candidate, whether or not it was deleted
            self.db.commit()
Delete expired datasets
def expire_users(self):
    """
    Delete expired users

    Users can have a `delete-after` parameter in their user data which
    indicates a date or time after which the account should be deleted.

    The date can be in YYYY-MM-DD format or a unix timestamp. If the
    current date is after the given date the account is deleted. Otherwise
    a notification is added for the user to warn them. If the expiration
    date is additionally within `expiry_notification_after_days` days,
    mail is configured and the username looks like an e-mail address, a
    warning e-mail is sent once.

    Values that cannot be parsed as a date are logged and skipped.

    :raises WorkerInterruptedException:  If the worker is interrupted
    while looping through the users
    """
    # local import so this method does not depend on the file's import block
    from html import escape

    expiring_users = self.db.fetchall(
        "SELECT * FROM users WHERE userdata::json->>'delete-after' IS NOT NULL;"
    )
    now = datetime.datetime.now()
    email_window = datetime.timedelta(days=self.expiry_notification_after_days)

    for expiring_user in expiring_users:
        if self.interrupted:
            raise WorkerInterruptedException("Interrupted while expiring users")

        user = User(db=self.db, data=expiring_user, config=self.config)
        username = user.data["name"]

        # parse expiration date if available
        delete_after = user.get_value("delete-after")
        if not delete_after:
            continue

        # the stored value may be a string or a number; normalise once so
        # the pattern checks and the parsers all see the same text
        delete_after = str(delete_after)
        try:
            if re.match(r"^[0-9]{4}-[0-9]{2}-[0-9]{2}$", delete_after):
                expires_at = datetime.datetime.strptime(delete_after, "%Y-%m-%d")
            elif re.match(r"^[0-9]+$", delete_after):
                expires_at = datetime.datetime.fromtimestamp(int(delete_after))
            else:
                expires_at = None
        except (ValueError, OverflowError, OSError):
            # matches the pattern but is not a real date (e.g. month 13)
            # or is a timestamp outside the supported range
            expires_at = None

        if expires_at is None:
            self.log.warning(
                f"User {username} has invalid expiration date {delete_after}"
            )
            continue

        # check if expired...
        if expires_at < now:
            self.log.info(f"User {username} expired - deleting user and datasets")
            user.delete(modules=self.modules)
            continue

        # machine-readable value for the <time> element
        expires_iso = expires_at.isoformat()
        # `.day` instead of the glibc-only "%-d" directive
        expires_human = f"{expires_at.day} {expires_at.strftime('%B %Y %H:%M')}"

        warning_notification = (
            f'WARNING: This account will be deleted at <time datetime="{expires_iso}">'
            f"{expires_human}</time>. Make sure to back up your data before then."
        )
        user.add_notification(warning_notification)

        # If the account will be deleted within the warning window, try
        # sending an email
        try:
            if expires_at - now > email_window:
                continue

            if user.get_value("expiry-email-sent", default=False):
                # already sent
                continue

            # Ensure mail is configured on this server and username looks like an email
            if not (self.config.get("mail.server") and re.match(r"[^@]+@[^@]+\.[^@]+", username)):
                continue

            msg = MIMEMultipart("alternative")
            msg["From"] = self.config.get("mail.noreply")
            msg["To"] = username
            msg["Subject"] = "4CAT account expiration warning"

            plain = (
                f"Your 4CAT account '{username}' is scheduled for deletion on {expires_human}.\n"
                "Please back up your data before then."
            )

            # the username is arbitrary text, so escape it for HTML
            html_body = (
                f"<p>Your 4CAT account <strong>{escape(username)}</strong> is scheduled for deletion at "
                f'<time datetime="{expires_iso}">{expires_human}</time>.</p>'
                "<p>Please back up your data before then.</p>"
            )

            msg.attach(MIMEText(plain, "plain"))
            msg.attach(MIMEText(html_body, "html"))

            # send_email expects (recipient, message, mail_config)
            send_email([username], msg, self.config)
            # mark as sent
            user.set_value("expiry-email-sent", int(time.time()))
        except Exception as e:
            # Don't let email failures interrupt the worker; just log
            self.log.warning(f"Failed to send expiration email to {username}: {e}")
Delete expired users
Users can have a `delete-after` parameter in their user data which indicates a date or time after which the account should be deleted.
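The date can be in YYYY-MM-DD format or a unix (UTC) timestamp. If the current date is after the given date the account is deleted. Otherwise a notification is added for the user to warn them, and if the expiration date is within 7 days a warning e-mail is also sent (once), provided mail is configured and the username looks like an e-mail address.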
def expire_notifications(self):
    """
    Delete expired notifications

    Removes every notification whose `timestamp_expires` is set and lies
    before the current time.
    """
    cutoff = time.time()
    query = f"DELETE FROM users_notifications WHERE timestamp_expires IS NOT NULL AND timestamp_expires < {cutoff}"
    self.db.execute(query)
Delete expired notifications
Pretty simple!