backend.workers.cleanup_tempfiles
Clean up discarded temporary files and orphaned result files
"""
Clean up discarded temporary files and orphaned result files
"""
import shutil
import re
import json
from datetime import datetime
from pathlib import Path

from backend.lib.worker import BasicWorker
from common.lib.dataset import DataSet
from common.lib.exceptions import WorkerInterruptedException, DataSetException


class TempFileCleaner(BasicWorker):
    """
    Clean up discarded temporary files

    If 4CAT crashes while processing something, it may result in staging
    folders that are never cleaned up. This worker checks for finished
    datasets with staging area folders and cleans them up.

    Also cleans up orphaned result files for datasets that no longer exist.
    """
    type = "clean-temp-files"
    max_workers = 1

    # Use tracking file to delay deletion of files that may still be in use
    days_to_keep = 7

    @classmethod
    def ensure_job(cls, config=None):
        """
        Ensure that the temp file cleaner is always running

        This is used to ensure that the temp file cleaner is always running, and
        if it is not, it will be started by the WorkerManager.

        :param config:  Not used by this worker
        :return:  Job parameters for the worker
        """
        return {"remote_id": "localhost", "interval": 10800}

    def work(self):
        """
        Go through result files, and for each one check if it should still
        exist

        Every entry directly inside PATH_DATA whose name contains a
        32-character lowercase hexadecimal dataset key is checked. Two kinds
        of leftovers are cleaned up:

        - files and folders for which no dataset can be loaded anymore;
        - "-staging" folders of datasets that are marked as finished.

        Nothing is deleted the first time it is seen. The file name is first
        stored in the ".temp_file_cleaner" tracking file together with the
        timestamp after which it may be deleted (`days_to_keep` days later);
        a later run deletes it once that timestamp has passed.

        :raises WorkerInterruptedException:  If the worker is interrupted; the
        tracking state is saved before raising.
        :return:
        """
        data_path = Path(self.config.get('PATH_DATA'))
        tracking_file = data_path.joinpath(".temp_file_cleaner")

        # Load tracking file. An unparseable file only means the grace
        # periods start over, so it should not stop the worker.
        tracked_files = {}
        if tracking_file.exists():
            try:
                tracked_files = json.loads(tracking_file.read_text())
            except ValueError:
                self.log.warning(f"Tracking file {tracking_file} could not be parsed, starting with an empty one")
            if not isinstance(tracked_files, dict):
                tracked_files = {}

        # Get 4CAT paths to avoid if they are mapped inside PATH_DATA. Paths
        # that contain PATH_DATA (including PATH_DATA itself) are left out:
        # every result file is relative to those, so keeping them would
        # exclude all files from the clean-up.
        fourcat_paths = []
        for setting in self.config.get_all_setting_names():
            if not setting.startswith('PATH_'):
                continue
            value = self.config.get(setting)
            if not value or not isinstance(value, (str, Path)):
                continue
            fourcat_path = Path(value)
            if not data_path.is_relative_to(fourcat_path):
                fourcat_paths.append(fourcat_path)

        key_pattern = re.compile(r"[abcdef0-9]{32}")
        grace_period = self.days_to_keep * 86400

        for file in data_path.glob("*"):
            if file.stem.startswith("."):
                # skip hidden files
                continue

            if self.interrupted:
                tracking_file.write_text(json.dumps(tracked_files))
                raise WorkerInterruptedException("Interrupted while cleaning up orphaned result files")

            # Check if the file is inside any of the 4CAT paths
            if any(file.is_relative_to(fourcat_path) for fourcat_path in fourcat_paths):
                continue

            # the key of the dataset files belong to can be extracted from the
            # file name in a predictable way.
            possible_keys = key_pattern.findall(file.stem)
            if not possible_keys:
                self.log.warning("File %s does not seem to be a result file - clean up manually" % file)
                continue

            # if for whatever reason there are multiple hashes in the filename,
            # the key would always be the last one
            key = possible_keys.pop()

            try:
                dataset = DataSet(key=key, db=self.db, modules=self.modules)
            except DataSetException:
                dataset = None

            now = datetime.now().timestamp()

            if dataset is None:
                # the dataset has been deleted since, but the result file still
                # exists - should be safe to clean up
                if file.name not in tracked_files:
                    self.log.info(f"No matching dataset with key {key} for file {file}; marking for deletion")
                    tracked_files[file.name] = now + grace_period
                elif tracked_files[file.name] < now:
                    self.log.info(f"File {file} marked for deletion since {datetime.fromtimestamp(tracked_files[file.name]).strftime('%Y-%m-%d %H:%M:%S')}, deleting file")
                    if file.is_dir():
                        try:
                            shutil.rmtree(file)
                        except PermissionError:
                            self.log.info(f"Folder {file} does not belong to a dataset but cannot be deleted (no "
                                          f"permissions), skipping")
                    else:
                        try:
                            file.unlink()
                        except FileNotFoundError:
                            # the file has been deleted since
                            pass

                    # Remove from tracking
                    del tracked_files[file.name]

                continue

            if file.is_dir() and "-staging" in file.stem and dataset.is_finished():
                # staging area exists but dataset is marked as finished
                # if the dataset is finished, the staging area should have been
                # compressed into a zip file, or deleted, so this is also safe
                # to clean up
                if file.name not in tracked_files:
                    self.log.info("Dataset %s is finished, but staging area remains at %s, marking for deletion" % (dataset.key, str(file)))
                    tracked_files[file.name] = now + grace_period
                elif tracked_files[file.name] < now:
                    self.log.info("Dataset %s is finished, but staging area remains at %s, deleting folder" % (dataset.key, str(file)))
                    try:
                        shutil.rmtree(file)
                    except PermissionError:
                        self.log.info(f"Staging area {file} of finished dataset {dataset.key} cannot be deleted "
                                      f"(no permissions), skipping")

                    # Remove from tracking, as for orphaned files; otherwise
                    # the expired entry would stay in the tracking file forever
                    del tracked_files[file.name]

        # Drop entries for files that no longer exist, so the tracking file
        # does not grow and a later file with the same name gets a fresh
        # grace period
        tracked_files = {name: due for name, due in tracked_files.items() if data_path.joinpath(name).exists()}

        # Update tracked files
        tracking_file.write_text(json.dumps(tracked_files))

        self.job.finish()
class TempFileCleaner(BasicWorker):
    """
    Clean up discarded temporary files

    If 4CAT crashes while processing something, it may result in staging
    folders that are never cleaned up. This worker checks for finished
    datasets with staging area folders and cleans them up.

    Also cleans up orphaned result files for datasets that no longer exist.
    """
    type = "clean-temp-files"
    max_workers = 1

    # Use tracking file to delay deletion of files that may still be in use
    days_to_keep = 7

    @classmethod
    def ensure_job(cls, config=None):
        """
        Ensure that the temp file cleaner is always running

        This is used to ensure that the temp file cleaner is always running, and
        if it is not, it will be started by the WorkerManager.

        :param config:  Not used by this worker
        :return:  Job parameters for the worker
        """
        return {"remote_id": "localhost", "interval": 10800}

    def work(self):
        """
        Go through result files, and for each one check if it should still
        exist

        Every entry directly inside PATH_DATA whose name contains a
        32-character lowercase hexadecimal dataset key is checked. Two kinds
        of leftovers are cleaned up:

        - files and folders for which no dataset can be loaded anymore;
        - "-staging" folders of datasets that are marked as finished.

        Nothing is deleted the first time it is seen. The file name is first
        stored in the ".temp_file_cleaner" tracking file together with the
        timestamp after which it may be deleted (`days_to_keep` days later);
        a later run deletes it once that timestamp has passed.

        :raises WorkerInterruptedException:  If the worker is interrupted; the
        tracking state is saved before raising.
        :return:
        """
        data_path = Path(self.config.get('PATH_DATA'))
        tracking_file = data_path.joinpath(".temp_file_cleaner")

        # Load tracking file. An unparseable file only means the grace
        # periods start over, so it should not stop the worker.
        tracked_files = {}
        if tracking_file.exists():
            try:
                tracked_files = json.loads(tracking_file.read_text())
            except ValueError:
                self.log.warning(f"Tracking file {tracking_file} could not be parsed, starting with an empty one")
            if not isinstance(tracked_files, dict):
                tracked_files = {}

        # Get 4CAT paths to avoid if they are mapped inside PATH_DATA. Paths
        # that contain PATH_DATA (including PATH_DATA itself) are left out:
        # every result file is relative to those, so keeping them would
        # exclude all files from the clean-up.
        fourcat_paths = []
        for setting in self.config.get_all_setting_names():
            if not setting.startswith('PATH_'):
                continue
            value = self.config.get(setting)
            if not value or not isinstance(value, (str, Path)):
                continue
            fourcat_path = Path(value)
            if not data_path.is_relative_to(fourcat_path):
                fourcat_paths.append(fourcat_path)

        key_pattern = re.compile(r"[abcdef0-9]{32}")
        grace_period = self.days_to_keep * 86400

        for file in data_path.glob("*"):
            if file.stem.startswith("."):
                # skip hidden files
                continue

            if self.interrupted:
                tracking_file.write_text(json.dumps(tracked_files))
                raise WorkerInterruptedException("Interrupted while cleaning up orphaned result files")

            # Check if the file is inside any of the 4CAT paths
            if any(file.is_relative_to(fourcat_path) for fourcat_path in fourcat_paths):
                continue

            # the key of the dataset files belong to can be extracted from the
            # file name in a predictable way.
            possible_keys = key_pattern.findall(file.stem)
            if not possible_keys:
                self.log.warning("File %s does not seem to be a result file - clean up manually" % file)
                continue

            # if for whatever reason there are multiple hashes in the filename,
            # the key would always be the last one
            key = possible_keys.pop()

            try:
                dataset = DataSet(key=key, db=self.db, modules=self.modules)
            except DataSetException:
                dataset = None

            now = datetime.now().timestamp()

            if dataset is None:
                # the dataset has been deleted since, but the result file still
                # exists - should be safe to clean up
                if file.name not in tracked_files:
                    self.log.info(f"No matching dataset with key {key} for file {file}; marking for deletion")
                    tracked_files[file.name] = now + grace_period
                elif tracked_files[file.name] < now:
                    self.log.info(f"File {file} marked for deletion since {datetime.fromtimestamp(tracked_files[file.name]).strftime('%Y-%m-%d %H:%M:%S')}, deleting file")
                    if file.is_dir():
                        try:
                            shutil.rmtree(file)
                        except PermissionError:
                            self.log.info(f"Folder {file} does not belong to a dataset but cannot be deleted (no "
                                          f"permissions), skipping")
                    else:
                        try:
                            file.unlink()
                        except FileNotFoundError:
                            # the file has been deleted since
                            pass

                    # Remove from tracking
                    del tracked_files[file.name]

                continue

            if file.is_dir() and "-staging" in file.stem and dataset.is_finished():
                # staging area exists but dataset is marked as finished
                # if the dataset is finished, the staging area should have been
                # compressed into a zip file, or deleted, so this is also safe
                # to clean up
                if file.name not in tracked_files:
                    self.log.info("Dataset %s is finished, but staging area remains at %s, marking for deletion" % (dataset.key, str(file)))
                    tracked_files[file.name] = now + grace_period
                elif tracked_files[file.name] < now:
                    self.log.info("Dataset %s is finished, but staging area remains at %s, deleting folder" % (dataset.key, str(file)))
                    try:
                        shutil.rmtree(file)
                    except PermissionError:
                        self.log.info(f"Staging area {file} of finished dataset {dataset.key} cannot be deleted "
                                      f"(no permissions), skipping")

                    # Remove from tracking, as for orphaned files; otherwise
                    # the expired entry would stay in the tracking file forever
                    del tracked_files[file.name]

        # Drop entries for files that no longer exist, so the tracking file
        # does not grow and a later file with the same name gets a fresh
        # grace period
        tracked_files = {name: due for name, due in tracked_files.items() if data_path.joinpath(name).exists()}

        # Update tracked files
        tracking_file.write_text(json.dumps(tracked_files))

        self.job.finish()
Clean up discarded temporary files
If 4CAT crashes while processing something, it may result in staging folders that are never cleaned up. This worker checks for finished datasets with staging area folders and cleans them up.
Also cleans up orphaned result files for datasets that no longer exist.
@classmethod
def
ensure_job(cls, config=None):
@classmethod
def ensure_job(cls, config=None):
    """
    Provide the parameters of the job that keeps this worker scheduled

    The WorkerManager uses these to make sure the temp file cleaner is
    always running; if it is not, the WorkerManager starts it.

    :param config:  Not used by this worker
    :return:  Job parameters for the worker
    """
    job_parameters = dict(remote_id="localhost", interval=10800)
    return job_parameters
Ensure that the temp file cleaner is always running
The WorkerManager uses this to make sure the temp file cleaner is always running; if it is not, the WorkerManager starts it.
Returns
Job parameters for the worker
def
work(self):
def work(self):
    """
    Go through result files, and for each one check if it should still
    exist

    Every entry directly inside PATH_DATA whose name contains a
    32-character lowercase hexadecimal dataset key is checked. Two kinds
    of leftovers are cleaned up:

    - files and folders for which no dataset can be loaded anymore;
    - "-staging" folders of datasets that are marked as finished.

    Nothing is deleted the first time it is seen. The file name is first
    stored in the ".temp_file_cleaner" tracking file together with the
    timestamp after which it may be deleted (`days_to_keep` days later);
    a later run deletes it once that timestamp has passed.

    :raises WorkerInterruptedException:  If the worker is interrupted; the
    tracking state is saved before raising.
    :return:
    """
    data_path = Path(self.config.get('PATH_DATA'))
    tracking_file = data_path.joinpath(".temp_file_cleaner")

    # Load tracking file. An unparseable file only means the grace
    # periods start over, so it should not stop the worker.
    tracked_files = {}
    if tracking_file.exists():
        try:
            tracked_files = json.loads(tracking_file.read_text())
        except ValueError:
            self.log.warning(f"Tracking file {tracking_file} could not be parsed, starting with an empty one")
        if not isinstance(tracked_files, dict):
            tracked_files = {}

    # Get 4CAT paths to avoid if they are mapped inside PATH_DATA. Paths
    # that contain PATH_DATA (including PATH_DATA itself) are left out:
    # every result file is relative to those, so keeping them would
    # exclude all files from the clean-up.
    fourcat_paths = []
    for setting in self.config.get_all_setting_names():
        if not setting.startswith('PATH_'):
            continue
        value = self.config.get(setting)
        if not value or not isinstance(value, (str, Path)):
            continue
        fourcat_path = Path(value)
        if not data_path.is_relative_to(fourcat_path):
            fourcat_paths.append(fourcat_path)

    key_pattern = re.compile(r"[abcdef0-9]{32}")
    grace_period = self.days_to_keep * 86400

    for file in data_path.glob("*"):
        if file.stem.startswith("."):
            # skip hidden files
            continue

        if self.interrupted:
            tracking_file.write_text(json.dumps(tracked_files))
            raise WorkerInterruptedException("Interrupted while cleaning up orphaned result files")

        # Check if the file is inside any of the 4CAT paths
        if any(file.is_relative_to(fourcat_path) for fourcat_path in fourcat_paths):
            continue

        # the key of the dataset files belong to can be extracted from the
        # file name in a predictable way.
        possible_keys = key_pattern.findall(file.stem)
        if not possible_keys:
            self.log.warning("File %s does not seem to be a result file - clean up manually" % file)
            continue

        # if for whatever reason there are multiple hashes in the filename,
        # the key would always be the last one
        key = possible_keys.pop()

        try:
            dataset = DataSet(key=key, db=self.db, modules=self.modules)
        except DataSetException:
            dataset = None

        now = datetime.now().timestamp()

        if dataset is None:
            # the dataset has been deleted since, but the result file still
            # exists - should be safe to clean up
            if file.name not in tracked_files:
                self.log.info(f"No matching dataset with key {key} for file {file}; marking for deletion")
                tracked_files[file.name] = now + grace_period
            elif tracked_files[file.name] < now:
                self.log.info(f"File {file} marked for deletion since {datetime.fromtimestamp(tracked_files[file.name]).strftime('%Y-%m-%d %H:%M:%S')}, deleting file")
                if file.is_dir():
                    try:
                        shutil.rmtree(file)
                    except PermissionError:
                        self.log.info(f"Folder {file} does not belong to a dataset but cannot be deleted (no "
                                      f"permissions), skipping")
                else:
                    try:
                        file.unlink()
                    except FileNotFoundError:
                        # the file has been deleted since
                        pass

                # Remove from tracking
                del tracked_files[file.name]

            continue

        if file.is_dir() and "-staging" in file.stem and dataset.is_finished():
            # staging area exists but dataset is marked as finished
            # if the dataset is finished, the staging area should have been
            # compressed into a zip file, or deleted, so this is also safe
            # to clean up
            if file.name not in tracked_files:
                self.log.info("Dataset %s is finished, but staging area remains at %s, marking for deletion" % (dataset.key, str(file)))
                tracked_files[file.name] = now + grace_period
            elif tracked_files[file.name] < now:
                self.log.info("Dataset %s is finished, but staging area remains at %s, deleting folder" % (dataset.key, str(file)))
                try:
                    shutil.rmtree(file)
                except PermissionError:
                    self.log.info(f"Staging area {file} of finished dataset {dataset.key} cannot be deleted "
                                  f"(no permissions), skipping")

                # Remove from tracking, as for orphaned files; otherwise
                # the expired entry would stay in the tracking file forever
                del tracked_files[file.name]

    # Drop entries for files that no longer exist, so the tracking file
    # does not grow and a later file with the same name gets a fresh
    # grace period
    tracked_files = {name: due for name, due in tracked_files.items() if data_path.joinpath(name).exists()}

    # Update tracked files
    tracking_file.write_text(json.dumps(tracked_files))

    self.job.finish()
Go through result files, and for each one check if it should still exist