backend.workers.cleanup_tempfiles
Delete old datasets
1""" 2Delete old datasets 3""" 4import shutil 5import re 6import json 7from datetime import datetime 8from pathlib import Path 9 10from backend.lib.worker import BasicWorker 11from common.lib.dataset import DataSet 12from common.lib.exceptions import WorkerInterruptedException, DataSetException 13 14 15class TempFileCleaner(BasicWorker): 16 """ 17 Clean up discarded temporary files 18 19 If 4CAT crashes while processing something, it may result in staging 20 folders that are never cleaned up. This worker checks for finished 21 datasets with staging area folders and cleans them up. 22 23 Also cleans up orphaned result files for datasets that no longer exist. 24 """ 25 type = "clean-temp-files" 26 max_workers = 1 27 28 ensure_job = {"remote_id": "localhost", "interval": 10800} 29 30 # Use tracking file to delay deletion of files that may still be in use 31 days_to_keep = 7 32 33 def work(self): 34 """ 35 Go through result files, and for each one check if it should still 36 exist 37 :return: 38 """ 39 # Load tracking file 40 tracking_file = self.config.get('PATH_DATA').joinpath(".temp_file_cleaner") 41 if not tracking_file.exists(): 42 tracked_files = {} 43 else: 44 tracked_files = json.loads(tracking_file.read_text()) 45 46 result_files = Path(self.config.get('PATH_DATA')).glob("*") 47 for file in result_files: 48 if file.stem.startswith("."): 49 # skip hidden files 50 continue 51 52 if self.interrupted: 53 tracking_file.write_text(json.dumps(tracked_files)) 54 raise WorkerInterruptedException("Interrupted while cleaning up orphaned result files") 55 56 # the key of the dataset files belong to can be extracted from the 57 # file name in a predictable way. 58 possible_keys = re.findall(r"[abcdef0-9]{32}", file.stem) 59 if not possible_keys: 60 self.log.warning("File %s does not seem to be a result file - clean up manually" % file) 61 continue 62 63 # if for whatever reason there are multiple hashes in the filename, 64 # the key would always be the last one 65 key = possible_keys.pop() 66 67 try: 68 dataset = DataSet(key=key, db=self.db) 69 except DataSetException: 70 # the dataset has been deleted since, but the result file still 71 # exists - should be safe to clean up 72 if file.name not in tracked_files: 73 self.log.info(f"No matching dataset with key {key} for file {file}; marking for deletion") 74 tracked_files[file.name] = datetime.now().timestamp() + (self.days_to_keep * 86400) 75 elif tracked_files[file.name] < datetime.now().timestamp(): 76 self.log.info(f"File {file} marked for deletion since {datetime.fromtimestamp(tracked_files[file.name]).strftime('%Y-%m-%d %H:%M:%S')}, deleting file") 77 if file.is_dir(): 78 try: 79 shutil.rmtree(file) 80 except PermissionError: 81 self.log.info(f"Folder {file} does not belong to a dataset but cannot be deleted (no " 82 f"permissions), skipping") 83 84 else: 85 try: 86 file.unlink() 87 except FileNotFoundError: 88 # the file has been deleted since 89 pass 90 91 # Remove from tracking 92 del tracked_files[file.name] 93 94 continue 95 96 if file.is_dir() and "-staging" in file.stem and dataset.is_finished(): 97 # staging area exists but dataset is marked as finished 98 # if the dataset is finished, the staging area should have been 99 # compressed into a zip file, or deleted, so this is also safe 100 # to clean up 101 if file.name not in tracked_files: 102 self.log.info("Dataset %s is finished, but staging area remains at %s, marking for deletion" % (dataset.key, str(file))) 103 tracked_files[file.name] = datetime.now().timestamp() + 
(self.days_to_keep * 86400) 104 elif tracked_files[file.name] < datetime.now().timestamp(): 105 self.log.info("Dataset %s is finished, but staging area remains at %s, deleting folder" % (dataset.key, str(file))) 106 shutil.rmtree(file) 107 108 # Update tracked files 109 tracking_file.write_text(json.dumps(tracked_files)) 110 111 self.job.finish()
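As a quick illustration of the key extraction the worker relies on, the snippet below runs the same regular expression over a few made-up filenames. The example names are hypothetical; only the 32-character hexadecimal key pattern and the "last match wins" rule come from the source above.

import re
from pathlib import Path

# Hypothetical filenames; only the 32-hex-character key pattern and the
# "take the last match" rule come from the worker above.
examples = [
    "5f2b8c1d9e4a7b6c3d2e1f0a9b8c7d6e.csv",      # plain result file
    "5f2b8c1d9e4a7b6c3d2e1f0a9b8c7d6e-staging",  # staging folder for the same dataset
    "notes.txt",                                 # no key: flagged for manual cleanup
]

for name in examples:
    stem = Path(name).stem
    possible_keys = re.findall(r"[abcdef0-9]{32}", stem)
    # the worker uses the last match as the dataset key, or warns when there is none
    key = possible_keys[-1] if possible_keys else None
    print(f"{name!r} -> {key or 'no key (clean up manually)'}")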
class TempFileCleaner(BasicWorker)
Clean up discarded temporary files
If 4CAT crashes while processing something, it may result in staging folders that are never cleaned up. This worker checks for finished datasets with staging area folders and cleans them up.
Also cleans up orphaned result files for datasets that no longer exist.
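Deletion is deliberately deferred: a candidate file is first recorded in a hidden tracking file with a deadline days_to_keep days in the future, and it is only removed on a later run once that deadline has passed. The sketch below reproduces that bookkeeping in isolation; the /tmp/4cat-data path and the example folder name are hypothetical, as the real worker uses PATH_DATA/.temp_file_cleaner inside the configured data folder.

import json
from datetime import datetime
from pathlib import Path

GRACE_PERIOD = 7 * 86400  # days_to_keep expressed in seconds, as in the worker

# Hypothetical location; the real worker keeps this file at
# PATH_DATA/.temp_file_cleaner inside the configured data folder.
tracking_file = Path("/tmp/4cat-data/.temp_file_cleaner")
tracking_file.parent.mkdir(parents=True, exist_ok=True)
tracked = json.loads(tracking_file.read_text()) if tracking_file.exists() else {}

name = "5f2b8c1d9e4a7b6c3d2e1f0a9b8c7d6e-staging"  # hypothetical staging folder name
now = datetime.now().timestamp()

if name not in tracked:
    # first sighting: record a deletion deadline instead of deleting right away
    tracked[name] = now + GRACE_PERIOD
elif tracked[name] < now:
    # the deadline has passed on a later run; this is where the worker deletes
    print(f"{name} would be deleted now")
    del tracked[name]

tracking_file.write_text(json.dumps(tracked))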
def work(self):
Go through result files, and for each one check if it should still exist
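The decision the loop makes for each entry in the data folder can be summarised as a small classification helper. This is an editorial sketch, not part of the worker; dataset_exists and dataset_finished are hypothetical stand-ins for the DataSet(key=..., db=...) lookup and dataset.is_finished() call made against the database in the real code.

import re
from pathlib import Path

def classify(file: Path, dataset_exists, dataset_finished) -> str:
    """Summarise the per-file decision made in work(); an editorial sketch.

    dataset_exists and dataset_finished are hypothetical stand-ins for the
    DataSet(key=..., db=...) lookup and dataset.is_finished() call.
    """
    if file.stem.startswith("."):
        return "skip: hidden file"

    keys = re.findall(r"[abcdef0-9]{32}", file.stem)
    if not keys:
        return "warn: not a result file, clean up manually"

    key = keys[-1]
    if not dataset_exists(key):
        return "orphaned: mark for deletion, delete after the grace period"

    if file.is_dir() and "-staging" in file.stem and dataset_finished(key):
        return "stale staging area: mark for deletion, delete after the grace period"

    return "keep: belongs to a live dataset"

# Example call with stand-in lookups that report no matching dataset
print(classify(Path("notes.txt"), lambda key: False, lambda key: False))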