backend.workers.cleanup_tempfiles
Delete old datasets
1""" 2Delete old datasets 3""" 4import shutil 5import re 6import json 7from datetime import datetime 8from pathlib import Path 9 10from common.config_manager import config 11from backend.lib.worker import BasicWorker 12from common.lib.dataset import DataSet 13from common.lib.exceptions import WorkerInterruptedException, DataSetException 14 15 16class TempFileCleaner(BasicWorker): 17 """ 18 Clean up discarded temporary files 19 20 If 4CAT crashes while processing something, it may result in staging 21 folders that are never cleaned up. This worker checks for finished 22 datasets with staging area folders and cleans them up. 23 24 Also cleans up orphaned result files for datasets that no longer exist. 25 """ 26 type = "clean-temp-files" 27 max_workers = 1 28 29 ensure_job = {"remote_id": "localhost", "interval": 10800} 30 31 # Use tracking file to delay deletion of files that may still be in use 32 tracking_file = config.get('PATH_DATA').joinpath(".temp_file_cleaner") 33 days_to_keep = 7 34 35 def work(self): 36 """ 37 Go through result files, and for each one check if it should still 38 exist 39 :return: 40 """ 41 # Load tracking file 42 if not self.tracking_file.exists(): 43 tracked_files = {} 44 else: 45 tracked_files = json.loads(self.tracking_file.read_text()) 46 47 result_files = Path(config.get('PATH_DATA')).glob("*") 48 for file in result_files: 49 if file.stem.startswith("."): 50 # skip hidden files 51 continue 52 53 if self.interrupted: 54 self.tracking_file.write_text(json.dumps(tracked_files)) 55 raise WorkerInterruptedException("Interrupted while cleaning up orphaned result files") 56 57 # the key of the dataset files belong to can be extracted from the 58 # file name in a predictable way. 59 possible_keys = re.findall(r"[abcdef0-9]{32}", file.stem) 60 if not possible_keys: 61 self.log.warning("File %s does not seem to be a result file - clean up manually" % file) 62 continue 63 64 # if for whatever reason there are multiple hashes in the filename, 65 # the key would always be the last one 66 key = possible_keys.pop() 67 68 try: 69 dataset = DataSet(key=key, db=self.db) 70 except DataSetException: 71 # the dataset has been deleted since, but the result file still 72 # exists - should be safe to clean up 73 if file.name not in tracked_files: 74 self.log.info(f"No matching dataset with key {key} for file {file}; marking for deletion") 75 tracked_files[file.name] = datetime.now().timestamp() + (self.days_to_keep * 86400) 76 elif tracked_files[file.name] < datetime.now().timestamp(): 77 self.log.info(f"File {file} marked for deletion since {datetime.fromtimestamp(tracked_files[file.name]).strftime('%Y-%m-%d %H:%M:%S')}, deleting file") 78 if file.is_dir(): 79 try: 80 shutil.rmtree(file) 81 except PermissionError: 82 self.log.info(f"Folder {file} does not belong to a dataset but cannot be deleted (no " 83 f"permissions), skipping") 84 85 else: 86 try: 87 file.unlink() 88 except FileNotFoundError: 89 # the file has been deleted since 90 pass 91 92 # Remove from tracking 93 del tracked_files[file.name] 94 95 continue 96 97 if file.is_dir() and "-staging" in file.stem and dataset.is_finished(): 98 # staging area exists but dataset is marked as finished 99 # if the dataset is finished, the staging area should have been 100 # compressed into a zip file, or deleted, so this is also safe 101 # to clean up 102 self.log.info("Dataset %s is finished, but staging area remains at %s, deleting folder" % ( 103 dataset.key, str(file))) 104 shutil.rmtree(file) 105 106 # Update tracked files 107 
self.tracking_file.write_text(json.dumps(tracked_files)) 108 109 self.job.finish()
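For context, the .temp_file_cleaner tracking file written by this worker is a plain JSON object mapping a file name to the Unix timestamp after which that file may be deleted. A minimal sketch of inspecting such a file, assuming an illustrative data path (in 4CAT it lives under config.get('PATH_DATA')):

import json
from datetime import datetime
from pathlib import Path

# Illustrative path; in 4CAT this is config.get('PATH_DATA').joinpath(".temp_file_cleaner")
tracking_file = Path("/opt/4cat/data/.temp_file_cleaner")

if tracking_file.exists():
    tracked_files = json.loads(tracking_file.read_text())
    now = datetime.now().timestamp()
    for name, deadline in tracked_files.items():
        # deadline is a Unix timestamp: first-seen time plus days_to_keep * 86400 seconds
        status = "eligible for deletion" if deadline < now else "still in grace period"
        print(f"{name}: delete after {datetime.fromtimestamp(deadline):%Y-%m-%d %H:%M:%S} ({status})")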
class TempFileCleaner(BasicWorker)
Clean up discarded temporary files
If 4CAT crashes while processing something, it may result in staging folders that are never cleaned up. This worker checks for finished datasets with staging area folders and cleans them up.
Also cleans up orphaned result files for datasets that no longer exist.
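A result file is linked back to its dataset through the 32-character hexadecimal dataset key embedded in the file name. A rough sketch of that extraction step, using made-up file names purely for illustration:

import re
from pathlib import Path

# Hypothetical example file names; real result files sit directly in PATH_DATA
examples = [
    "4cat-dataset-0123456789abcdef0123456789abcdef.csv",
    "0123456789abcdef0123456789abcdef-staging",
    "notes.txt",
]

for name in examples:
    possible_keys = re.findall(r"[abcdef0-9]{32}", Path(name).stem)
    if possible_keys:
        # when several hashes occur in one name, the worker uses the last match
        print(f"{name} -> dataset key {possible_keys[-1]}")
    else:
        print(f"{name} -> no key found, flagged for manual cleanup")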
def work(self)
Go through result files, and for each one check if it should still exist
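A note on the timing involved: ensure_job reschedules the cleaner roughly every 10800 seconds, and an orphaned file only becomes eligible for deletion days_to_keep days after it is first noticed. A small illustration of that arithmetic:

from datetime import datetime

interval = 10800      # ensure_job interval in seconds: the cleaner runs about every 3 hours
days_to_keep = 7      # grace period before an orphaned file is actually removed

first_seen = datetime.now()
deadline = first_seen.timestamp() + days_to_keep * 86400  # same arithmetic as the worker

print(f"cleaner wakes up every {interval / 3600:.0f} hours")
print(f"a file first noticed at {first_seen:%Y-%m-%d %H:%M} may be deleted after "
      f"{datetime.fromtimestamp(deadline):%Y-%m-%d %H:%M}")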