Edit on GitHub

backend.workers.cleanup_tempfiles

Clean up discarded temporary files and orphaned result files

  1"""
  2Clean up discarded temporary files and orphaned result files
  3"""
  4import shutil
  5import re
  6import json
  7from datetime import datetime
  8from pathlib import Path
  9
 10from backend.lib.worker import BasicWorker
 11from common.lib.dataset import DataSet
 12from common.lib.exceptions import WorkerInterruptedException, DataSetException
 13
 14
 15class TempFileCleaner(BasicWorker):
 16    """
 17    Clean up discarded temporary files
 18
 19    If 4CAT crashes while processing something, it may result in staging
 20    folders that are never cleaned up. This worker checks for finished
 21    datasets with staging area folders and cleans them up.
 22
 23    Also cleans up orphaned result files for datasets that no longer exist.
 24    """
 25    type = "clean-temp-files"
 26    max_workers = 1
 27
 28    # Use tracking file to delay deletion of files that may still be in use
 29    days_to_keep = 7
 30
 31    @classmethod
 32    def ensure_job(cls, config=None):
 33        """
 34        Ensure that the temp file cleaner is always running
 35
 36        This is used to ensure that the temp file cleaner is always running, and
 37        if it is not, it will be started by the WorkerManager.
 38
 39        :return:  Job parameters for the worker
 40        """
 41        return {"remote_id": "localhost", "interval": 10800}
 42
 43    def work(self):
 44        """
 45        Go through result files, and for each one check if it should still
 46        exist
 47        :return:
 48        """
 49        # Load tracking file
 50        tracking_file = self.config.get('PATH_DATA').joinpath(".temp_file_cleaner")
 51        if not tracking_file.exists():
 52            tracked_files = {}
 53        else:
 54            tracked_files = json.loads(tracking_file.read_text())
 55
 56        # Get 4CAT paths to avoid if they are mapped inside PATH_DATA
 57        fourcat_paths = [self.config.get(p) for p in self.config.get_all_setting_names() if p.startswith('PATH_')]
 58
 59        result_files = Path(self.config.get('PATH_DATA')).glob("*")
 60        for file in result_files:
 61            if file.stem.startswith("."):
 62                # skip hidden files
 63                continue
 64
 65            if self.interrupted:
 66                tracking_file.write_text(json.dumps(tracked_files))
 67                raise WorkerInterruptedException("Interrupted while cleaning up orphaned result files")
 68            
 69            # Check if the file is inside any of the 4CAT paths
 70            if any(file == fourcat_path or file.is_relative_to(fourcat_path) for fourcat_path in fourcat_paths):
 71                continue
 72
 73            # the key of the dataset files belong to can be extracted from the
 74            # file name in a predictable way.
 75            possible_keys = re.findall(r"[abcdef0-9]{32}", file.stem)
 76            if not possible_keys:
 77                self.log.warning("File %s does not seem to be a result file - clean up manually" % file)
 78                continue
 79
 80            # if for whatever reason there are multiple hashes in the filename,
 81            # the key would always be the last one
 82            key = possible_keys.pop()
 83
 84            try:
 85                dataset = DataSet(key=key, db=self.db, modules=self.modules)
 86            except DataSetException:
 87                # the dataset has been deleted since, but the result file still
 88                # exists - should be safe to clean up
 89                if file.name not in tracked_files:
 90                    self.log.info(f"No matching dataset with key {key} for file {file}; marking for deletion")
 91                    tracked_files[file.name] = datetime.now().timestamp() + (self.days_to_keep * 86400)
 92                elif tracked_files[file.name] < datetime.now().timestamp():
 93                    self.log.info(f"File {file} marked for deletion since {datetime.fromtimestamp(tracked_files[file.name]).strftime('%Y-%m-%d %H:%M:%S')}, deleting file")
 94                    if file.is_dir():
 95                        try:
 96                            shutil.rmtree(file)
 97                        except PermissionError:
 98                            self.log.info(f"Folder {file} does not belong to a dataset but cannot be deleted (no "
 99                                          f"permissions), skipping")
100
101                    else:
102                        try:
103                            file.unlink()
104                        except FileNotFoundError:
105                            # the file has been deleted since
106                            pass
107
108                    # Remove from tracking
109                    del tracked_files[file.name]
110
111                continue
112
113            if file.is_dir() and "-staging" in file.stem and dataset.is_finished():
114                # staging area exists but dataset is marked as finished
115                # if the dataset is finished, the staging area should have been
116                # compressed into a zip file, or deleted, so this is also safe
117                # to clean up
118                if file.name not in tracked_files:
119                    self.log.info("Dataset %s is finished, but staging area remains at %s, marking for deletion" % (dataset.key, str(file)))
120                    tracked_files[file.name] = datetime.now().timestamp() + (self.days_to_keep * 86400)
121                elif tracked_files[file.name] < datetime.now().timestamp():
122                    self.log.info("Dataset %s is finished, but staging area remains at %s, deleting folder" % (dataset.key, str(file)))
123                    shutil.rmtree(file)
124
125        # Update tracked files
126        tracking_file.write_text(json.dumps(tracked_files))
127
128        self.job.finish()
class TempFileCleaner(backend.lib.worker.BasicWorker):
 16class TempFileCleaner(BasicWorker):
 17    """
 18    Clean up discarded temporary files
 19
 20    If 4CAT crashes while processing something, it may result in staging
 21    folders that are never cleaned up. This worker checks for finished
 22    datasets with staging area folders and cleans them up.
 23
 24    Also cleans up orphaned result files for datasets that no longer exist.
 25    """
 26    type = "clean-temp-files"
 27    max_workers = 1
 28
 29    # Use tracking file to delay deletion of files that may still be in use
 30    days_to_keep = 7
 31
 32    @classmethod
 33    def ensure_job(cls, config=None):
 34        """
 35        Ensure that the temp file cleaner is always running
 36
 37        This is used to ensure that the temp file cleaner is always running, and
 38        if it is not, it will be started by the WorkerManager.
 39
 40        :return:  Job parameters for the worker
 41        """
 42        return {"remote_id": "localhost", "interval": 10800}
 43
 44    def work(self):
 45        """
 46        Go through result files, and for each one check if it should still
 47        exist
 48        :return:
 49        """
 50        # Load tracking file
 51        tracking_file = self.config.get('PATH_DATA').joinpath(".temp_file_cleaner")
 52        if not tracking_file.exists():
 53            tracked_files = {}
 54        else:
 55            tracked_files = json.loads(tracking_file.read_text())
 56
 57        # Get 4CAT paths to avoid if they are mapped inside PATH_DATA
 58        fourcat_paths = [self.config.get(p) for p in self.config.get_all_setting_names() if p.startswith('PATH_')]
 59
 60        result_files = Path(self.config.get('PATH_DATA')).glob("*")
 61        for file in result_files:
 62            if file.stem.startswith("."):
 63                # skip hidden files
 64                continue
 65
 66            if self.interrupted:
 67                tracking_file.write_text(json.dumps(tracked_files))
 68                raise WorkerInterruptedException("Interrupted while cleaning up orphaned result files")
 69            
 70            # Check if the file is inside any of the 4CAT paths
 71            if any(file == fourcat_path or file.is_relative_to(fourcat_path) for fourcat_path in fourcat_paths):
 72                continue
 73
 74            # the key of the dataset files belong to can be extracted from the
 75            # file name in a predictable way.
 76            possible_keys = re.findall(r"[abcdef0-9]{32}", file.stem)
 77            if not possible_keys:
 78                self.log.warning("File %s does not seem to be a result file - clean up manually" % file)
 79                continue
 80
 81            # if for whatever reason there are multiple hashes in the filename,
 82            # the key would always be the last one
 83            key = possible_keys.pop()
 84
 85            try:
 86                dataset = DataSet(key=key, db=self.db, modules=self.modules)
 87            except DataSetException:
 88                # the dataset has been deleted since, but the result file still
 89                # exists - should be safe to clean up
 90                if file.name not in tracked_files:
 91                    self.log.info(f"No matching dataset with key {key} for file {file}; marking for deletion")
 92                    tracked_files[file.name] = datetime.now().timestamp() + (self.days_to_keep * 86400)
 93                elif tracked_files[file.name] < datetime.now().timestamp():
 94                    self.log.info(f"File {file} marked for deletion since {datetime.fromtimestamp(tracked_files[file.name]).strftime('%Y-%m-%d %H:%M:%S')}, deleting file")
 95                    if file.is_dir():
 96                        try:
 97                            shutil.rmtree(file)
 98                        except PermissionError:
 99                            self.log.info(f"Folder {file} does not belong to a dataset but cannot be deleted (no "
100                                          f"permissions), skipping")
101
102                    else:
103                        try:
104                            file.unlink()
105                        except FileNotFoundError:
106                            # the file has been deleted since
107                            pass
108
109                    # Remove from tracking
110                    del tracked_files[file.name]
111
112                continue
113
114            if file.is_dir() and "-staging" in file.stem and dataset.is_finished():
115                # staging area exists but dataset is marked as finished
116                # if the dataset is finished, the staging area should have been
117                # compressed into a zip file, or deleted, so this is also safe
118                # to clean up
119                if file.name not in tracked_files:
120                    self.log.info("Dataset %s is finished, but staging area remains at %s, marking for deletion" % (dataset.key, str(file)))
121                    tracked_files[file.name] = datetime.now().timestamp() + (self.days_to_keep * 86400)
122                elif tracked_files[file.name] < datetime.now().timestamp():
123                    self.log.info("Dataset %s is finished, but staging area remains at %s, deleting folder" % (dataset.key, str(file)))
124                    shutil.rmtree(file)
125
126        # Update tracked files
127        tracking_file.write_text(json.dumps(tracked_files))
128
129        self.job.finish()

Clean up discarded temporary files

If 4CAT crashes while processing something, it may result in staging folders that are never cleaned up. This worker checks for finished datasets with staging area folders and cleans them up.

Also cleans up orphaned result files for datasets that no longer exist.

type = 'clean-temp-files'
max_workers = 1
days_to_keep = 7
@classmethod
def ensure_job(cls, config=None):
32    @classmethod
33    def ensure_job(cls, config=None):
34        """
35        Ensure that the temp file cleaner is always running
36
37        This is used to ensure that the temp file cleaner is always running, and
38        if it is not, it will be started by the WorkerManager.
39
40        :return:  Job parameters for the worker
41        """
42        return {"remote_id": "localhost", "interval": 10800}

Ensure that the temp file cleaner is always running

This is used to ensure that the temp file cleaner is always running, and if it is not, it will be started by the WorkerManager.

Returns

Job parameters for the worker

def work(self):
 44    def work(self):
 45        """
 46        Go through result files, and for each one check if it should still
 47        exist
 48        :return:
 49        """
 50        # Load tracking file
 51        tracking_file = self.config.get('PATH_DATA').joinpath(".temp_file_cleaner")
 52        if not tracking_file.exists():
 53            tracked_files = {}
 54        else:
 55            tracked_files = json.loads(tracking_file.read_text())
 56
 57        # Get 4CAT paths to avoid if they are mapped inside PATH_DATA
 58        fourcat_paths = [self.config.get(p) for p in self.config.get_all_setting_names() if p.startswith('PATH_')]
 59
 60        result_files = Path(self.config.get('PATH_DATA')).glob("*")
 61        for file in result_files:
 62            if file.stem.startswith("."):
 63                # skip hidden files
 64                continue
 65
 66            if self.interrupted:
 67                tracking_file.write_text(json.dumps(tracked_files))
 68                raise WorkerInterruptedException("Interrupted while cleaning up orphaned result files")
 69            
 70            # Check if the file is inside any of the 4CAT paths
 71            if any(file == fourcat_path or file.is_relative_to(fourcat_path) for fourcat_path in fourcat_paths):
 72                continue
 73
 74            # the key of the dataset files belong to can be extracted from the
 75            # file name in a predictable way.
 76            possible_keys = re.findall(r"[abcdef0-9]{32}", file.stem)
 77            if not possible_keys:
 78                self.log.warning("File %s does not seem to be a result file - clean up manually" % file)
 79                continue
 80
 81            # if for whatever reason there are multiple hashes in the filename,
 82            # the key would always be the last one
 83            key = possible_keys.pop()
 84
 85            try:
 86                dataset = DataSet(key=key, db=self.db, modules=self.modules)
 87            except DataSetException:
 88                # the dataset has been deleted since, but the result file still
 89                # exists - should be safe to clean up
 90                if file.name not in tracked_files:
 91                    self.log.info(f"No matching dataset with key {key} for file {file}; marking for deletion")
 92                    tracked_files[file.name] = datetime.now().timestamp() + (self.days_to_keep * 86400)
 93                elif tracked_files[file.name] < datetime.now().timestamp():
 94                    self.log.info(f"File {file} marked for deletion since {datetime.fromtimestamp(tracked_files[file.name]).strftime('%Y-%m-%d %H:%M:%S')}, deleting file")
 95                    if file.is_dir():
 96                        try:
 97                            shutil.rmtree(file)
 98                        except PermissionError:
 99                            self.log.info(f"Folder {file} does not belong to a dataset but cannot be deleted (no "
100                                          f"permissions), skipping")
101
102                    else:
103                        try:
104                            file.unlink()
105                        except FileNotFoundError:
106                            # the file has been deleted since
107                            pass
108
109                    # Remove from tracking
110                    del tracked_files[file.name]
111
112                continue
113
114            if file.is_dir() and "-staging" in file.stem and dataset.is_finished():
115                # staging area exists but dataset is marked as finished
116                # if the dataset is finished, the staging area should have been
117                # compressed into a zip file, or deleted, so this is also safe
118                # to clean up
119                if file.name not in tracked_files:
120                    self.log.info("Dataset %s is finished, but staging area remains at %s, marking for deletion" % (dataset.key, str(file)))
121                    tracked_files[file.name] = datetime.now().timestamp() + (self.days_to_keep * 86400)
122                elif tracked_files[file.name] < datetime.now().timestamp():
123                    self.log.info("Dataset %s is finished, but staging area remains at %s, deleting folder" % (dataset.key, str(file)))
124                    shutil.rmtree(file)
125
126        # Update tracked files
127        tracking_file.write_text(json.dumps(tracked_files))
128
129        self.job.finish()

Go through result files, and for each one check if it should still exist

Returns