backend.workers.cleanup_tempfiles

Delete old datasets

"""
Delete old datasets
"""
import shutil
import re
import json
from datetime import datetime
from pathlib import Path

from common.config_manager import config
from backend.lib.worker import BasicWorker
from common.lib.dataset import DataSet
from common.lib.exceptions import WorkerInterruptedException, DataSetException


class TempFileCleaner(BasicWorker):
    """
    Clean up discarded temporary files

    If 4CAT crashes while processing something, it may result in staging
    folders that are never cleaned up. This worker checks for finished
    datasets with staging area folders and cleans them up.

    Also cleans up orphaned result files for datasets that no longer exist.
    """
    type = "clean-temp-files"
    max_workers = 1

    ensure_job = {"remote_id": "localhost", "interval": 10800}

    # Use tracking file to delay deletion of files that may still be in use
    tracking_file = config.get('PATH_DATA').joinpath(".temp_file_cleaner")
    days_to_keep = 7

    def work(self):
        """
        Go through result files, and for each one check if it should still
        exist
        :return:
        """
        # Load tracking file
        if not self.tracking_file.exists():
            tracked_files = {}
        else:
            tracked_files = json.loads(self.tracking_file.read_text())

        result_files = Path(config.get('PATH_DATA')).glob("*")
        for file in result_files:
            if file.stem.startswith("."):
                # skip hidden files
                continue

            if self.interrupted:
                self.tracking_file.write_text(json.dumps(tracked_files))
                raise WorkerInterruptedException("Interrupted while cleaning up orphaned result files")

            # the key of the dataset a file belongs to can be extracted from
            # the file name in a predictable way
            possible_keys = re.findall(r"[abcdef0-9]{32}", file.stem)
            if not possible_keys:
                self.log.warning("File %s does not seem to be a result file - clean up manually" % file)
                continue

            # if for whatever reason there are multiple hashes in the filename,
            # the key is always the last one
            key = possible_keys.pop()

            try:
                dataset = DataSet(key=key, db=self.db)
            except DataSetException:
                # the dataset has been deleted since, but the result file still
                # exists - should be safe to clean up
                if file.name not in tracked_files:
                    self.log.info(f"No matching dataset with key {key} for file {file}; marking for deletion")
                    tracked_files[file.name] = datetime.now().timestamp() + (self.days_to_keep * 86400)
                elif tracked_files[file.name] < datetime.now().timestamp():
                    self.log.info(f"File {file} marked for deletion since {datetime.fromtimestamp(tracked_files[file.name]).strftime('%Y-%m-%d %H:%M:%S')}, deleting file")
                    if file.is_dir():
                        try:
                            shutil.rmtree(file)
                        except PermissionError:
                            self.log.info(f"Folder {file} does not belong to a dataset but cannot be deleted (no "
                                          f"permissions), skipping")

                    else:
                        try:
                            file.unlink()
                        except FileNotFoundError:
                            # the file has been deleted since
                            pass

                    # Remove from tracking
                    del tracked_files[file.name]

                continue

            if file.is_dir() and "-staging" in file.stem and dataset.is_finished():
                # staging area exists but dataset is marked as finished
                # if the dataset is finished, the staging area should have been
                # compressed into a zip file, or deleted, so this is also safe
                # to clean up
                self.log.info("Dataset %s is finished, but staging area remains at %s, deleting folder" % (
                    dataset.key, str(file)))
                shutil.rmtree(file)

        # Update tracked files
        self.tracking_file.write_text(json.dumps(tracked_files))

        self.job.finish()
class TempFileCleaner(backend.lib.worker.BasicWorker):

Clean up discarded temporary files

If 4CAT crashes while processing something, it may result in staging folders that are never cleaned up. This worker checks for finished datasets with staging area folders and cleans them up.

Also cleans up orphaned result files for datasets that no longer exist.

type = 'clean-temp-files'
max_workers = 1
ensure_job = {'remote_id': 'localhost', 'interval': 10800}
tracking_file = PosixPath('data/.temp_file_cleaner')
days_to_keep = 7
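
ensure_job asks the backend to keep a recurring job for this worker on the local machine, re-run every 10800 seconds (every three hours), while tracking_file and days_to_keep implement the deletion grace period: an orphaned file is first recorded with a timestamp seven days in the future and is only deleted on a later run once that timestamp has passed. A minimal sketch of that bookkeeping, using a hypothetical tracking-file path and file name:

import json
from datetime import datetime
from pathlib import Path

DAYS_TO_KEEP = 7  # mirrors TempFileCleaner.days_to_keep
tracking_file = Path("/tmp/.temp_file_cleaner")  # hypothetical stand-in for data/.temp_file_cleaner

# the tracking file is a JSON object mapping orphaned file names to the
# timestamp after which they may actually be deleted
tracked = json.loads(tracking_file.read_text()) if tracking_file.exists() else {}

orphan = "5d41402abc4b2a76b9719d911017c592.csv"  # hypothetical orphaned result file
now = datetime.now().timestamp()

if orphan not in tracked:
    # first run that notices the orphan: start the seven-day grace period
    tracked[orphan] = now + DAYS_TO_KEEP * 86400
elif tracked[orphan] < now:
    # a later run, after the grace period: the real worker deletes the file
    # here and drops it from the tracking data
    del tracked[orphan]

tracking_file.write_text(json.dumps(tracked))

Delaying deletion this way matches the comment in the source: a file without a matching dataset may still be in use, so it is only removed after it has stayed orphaned for seven days.
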
def work(self):

Go through result files, and for each one check if it should still exist

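Besides the orphaned-file bookkeeping, work() handles one more case that does not involve the tracking file: a "-staging" directory whose dataset still exists and is already finished. Since a finished dataset's staging area should have been compressed into a zip file or deleted, the leftover folder is removed immediately. A reduced sketch of that branch, with a hypothetical folder and a boolean standing in for DataSet.is_finished(), which in the real worker needs a database connection:

import shutil
from pathlib import Path

def clean_stale_staging(folder: Path, dataset_is_finished: bool) -> bool:
    """Remove a leftover staging folder once its dataset has finished."""
    if folder.is_dir() and "-staging" in folder.stem and dataset_is_finished:
        shutil.rmtree(folder)
        return True
    return False

# hypothetical usage
staging = Path("/tmp/5d41402abc4b2a76b9719d911017c592-staging")
staging.mkdir(exist_ok=True)
print(clean_stale_staging(staging, dataset_is_finished=True))  # True: folder was removed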