backend.workers.cleanup_tempfiles

Delete old datasets

  1"""
  2Delete old datasets
  3"""
  4import shutil
  5import re
  6import json
  7from datetime import datetime
  8from pathlib import Path
  9
 10from backend.lib.worker import BasicWorker
 11from common.lib.dataset import DataSet
 12from common.lib.exceptions import WorkerInterruptedException, DataSetException
 13
 14
 15class TempFileCleaner(BasicWorker):
 16    """
 17    Clean up discarded temporary files
 18
 19    If 4CAT crashes while processing something, it may result in staging
 20    folders that are never cleaned up. This worker checks for finished
 21    datasets with staging area folders and cleans them up.
 22
 23    Also cleans up orphaned result files for datasets that no longer exist.
 24    """
 25    type = "clean-temp-files"
 26    max_workers = 1
 27
 28    ensure_job = {"remote_id": "localhost", "interval": 10800}
 29
 30    # Use tracking file to delay deletion of files that may still be in use
 31    days_to_keep = 7
 32
 33    def work(self):
 34        """
 35        Go through result files, and for each one check if it should still
 36        exist
 37        :return:
 38        """
 39        # Load tracking file
 40        tracking_file = self.config.get('PATH_DATA').joinpath(".temp_file_cleaner")
 41        if not tracking_file.exists():
 42            tracked_files = {}
 43        else:
 44            tracked_files = json.loads(tracking_file.read_text())
 45
 46        result_files = Path(self.config.get('PATH_DATA')).glob("*")
 47        for file in result_files:
 48            if file.stem.startswith("."):
 49                # skip hidden files
 50                continue
 51
 52            if self.interrupted:
 53                tracking_file.write_text(json.dumps(tracked_files))
 54                raise WorkerInterruptedException("Interrupted while cleaning up orphaned result files")
 55
 56            # the key of the dataset files belong to can be extracted from the
 57            # file name in a predictable way.
 58            possible_keys = re.findall(r"[abcdef0-9]{32}", file.stem)
 59            if not possible_keys:
 60                self.log.warning("File %s does not seem to be a result file - clean up manually" % file)
 61                continue
 62
 63            # if for whatever reason there are multiple hashes in the filename,
 64            # the key would always be the last one
 65            key = possible_keys.pop()
 66
 67            try:
 68                dataset = DataSet(key=key, db=self.db)
 69            except DataSetException:
 70                # the dataset has been deleted since, but the result file still
 71                # exists - should be safe to clean up
 72                if file.name not in tracked_files:
 73                    self.log.info(f"No matching dataset with key {key} for file {file}; marking for deletion")
 74                    tracked_files[file.name] = datetime.now().timestamp() + (self.days_to_keep * 86400)
 75                elif tracked_files[file.name] < datetime.now().timestamp():
 76                    self.log.info(f"File {file} marked for deletion since {datetime.fromtimestamp(tracked_files[file.name]).strftime('%Y-%m-%d %H:%M:%S')}, deleting file")
 77                    if file.is_dir():
 78                        try:
 79                            shutil.rmtree(file)
 80                        except PermissionError:
 81                            self.log.info(f"Folder {file} does not belong to a dataset but cannot be deleted (no "
 82                                          f"permissions), skipping")
 83
 84                    else:
 85                        try:
 86                            file.unlink()
 87                        except FileNotFoundError:
 88                            # the file has been deleted since
 89                            pass
 90
 91                    # Remove from tracking
 92                    del tracked_files[file.name]
 93
 94                continue
 95
 96            if file.is_dir() and "-staging" in file.stem and dataset.is_finished():
 97                # staging area exists but dataset is marked as finished
 98                # if the dataset is finished, the staging area should have been
 99                # compressed into a zip file, or deleted, so this is also safe
100                # to clean up
101                if file.name not in tracked_files:
102                    self.log.info("Dataset %s is finished, but staging area remains at %s, marking for deletion" % (dataset.key, str(file)))
103                    tracked_files[file.name] = datetime.now().timestamp() + (self.days_to_keep * 86400)
104                elif tracked_files[file.name] < datetime.now().timestamp():
105                    self.log.info("Dataset %s is finished, but staging area remains at %s, deleting folder" % (dataset.key, str(file)))
106                    shutil.rmtree(file)
107
108        # Update tracked files
109        tracking_file.write_text(json.dumps(tracked_files))
110
111        self.job.finish()
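
The tracking file used above (.temp_file_cleaner in the data folder) is a plain JSON object mapping a file or folder name to the earliest Unix timestamp at which it may be deleted. A minimal sketch for inspecting it follows; the data path is a placeholder, since the worker itself resolves PATH_DATA from its configuration:

import json
from datetime import datetime
from pathlib import Path

# Placeholder path: point this at the 4CAT data folder (PATH_DATA).
data_path = Path("/path/to/4cat/data")

tracking_file = data_path.joinpath(".temp_file_cleaner")
if tracking_file.exists():
    for name, deadline in json.loads(tracking_file.read_text()).items():
        # each value is "first sighting + days_to_keep * 86400" seconds
        print(name, "eligible for deletion after", datetime.fromtimestamp(deadline))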
class TempFileCleaner(backend.lib.worker.BasicWorker):

Clean up discarded temporary files

If 4CAT crashes while processing something, it may result in staging folders that are never cleaned up. This worker checks for finished datasets with staging area folders and cleans them up.

Also cleans up orphaned result files for datasets that no longer exist.

type = 'clean-temp-files'
max_workers = 1
ensure_job = {'remote_id': 'localhost', 'interval': 10800}
days_to_keep = 7
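
Taken together, ensure_job re-runs this worker every 10800 seconds (three hours), and days_to_keep gives every candidate file a seven-day grace period before deletion. A rough sketch of the per-file decision, using a hypothetical helper mark_or_check that is not part of the class:

import time

DAYS_TO_KEEP = 7                      # mirrors TempFileCleaner.days_to_keep
GRACE_SECONDS = DAYS_TO_KEEP * 86400  # 604800 seconds

def mark_or_check(tracked, name, now):
    """Return True only once the grace period for `name` has expired."""
    if name not in tracked:
        # first sighting: record a deadline, but do not delete anything yet
        tracked[name] = now + GRACE_SECONDS
        return False
    # a later run may delete the file once the recorded deadline has passed
    return tracked[name] < now

tracked = {}
print(mark_or_check(tracked, "orphan.csv", time.time()))                      # False: just marked
print(mark_or_check(tracked, "orphan.csv", time.time() + GRACE_SECONDS + 1))  # True: grace period over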
def work(self):

Go through result files, and for each one check if it should still exist

Returns None.
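
As the source above shows, the dataset key is recovered from a result file's name by matching 32-character hexadecimal strings and, if there are several, taking the last one. The stems below are invented examples, not real 4CAT output:

import re

stems = [
    "5f2c1f0a9b8d7e6c5b4a39281706f5e4",          # plain result file
    "5f2c1f0a9b8d7e6c5b4a39281706f5e4-staging",  # leftover staging folder
    "notes",                                     # no key: logged and skipped
]

for stem in stems:
    keys = re.findall(r"[abcdef0-9]{32}", stem)
    print(stem, "->", keys[-1] if keys else "no dataset key")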