datasources.media_import.import_media
```python
import re
import math
import json
import time
import zipfile
import mimetypes
from io import BytesIO

from backend.lib.processor import BasicProcessor
from common.config_manager import config
from common.lib.exceptions import QueryParametersException, QueryNeedsExplicitConfirmationException
from common.lib.user_input import UserInput
from common.lib.helpers import andify

# approximate number of files that can be uploaded in a single request rounded to 100
# todo: bring this back into get_options after merging #455
# max_files_approx = int((math.floor(config.get('flask.max_form_parts', 1000)-50)/100)) * 100


class SearchMedia(BasicProcessor):
    type = "media-import-search"  # job ID
    category = "Search"  # category
    title = "Upload Media"  # title displayed in UI
    description = "Upload your own audio, video, or image files to be used as a dataset"  # description displayed in UI
    extension = "zip"  # extension of result file, used internally and in UI
    is_local = False  # Whether this datasource is locally scraped
    is_static = False  # Whether this datasource is still updated

    max_workers = 1

    disallowed_characters = re.compile(r"[^a-zA-Z0-9._+-]")
    accepted_file_types = ["audio", "video", "image"]

    @classmethod
    def get_options(cls, parent_dataset=None, user=None):
        return {
            "intro": {
                "type": UserInput.OPTION_INFO,
                # Large numbers of files fail, possibly due to the Flask request size limit (not file size)
                "help": "Upload media files to make them available for further analysis. "
                        "Please include only one type of file per dataset (image, audio, or video) and "
                        "4CAT will be able to run various processors on these media collections. "
                        "\n\nFor collections **larger than a few hundred**, please upload a single "
                        "ZIP file. More than ~500 files will fail (and a ZIP file will also load much faster)."
            },
            "data_upload": {
                "type": UserInput.OPTION_FILE,
                "multiple": True,
                "help": "Files"
            },
        }

    @staticmethod
    def validate_query(query, request, user):
        """
        Step 1: Validate query and files

        Confirms that the uploaded files exist and that the media type is valid.

        :param dict query:  Query parameters, from client-side.
        :param request:  Flask request
        :param User user:  User object of user who has submitted the query
        :return dict:  Safe query parameters
        """
        # do we have uploaded files?
        bad_files = []
        seen_types = set()
        all_files = 0

        # if multiple files are uploaded, the first validate_query call only receives a single file
        uploaded_files = request.files.getlist("option-data_upload")
        single_zip_file = uploaded_files and len(uploaded_files) == 1 and uploaded_files[0].filename.lower().endswith(".zip")

        # option-data_upload-entries is passed the first time the form is submitted if the user has uploaded a zip file
        if "option-data_upload-entries" in request.form or single_zip_file:
            # we have a zip file!
            try:
                if single_zip_file:
                    # we have a single uploaded zip file
                    # i.e. the query has already been validated (else we would have
                    # -entries and no file) and we can get the file info from the
                    # zip file itself
                    uploaded_files[0].seek(0)
                    zip_file_data = BytesIO(uploaded_files[0].read())
                    with zipfile.ZipFile(zip_file_data, "r") as uploaded_zip:
                        files = [{"filename": f} for f in uploaded_zip.namelist()]
                else:
                    # validating - get file names from entries field
                    files = json.loads(request.form["option-data_upload-entries"])

                # ignore known metadata files
                files = [f for f in files if not (
                        f["filename"].split("/")[-1].startswith(".")
                        or f["filename"].endswith(".log")
                        or f["filename"].split("/")[-1].startswith("__MACOSX")
                        or f["filename"].endswith(".DS_Store")
                        or f["filename"].endswith("/")  # sub-directory
                )]

                # figure out if we have mixed media types
                seen_types = set()
                for file in files:
                    if not query.get("frontend-confirm") and file["filename"].lower().endswith(".svg"):
                        raise QueryNeedsExplicitConfirmationException(
                            "SVG files may not be processed correctly by some 4CAT processors. "
                            "Are you sure you want to proceed?")
                    try:
                        file_type = mimetypes.guess_type(file["filename"])[0].split("/")[0]
                        seen_types.add(file_type)
                        all_files += 1
                    except (AttributeError, TypeError):
                        bad_files.append(file["filename"])

            except (ValueError, zipfile.BadZipFile):
                raise QueryParametersException("Cannot read zip file - it may be encrypted or corrupted and cannot "
                                               "be uploaded to 4CAT.")

        elif "option-data_upload" not in request.files:
            raise QueryParametersException("No files were offered for upload.")

        elif len(uploaded_files) < 1:
            raise QueryParametersException("No files were offered for upload.")

        else:
            # we just have a bunch of separate files
            # check file types to ensure all are the same type of media
            for file in uploaded_files:
                # allow metadata files and log files to be uploaded
                if file.filename == ".metadata.json" or file.filename.endswith(".log"):
                    continue

                # warn on SVG files
                if not query.get("frontend-confirm") and file.filename.lower().endswith(".svg"):
                    raise QueryNeedsExplicitConfirmationException(
                        "SVG files may not be processed correctly by some 4CAT processors. "
                        "Are you sure you want to proceed?")

                # when uploading multiple files, we don't want zips
                if file.filename.lower().endswith(".zip"):
                    raise QueryParametersException("When uploading media in a zip archive, please upload exactly one "
                                                   "zip file; 4CAT cannot combine multiple separate zip archives.")

                # guess mime type from filename; we only have partial files at this point
                mime_type = mimetypes.guess_type(file.filename)[0]
                if mime_type is None:
                    bad_files.append(file.filename)
                    continue

                mime_type = mime_type.split("/")[0]
                if mime_type not in SearchMedia.accepted_file_types:
                    raise QueryParametersException(f"This data source only accepts "
                                                   f"{andify(SearchMedia.accepted_file_types)} files; "
                                                   f"'{file.filename}' was detected as {mime_type}, which 4CAT cannot "
                                                   f"process.")

                seen_types.add(mime_type)
                all_files += 1

        # we need to at least be able to recognise the extension to know we can
        # do something with the file...
        if bad_files:
            separator = "\n- "
            raise QueryParametersException("The type of the following files cannot be determined; rename or remove "
                                           f"them from the archive:{separator}{separator.join(bad_files)}")

        # this is not fool-proof, but uncommon extensions are less likely to work
        # anyway and the user can still choose to proceed
        if len(seen_types) > 1:
            raise QueryParametersException(
                f"The uploaded files are of multiple media types ({andify(seen_types)}). 4CAT processors require "
                "files of a single type to work properly. Please re-upload only a single type of media to proceed."
            )

        return {
            "time": time.time(),
            "media_type": seen_types.pop() if seen_types else None,  # first validation check may not have seen any valid files
            "num_files": all_files,
        }

    @staticmethod
    def after_create(query, dataset, request):
        """
        Step 2: Hook to execute after the dataset for this source has been created

        In this case, save the files in a zip archive.

        :param dict query:  Sanitised query parameters
        :param DataSet dataset:  Dataset created for this query
        :param request:  Flask request submitted for its creation
        """
        mime_type = query.get("media_type")
        saved_files = 0
        skipped_files = []
        with zipfile.ZipFile(dataset.get_results_path(), "w", compression=zipfile.ZIP_STORED) as new_zip_archive:
            for file in request.files.getlist("option-data_upload"):
                # check if file is a zip archive
                file_mime_type = mimetypes.guess_type(file.filename)[0]
                if file_mime_type == "application/zip":
                    # save inner files from the zip archive to the new zip archive with all files
                    file.seek(0)
                    zip_file_data = BytesIO(file.read())
                    with zipfile.ZipFile(zip_file_data, "r") as inner_zip_archive:
                        for inner_file in inner_zip_archive.infolist():
                            if inner_file.is_dir():
                                continue

                            guessed_file_mime_type = mimetypes.guess_type(inner_file.filename)
                            if guessed_file_mime_type[0]:
                                mime_type = guessed_file_mime_type[0].split("/")[0]

                            # skip useless metadata files
                            # also skip files not recognised as media files
                            clean_file_name = inner_file.filename.split("/")[-1]
                            if not guessed_file_mime_type[0] or (
                                    mime_type not in SearchMedia.accepted_file_types
                                    and not clean_file_name.endswith(".log")
                                    and not clean_file_name == ".metadata.json"
                            ) or clean_file_name.startswith("__MACOSX") \
                                    or inner_file.filename.startswith("__MACOSX"):
                                print(f"skipping {clean_file_name} ({guessed_file_mime_type})")
                                skipped_files.append(inner_file.filename)
                                continue

                            # save inner file from the uploaded zip archive to the new zip with all files
                            new_filename = SearchMedia.get_safe_filename(inner_file.filename, new_zip_archive)
                            new_zip_archive.writestr(new_filename, inner_zip_archive.read(inner_file))

                            if new_filename != ".metadata.json" and not new_filename.endswith(".log"):
                                saved_files += 1
                    continue

                new_filename = SearchMedia.get_safe_filename(file.filename, new_zip_archive)
                with new_zip_archive.open(new_filename, mode="w") as dest_file:
                    file.seek(0)
                    while True:
                        chunk = file.read(1024)
                        if len(chunk) == 0:
                            break
                        dest_file.write(chunk)

                if new_filename != ".metadata.json" and not new_filename.endswith(".log"):
                    saved_files += 1

        # update the number of files in the dataset
        dataset.num_files = saved_files
        dataset.media_type = mime_type
        # TODO: use job's 'details' to save data to share/log?
        if skipped_files:
            # todo: this now doesn't actually get logged because the log is
            # re-initialised after after_create runs?
            dataset.log("The following files in the uploaded zip archive were skipped because they were not "
                        "recognised as media files:\n- " + "\n- ".join(skipped_files))

    def process(self):
        """
        Step 3: Ummmm, we kinda did everything

        The files were already saved to the results archive in after_create;
        all that is left is to check for SVG files and finish the dataset.
        """
        # check for SVG files
        svg_warning = 0
        if self.parameters.get("media_type") == "image":
            for file in self.iterate_archive_contents(self.dataset.get_results_path()):
                if file.suffix == ".svg":
                    if svg_warning == 0:
                        self.dataset.log("SVG files may not be processed correctly by some 4CAT processors.")
                    self.dataset.log(f"SVG file detected: {file.name}")
                    svg_warning += 1
        self.dataset.update_status(
            f"Uploaded {self.parameters.get('num_files')} files of type {self.parameters.get('media_type')}"
            f"{'' if svg_warning == 0 else f' ({svg_warning} SVG files; see log)'}",
            is_final=True)
        self.dataset.finish(self.parameters.get("num_files"))

    @staticmethod
    def get_safe_filename(filename, zip_archive=None):
        new_filename = SearchMedia.disallowed_characters.sub("", filename)
        if zip_archive:
            # ensure the name does not collide with a file already in the archive
            base_filename = new_filename
            index = 1
            while new_filename in zip_archive.namelist():
                new_filename = base_filename + "_" + str(index)
                index += 1

        return new_filename
```
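A note on the flow implemented above: when the user selects a zip file, the form is first submitted with an `option-data_upload-entries` field so the file list can be validated before the (potentially large) upload itself. Judging by the parsing code (`json.loads(...)` followed by `f["filename"]` lookups), that field carries a JSON list of objects with at least a `filename` key; the exact payload the 4CAT frontend sends may include more keys, but a minimal sketch of the inferred shape and filtering is:

```python
import json

# hypothetical example payload for the option-data_upload-entries form field
entries_field = json.dumps([
    {"filename": "photos/cat.jpg"},
    {"filename": "photos/subdir/"},    # trailing slash: filtered out as a sub-directory
    {"filename": "photos/.DS_Store"},  # known metadata file: filtered out
])

files = json.loads(entries_field)
media_files = [f for f in files if not (
    f["filename"].split("/")[-1].startswith(".")
    or f["filename"].endswith("/")
)]
print(media_files)  # [{'filename': 'photos/cat.jpg'}]
```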
```python
class SearchMedia(BasicProcessor):
```
Abstract processor class
A processor takes a finished dataset as input and processes its result in some way, with another dataset set as output. The input thus is a file, and the output (usually) as well. In other words, the result of a processor can be used as input for another processor (though whether and when this is useful is another question).
To determine whether a processor can process a given dataset, you can define an `is_compatible_with(FourcatModule module=None, str user=None) -> bool` class method which takes a dataset as argument and returns a bool that determines if this processor is considered compatible with that dataset. For example:
```python
@classmethod
def is_compatible_with(cls, module=None, user=None):
    return module.type == "linguistic-features"
```
```python
@classmethod
def get_options(cls, parent_dataset=None, user=None):
```
Get processor options
This method by default returns the class's "options" attribute, or an empty dictionary. It can be redefined by processors that need more fine-grained options, e.g. in cases where the availability of options is partially determined by the parent dataset's parameters.
Parameters
- DataSet parent_dataset: An object representing the dataset that the processor would be run on
- User user: Flask user the options will be displayed for, in case they are requested for display in the 4CAT web interface. This can be used to show some options only to privileged users.
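As a hypothetical illustration (the option keys and the parent dataset attribute used here are invented for the example, not part of this data source), a processor overriding this method might narrow its options based on the parent dataset's parameters:

```python
@classmethod
def get_options(cls, parent_dataset=None, user=None):
    options = {
        "amount": {
            "type": UserInput.OPTION_TEXT,
            "help": "Number of items to process",
            "default": 100
        }
    }
    # hypothetical: only offer an audio-specific toggle when the parent
    # dataset (if any) was collected as audio
    if parent_dataset and parent_dataset.parameters.get("media_type") == "audio":
        options["transcribe"] = {
            "type": UserInput.OPTION_TOGGLE,
            "help": "Transcribe audio",
            "default": False
        }
    return options
```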
```python
@staticmethod
def validate_query(query, request, user):
```
Step 1: Validate query and files
Confirms that the uploaded files exist and that the media type is valid.
Parameters
- dict query: Query parameters, from client-side.
- request: Flask request
- User user: User object of user who has submitted the query
Returns
Safe query parameters
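The media type check above ultimately reduces to the top-level part of the MIME type guessed from the file extension. A minimal standalone sketch of that logic (plain standard library, mirroring but not part of the actual method):

```python
import mimetypes

def detect_media_type(filename):
    """Top-level media type guessed from the file extension, or None."""
    mime_type = mimetypes.guess_type(filename)[0]  # e.g. "image/png"
    return mime_type.split("/")[0] if mime_type else None

print(detect_media_type("photo.PNG"))    # image (guess_type ignores case)
print(detect_media_type("clip.mp4"))     # video
print(detect_media_type("track.mp3"))    # audio
print(detect_media_type("mystery.xyz"))  # None -> would end up in bad_files
```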
```python
@staticmethod
def after_create(query, dataset, request):
```
Step 2: Hook to execute after the dataset for this source has been created
In this case, save the files in a zip archive.
Parameters
- dict query: Sanitised query parameters
- DataSet dataset: Dataset created for this query
- request: Flask request submitted for its creation
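The hook streams each upload into a single uncompressed archive. A condensed, self-contained sketch of that copy step (the function name and parameters are illustrative, not part of 4CAT):

```python
import zipfile

def copy_into_zip(source_file, archive_path, member_name, chunk_size=1024):
    """Stream a file-like object into a zip archive member in chunks."""
    # ZIP_STORED skips compression: most media formats are already compressed,
    # so recompressing them costs time for little gain
    with zipfile.ZipFile(archive_path, "a", compression=zipfile.ZIP_STORED) as archive:
        with archive.open(member_name, mode="w") as dest_file:
            source_file.seek(0)
            while chunk := source_file.read(chunk_size):
                dest_file.write(chunk)
```

Reading in fixed-size chunks keeps memory use flat regardless of how large the uploaded file is.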
```python
def process(self):
```
Step 3: Ummmm, we kinda did everything

The files were already saved to the results archive in after_create; all that is left is to check for SVG files and finish the dataset.
```python
@staticmethod
def get_safe_filename(filename, zip_archive=None):
```
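For example, the sanitiser strips any character outside `[a-zA-Z0-9._+-]` and, when an archive is supplied, appends a numeric suffix until the name no longer collides with an existing member (note that in the current implementation the suffix lands after the extension):

```python
SearchMedia.get_safe_filename("my photo (1).jpg")
# -> "myphoto1.jpg"  (spaces and parentheses removed)

# with an archive whose namelist() already contains "myphoto1.jpg":
# -> "myphoto1.jpg_1"
```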
Inherited Members
- backend.lib.worker.BasicWorker
  - BasicWorker
  - INTERRUPT_NONE
  - INTERRUPT_RETRY
  - INTERRUPT_CANCEL
  - queue
  - log
  - manager
  - interrupted
  - modules
  - init_time
  - name
  - run
  - clean_up
  - request_interrupt
  - is_4cat_class
- backend.lib.processor.BasicProcessor
  - db
  - job
  - dataset
  - owner
  - source_dataset
  - source_file
  - config
  - is_running_in_preset
  - filepath
  - work
  - after_process
  - remove_files
  - abort
  - add_field_to_parent
  - iterate_archive_contents
  - unpack_archive_contents
  - extract_archived_file_by_name
  - write_csv_items_and_finish
  - write_archive_and_finish
  - create_standalone
  - map_item_method_available
  - get_mapped_item
  - is_filter
  - get_status
  - is_top_dataset
  - is_from_collector
  - get_extension
  - is_rankable
  - exclude_followup_processors
  - is_4cat_processor