datasources.media_import.import_media
"""
Import media files (audio, video, image) uploaded by the user as a 4CAT
dataset. The uploaded files are stored together in a single zip archive
that other 4CAT processors can then analyse.
"""
import re
import json
import time
import zipfile
import mimetypes
from io import BytesIO

from backend.lib.processor import BasicProcessor
from common.lib.exceptions import QueryParametersException, QueryNeedsExplicitConfirmationException
from common.lib.user_input import UserInput
from common.lib.helpers import andify


class SearchMedia(BasicProcessor):
    """
    Create a 4CAT dataset from user-uploaded media files.

    Accepts either a batch of separate audio/video/image files or a single
    zip archive of them; everything is stored in one zip archive.
    """
    type = "media-import-search"  # job ID
    category = "Search"  # category
    title = "Upload Media"  # title displayed in UI
    description = "Upload your own audio, video, or image files to be used as a dataset"  # description displayed in UI
    extension = "zip"  # extension of result file, used internally and in UI
    is_local = False  # Whether this datasource is locally scraped
    is_static = False  # Whether this datasource is still updated

    max_workers = 1

    # characters stripped from file names before storing them in the archive
    disallowed_characters = re.compile(r"[^a-zA-Z0-9._+-]")
    # top-level mime types this datasource can process
    accepted_file_types = ["audio", "video", "image"]

    @classmethod
    def get_options(cls, parent_dataset=None, config=None):
        """
        Get processor options.

        :param DataSet parent_dataset:  Dataset the processor would run on (unused)
        :param config:  Configuration reader (context-aware)
        :return dict:  Form definition for the upload interface
        """
        # approximate number of files that can be uploaded in a single request rounded to 100
        #max_files_approx = int((math.floor(config.get('flask.max_form_parts', 1000) - 50) / 100)) * 100

        return {
            "intro": {
                "type": UserInput.OPTION_INFO,
                # Large numbers of files fail possibly due to Flask request size limit (not file size)
                "help": "Upload media files to make them be available for further analysis. "
                        "Please include only one type of file per dataset (image, audio, or video) and "
                        "4CAT will be able to run various processors on these media collections. "
                        "\n\nFor collections **larger than a few hundred**, please upload a single "
                        "ZIP file. More than ~500 files will fail (and a ZIP file will also load much faster)."
            },
            "data_upload": {
                "type": UserInput.OPTION_FILE,
                "multiple": True,
                "help": "Files"
            },
        }

    @staticmethod
    def validate_query(query, request, config):
        """
        Step 1: Validate query and files

        Confirms that the uploaded files exist and that the media type is valid.

        :param dict query:  Query parameters, from client-side.
        :param request:  Flask request
        :param ConfigManager|None config:  Configuration reader (context-aware)
        """
        # do we have uploaded files?
        bad_files = []
        seen_types = set()
        all_files = 0

        uploaded_files = request.files.getlist("option-data_upload")  # if multiple files are uploaded; the first validate_query only has a single file
        single_zip_file = uploaded_files and len(uploaded_files) == 1 and uploaded_files[0].filename.lower().endswith(".zip")

        # option-data_upload-entries is passed the first time the form is submitted if the user has uploaded a zip file
        if "option-data_upload-entries" in request.form or single_zip_file:
            # we have a zip file!
            try:
                if single_zip_file:
                    # we have a single uploaded zip file
                    # i.e. the query has already been validated (else we would have
                    # -entries and no file) and we can get the file info from the
                    # zip file itself
                    uploaded_files[0].seek(0)
                    zip_file_data = BytesIO(uploaded_files[0].read())
                    with zipfile.ZipFile(zip_file_data, "r") as uploaded_zip:
                        files = [{"filename": f} for f in uploaded_zip.namelist()]
                else:
                    # validating - get file names from entries field
                    files = json.loads(request.form["option-data_upload-entries"])

                # ignore known metadata files
                files = [f for f in files if not (
                    f["filename"].split("/")[-1].startswith(".")
                    or f["filename"].endswith(".log")
                    or f["filename"].startswith("__MACOSX")  # macOS resource fork directory
                    or f["filename"].split("/")[-1].startswith("__MACOSX")
                    or f["filename"].endswith(".DS_Store")
                    or f["filename"].endswith("/")  # sub-directory
                )]

                # figure out if we have mixed media types
                for file in files:
                    # case-insensitive, consistent with the separate-files branch below
                    if not query.get("frontend-confirm") and file["filename"].lower().endswith(".svg"):
                        raise QueryNeedsExplicitConfirmationException("SVG files may not be processed correctly by some 4CAT processors. Are you sure you want to proceed?")
                    try:
                        file_type = mimetypes.guess_type(file["filename"])[0].split("/")[0]
                        seen_types.add(file_type)
                        all_files += 1
                    except (AttributeError, TypeError):
                        # guess_type returned None - extension not recognised
                        bad_files.append(file["filename"])

            except (ValueError, zipfile.BadZipfile):
                raise QueryParametersException("Cannot read zip file - it may be encrypted or corrupted and cannot "
                                               "be uploaded to 4CAT.")

        elif "option-data_upload" not in request.files:
            raise QueryParametersException("No files were offered for upload.")

        elif len(uploaded_files) < 1:
            raise QueryParametersException("No files were offered for upload.")

        else:
            # we just have a bunch of separate files
            # Check file types to ensure all are same type of media
            for file in uploaded_files:
                # Allow metadata files and log files to be uploaded
                if file.filename == ".metadata.json" or file.filename.endswith(".log"):
                    continue

                # Warn on SVG files
                if not query.get("frontend-confirm") and file.filename.lower().endswith(".svg"):
                    raise QueryNeedsExplicitConfirmationException(
                        "SVG files may not be processed correctly by some 4CAT processors. Are you sure you want to proceed?")

                # when uploading multiple files, we don't want zips
                if file.filename.lower().endswith(".zip"):
                    raise QueryParametersException("When uploading media in a zip archive, please upload exactly one "
                                                   "zip file; 4CAT cannot combine multiple separate zip archives.")

                # Guess mime type from filename; we only have partial files at this point
                mime_type = mimetypes.guess_type(file.filename)[0]
                if mime_type is None:
                    bad_files.append(file.filename)
                    continue

                mime_type = mime_type.split('/')[0]
                if mime_type not in SearchMedia.accepted_file_types:
                    raise QueryParametersException(f"This data source only accepts "
                                                   f"{andify(SearchMedia.accepted_file_types)} files; "
                                                   f"'{file.filename}' was detected as {mime_type}, which 4CAT cannot "
                                                   f"process.")

                seen_types.add(mime_type)
                all_files += 1

        # we need to at least be able to recognise the extension to know we can
        # do something with the file...
        if bad_files:
            separator = "\n- "
            raise QueryParametersException("The type of the following files cannot be determined; rename or remove "
                                           f"them:{separator}{separator.join(bad_files)}")

        # this is not fool-proof, but uncommon extensions are less likely to work
        # anyway and the user can still choose to proceed
        if len(seen_types) > 1:
            raise QueryParametersException(
                f"The zip file contains files of multiple media types ({andify(seen_types)}). 4CAT processors require "
                "files of a single type to work properly. Please re-upload only a single type of media to proceed."
            )

        return {
            "time": time.time(),
            "media_type": seen_types.pop() if seen_types else None,  # first validation check may not have seen any valid files
            "num_files": all_files,
        }

    @staticmethod
    def after_create(query, dataset, request):
        """
        Step 2: Hook to execute after the dataset for this source has been created

        In this case, save the files in a zip archive.

        :param dict query:  Sanitised query parameters
        :param DataSet dataset:  Dataset created for this query
        :param request:  Flask request submitted for its creation
        """
        mime_type = query.get("media_type")
        saved_files = 0
        skipped_files = []
        with zipfile.ZipFile(dataset.get_results_path(), "w", compression=zipfile.ZIP_STORED) as new_zip_archive:
            for file in request.files.getlist("option-data_upload"):
                # Check if file is zip archive
                file_mime_type = mimetypes.guess_type(file.filename)[0]
                if file_mime_type == "application/zip":
                    # Save inner files from zip archive to new zip archive with all files
                    file.seek(0)
                    zip_file_data = BytesIO(file.read())
                    with zipfile.ZipFile(zip_file_data, "r") as inner_zip_archive:
                        for inner_file in inner_zip_archive.infolist():
                            if inner_file.is_dir():
                                continue

                            guessed_file_mime_type = mimetypes.guess_type(inner_file.filename)
                            if guessed_file_mime_type[0]:
                                mime_type = guessed_file_mime_type[0].split('/')[0]

                            clean_file_name = inner_file.filename.split("/")[-1]
                            # metadata and log files are always kept; all other
                            # files must be a recognised, accepted media type
                            is_metadata = clean_file_name == ".metadata.json" or clean_file_name.endswith(".log")
                            if (not is_metadata and (
                                    not guessed_file_mime_type[0]
                                    or mime_type not in SearchMedia.accepted_file_types
                            )) or clean_file_name.startswith("__MACOSX") \
                                    or inner_file.filename.startswith("__MACOSX"):
                                skipped_files.append(inner_file.filename)
                                continue

                            # save inner file from the uploaded zip archive to the new zip with all files
                            new_filename = SearchMedia.get_safe_filename(inner_file.filename, new_zip_archive)
                            new_zip_archive.writestr(new_filename, inner_zip_archive.read(inner_file))

                            # metadata/log files do not count as media files
                            if new_filename != ".metadata.json" and not new_filename.endswith(".log"):
                                saved_files += 1
                    continue

                new_filename = SearchMedia.get_safe_filename(file.filename, new_zip_archive)
                with new_zip_archive.open(new_filename, mode='w') as dest_file:
                    file.seek(0)
                    while True:
                        chunk = file.read(1024)
                        if len(chunk) == 0:
                            break
                        dest_file.write(chunk)

                # metadata/log files do not count as media files
                if new_filename != ".metadata.json" and not new_filename.endswith(".log"):
                    saved_files += 1

        # update the number of files in the dataset
        dataset.num_files = saved_files
        dataset.media_type = mime_type
        # TODO: use job's 'details' to save data to share/log?
        if skipped_files:
            # todo: this now doesn't actually get logged because the log is
            # re-initialised after after_create runs?
            dataset.log("The following files in the uploaded zip archive were skipped because they were not recognised "
                        "as media files:\n- " + "\n- ".join(skipped_files))

    def process(self):
        """
        Step 3: Ummmm, we kinda did everything

        The files were already stored in after_create; here we only warn
        about SVG files and mark the dataset as finished.
        """
        # Check for SVG files
        svg_warning = 0
        if self.parameters.get("media_type") == "image":
            for file in self.iterate_archive_contents(self.dataset.get_results_path()):
                if file.suffix == ".svg":
                    if svg_warning == 0:
                        self.dataset.log("SVG files may not be processed correctly by some 4CAT processors.")
                    self.dataset.log(f"SVG file detected: {file.name}")
                    svg_warning += 1
        self.dataset.update_status(f"Uploaded {self.parameters.get('num_files')} files of type {self.parameters.get('media_type')}{'' if svg_warning == 0 else f' ({svg_warning} SVG files; see log)'}", is_final=True)
        self.dataset.finish(self.parameters.get("num_files"))

    @staticmethod
    def get_safe_filename(filename, zip_archive=None):
        """
        Sanitise a file name for storage in the result archive.

        Strips disallowed characters and, if an archive is given, makes the
        name unique within it by appending an incrementing index before the
        extension (so mime type detection keeps working).

        :param str filename:  Original file name
        :param zipfile.ZipFile zip_archive:  Archive to deduplicate against
        :return str:  Safe, unique file name
        """
        new_filename = SearchMedia.disallowed_characters.sub("", filename)
        if zip_archive:
            # check if file is already in zip archive
            existing_names = set(zip_archive.namelist())
            stem, dot, suffix = new_filename.rpartition(".")
            index = 1
            while new_filename in existing_names:
                if dot:
                    new_filename = f"{stem}_{index}.{suffix}"
                else:
                    new_filename = f"{suffix}_{index}"
                index += 1

        return new_filename
class SearchMedia(BasicProcessor):
    """
    Create a 4CAT dataset from user-uploaded media files.

    Accepts either a batch of separate audio/video/image files or a single
    zip archive of them; everything is stored in one zip archive.
    """
    type = "media-import-search"  # job ID
    category = "Search"  # category
    title = "Upload Media"  # title displayed in UI
    description = "Upload your own audio, video, or image files to be used as a dataset"  # description displayed in UI
    extension = "zip"  # extension of result file, used internally and in UI
    is_local = False  # Whether this datasource is locally scraped
    is_static = False  # Whether this datasource is still updated

    max_workers = 1

    # characters stripped from file names before storing them in the archive
    disallowed_characters = re.compile(r"[^a-zA-Z0-9._+-]")
    # top-level mime types this datasource can process
    accepted_file_types = ["audio", "video", "image"]

    @classmethod
    def get_options(cls, parent_dataset=None, config=None):
        """
        Get processor options.

        :param DataSet parent_dataset:  Dataset the processor would run on (unused)
        :param config:  Configuration reader (context-aware)
        :return dict:  Form definition for the upload interface
        """
        # approximate number of files that can be uploaded in a single request rounded to 100
        #max_files_approx = int((math.floor(config.get('flask.max_form_parts', 1000) - 50) / 100)) * 100

        return {
            "intro": {
                "type": UserInput.OPTION_INFO,
                # Large numbers of files fail possibly due to Flask request size limit (not file size)
                "help": "Upload media files to make them be available for further analysis. "
                        "Please include only one type of file per dataset (image, audio, or video) and "
                        "4CAT will be able to run various processors on these media collections. "
                        "\n\nFor collections **larger than a few hundred**, please upload a single "
                        "ZIP file. More than ~500 files will fail (and a ZIP file will also load much faster)."
            },
            "data_upload": {
                "type": UserInput.OPTION_FILE,
                "multiple": True,
                "help": "Files"
            },
        }

    @staticmethod
    def validate_query(query, request, config):
        """
        Step 1: Validate query and files

        Confirms that the uploaded files exist and that the media type is valid.

        :param dict query:  Query parameters, from client-side.
        :param request:  Flask request
        :param ConfigManager|None config:  Configuration reader (context-aware)
        """
        # do we have uploaded files?
        bad_files = []
        seen_types = set()
        all_files = 0

        uploaded_files = request.files.getlist("option-data_upload")  # if multiple files are uploaded; the first validate_query only has a single file
        single_zip_file = uploaded_files and len(uploaded_files) == 1 and uploaded_files[0].filename.lower().endswith(".zip")

        # option-data_upload-entries is passed the first time the form is submitted if the user has uploaded a zip file
        if "option-data_upload-entries" in request.form or single_zip_file:
            # we have a zip file!
            try:
                if single_zip_file:
                    # we have a single uploaded zip file
                    # i.e. the query has already been validated (else we would have
                    # -entries and no file) and we can get the file info from the
                    # zip file itself
                    uploaded_files[0].seek(0)
                    zip_file_data = BytesIO(uploaded_files[0].read())
                    with zipfile.ZipFile(zip_file_data, "r") as uploaded_zip:
                        files = [{"filename": f} for f in uploaded_zip.namelist()]
                else:
                    # validating - get file names from entries field
                    files = json.loads(request.form["option-data_upload-entries"])

                # ignore known metadata files
                files = [f for f in files if not (
                    f["filename"].split("/")[-1].startswith(".")
                    or f["filename"].endswith(".log")
                    or f["filename"].startswith("__MACOSX")  # macOS resource fork directory
                    or f["filename"].split("/")[-1].startswith("__MACOSX")
                    or f["filename"].endswith(".DS_Store")
                    or f["filename"].endswith("/")  # sub-directory
                )]

                # figure out if we have mixed media types
                for file in files:
                    # case-insensitive, consistent with the separate-files branch below
                    if not query.get("frontend-confirm") and file["filename"].lower().endswith(".svg"):
                        raise QueryNeedsExplicitConfirmationException("SVG files may not be processed correctly by some 4CAT processors. Are you sure you want to proceed?")
                    try:
                        file_type = mimetypes.guess_type(file["filename"])[0].split("/")[0]
                        seen_types.add(file_type)
                        all_files += 1
                    except (AttributeError, TypeError):
                        # guess_type returned None - extension not recognised
                        bad_files.append(file["filename"])

            except (ValueError, zipfile.BadZipfile):
                raise QueryParametersException("Cannot read zip file - it may be encrypted or corrupted and cannot "
                                               "be uploaded to 4CAT.")

        elif "option-data_upload" not in request.files:
            raise QueryParametersException("No files were offered for upload.")

        elif len(uploaded_files) < 1:
            raise QueryParametersException("No files were offered for upload.")

        else:
            # we just have a bunch of separate files
            # Check file types to ensure all are same type of media
            for file in uploaded_files:
                # Allow metadata files and log files to be uploaded
                if file.filename == ".metadata.json" or file.filename.endswith(".log"):
                    continue

                # Warn on SVG files
                if not query.get("frontend-confirm") and file.filename.lower().endswith(".svg"):
                    raise QueryNeedsExplicitConfirmationException(
                        "SVG files may not be processed correctly by some 4CAT processors. Are you sure you want to proceed?")

                # when uploading multiple files, we don't want zips
                if file.filename.lower().endswith(".zip"):
                    raise QueryParametersException("When uploading media in a zip archive, please upload exactly one "
                                                   "zip file; 4CAT cannot combine multiple separate zip archives.")

                # Guess mime type from filename; we only have partial files at this point
                mime_type = mimetypes.guess_type(file.filename)[0]
                if mime_type is None:
                    bad_files.append(file.filename)
                    continue

                mime_type = mime_type.split('/')[0]
                if mime_type not in SearchMedia.accepted_file_types:
                    raise QueryParametersException(f"This data source only accepts "
                                                   f"{andify(SearchMedia.accepted_file_types)} files; "
                                                   f"'{file.filename}' was detected as {mime_type}, which 4CAT cannot "
                                                   f"process.")

                seen_types.add(mime_type)
                all_files += 1

        # we need to at least be able to recognise the extension to know we can
        # do something with the file...
        if bad_files:
            separator = "\n- "
            raise QueryParametersException("The type of the following files cannot be determined; rename or remove "
                                           f"them:{separator}{separator.join(bad_files)}")

        # this is not fool-proof, but uncommon extensions are less likely to work
        # anyway and the user can still choose to proceed
        if len(seen_types) > 1:
            raise QueryParametersException(
                f"The zip file contains files of multiple media types ({andify(seen_types)}). 4CAT processors require "
                "files of a single type to work properly. Please re-upload only a single type of media to proceed."
            )

        return {
            "time": time.time(),
            "media_type": seen_types.pop() if seen_types else None,  # first validation check may not have seen any valid files
            "num_files": all_files,
        }

    @staticmethod
    def after_create(query, dataset, request):
        """
        Step 2: Hook to execute after the dataset for this source has been created

        In this case, save the files in a zip archive.

        :param dict query:  Sanitised query parameters
        :param DataSet dataset:  Dataset created for this query
        :param request:  Flask request submitted for its creation
        """
        mime_type = query.get("media_type")
        saved_files = 0
        skipped_files = []
        with zipfile.ZipFile(dataset.get_results_path(), "w", compression=zipfile.ZIP_STORED) as new_zip_archive:
            for file in request.files.getlist("option-data_upload"):
                # Check if file is zip archive
                file_mime_type = mimetypes.guess_type(file.filename)[0]
                if file_mime_type == "application/zip":
                    # Save inner files from zip archive to new zip archive with all files
                    file.seek(0)
                    zip_file_data = BytesIO(file.read())
                    with zipfile.ZipFile(zip_file_data, "r") as inner_zip_archive:
                        for inner_file in inner_zip_archive.infolist():
                            if inner_file.is_dir():
                                continue

                            guessed_file_mime_type = mimetypes.guess_type(inner_file.filename)
                            if guessed_file_mime_type[0]:
                                mime_type = guessed_file_mime_type[0].split('/')[0]

                            clean_file_name = inner_file.filename.split("/")[-1]
                            # metadata and log files are always kept; all other
                            # files must be a recognised, accepted media type
                            is_metadata = clean_file_name == ".metadata.json" or clean_file_name.endswith(".log")
                            if (not is_metadata and (
                                    not guessed_file_mime_type[0]
                                    or mime_type not in SearchMedia.accepted_file_types
                            )) or clean_file_name.startswith("__MACOSX") \
                                    or inner_file.filename.startswith("__MACOSX"):
                                skipped_files.append(inner_file.filename)
                                continue

                            # save inner file from the uploaded zip archive to the new zip with all files
                            new_filename = SearchMedia.get_safe_filename(inner_file.filename, new_zip_archive)
                            new_zip_archive.writestr(new_filename, inner_zip_archive.read(inner_file))

                            # metadata/log files do not count as media files
                            if new_filename != ".metadata.json" and not new_filename.endswith(".log"):
                                saved_files += 1
                    continue

                new_filename = SearchMedia.get_safe_filename(file.filename, new_zip_archive)
                with new_zip_archive.open(new_filename, mode='w') as dest_file:
                    file.seek(0)
                    while True:
                        chunk = file.read(1024)
                        if len(chunk) == 0:
                            break
                        dest_file.write(chunk)

                # metadata/log files do not count as media files
                if new_filename != ".metadata.json" and not new_filename.endswith(".log"):
                    saved_files += 1

        # update the number of files in the dataset
        dataset.num_files = saved_files
        dataset.media_type = mime_type
        # TODO: use job's 'details' to save data to share/log?
        if skipped_files:
            # todo: this now doesn't actually get logged because the log is
            # re-initialised after after_create runs?
            dataset.log("The following files in the uploaded zip archive were skipped because they were not recognised "
                        "as media files:\n- " + "\n- ".join(skipped_files))

    def process(self):
        """
        Step 3: Ummmm, we kinda did everything

        The files were already stored in after_create; here we only warn
        about SVG files and mark the dataset as finished.
        """
        # Check for SVG files
        svg_warning = 0
        if self.parameters.get("media_type") == "image":
            for file in self.iterate_archive_contents(self.dataset.get_results_path()):
                if file.suffix == ".svg":
                    if svg_warning == 0:
                        self.dataset.log("SVG files may not be processed correctly by some 4CAT processors.")
                    self.dataset.log(f"SVG file detected: {file.name}")
                    svg_warning += 1
        self.dataset.update_status(f"Uploaded {self.parameters.get('num_files')} files of type {self.parameters.get('media_type')}{'' if svg_warning == 0 else f' ({svg_warning} SVG files; see log)'}", is_final=True)
        self.dataset.finish(self.parameters.get("num_files"))

    @staticmethod
    def get_safe_filename(filename, zip_archive=None):
        """
        Sanitise a file name for storage in the result archive.

        Strips disallowed characters and, if an archive is given, makes the
        name unique within it by appending an incrementing index before the
        extension (so mime type detection keeps working).

        :param str filename:  Original file name
        :param zipfile.ZipFile zip_archive:  Archive to deduplicate against
        :return str:  Safe, unique file name
        """
        new_filename = SearchMedia.disallowed_characters.sub("", filename)
        if zip_archive:
            # check if file is already in zip archive
            existing_names = set(zip_archive.namelist())
            stem, dot, suffix = new_filename.rpartition(".")
            index = 1
            while new_filename in existing_names:
                if dot:
                    new_filename = f"{stem}_{index}.{suffix}"
                else:
                    new_filename = f"{suffix}_{index}"
                index += 1

        return new_filename
Abstract processor class
A processor takes a finished dataset as input and processes its result in some way, with another dataset set as output. The input thus is a file, and the output (usually) as well. In other words, the result of a processor can be used as input for another processor (though whether and when this is useful is another question).
To determine whether a processor can process a given dataset, you can
define an `is_compatible_with(module=None, config=None) -> bool` class
method, which takes a module as argument and returns a bool that determines
if this processor is considered compatible with that dataset. For example:
@classmethod
def is_compatible_with(cls, module=None, config=None):
return module.type == "linguistic-features"
@classmethod
def get_options(cls, parent_dataset=None, config=None):
    """
    Get processor options.

    Returns the form definition for the media upload interface.

    :param DataSet parent_dataset:  Dataset the processor would run on (unused)
    :param config:  Configuration reader (context-aware)
    :return dict:  Option definitions, keyed by option name
    """
    # approximate number of files that can be uploaded in a single request rounded to 100
    #max_files_approx = int((math.floor(config.get('flask.max_form_parts', 1000) - 50) / 100)) * 100

    # Large numbers of files fail possibly due to Flask request size limit (not file size)
    intro_text = ("Upload media files to make them be available for further analysis. "
                  "Please include only one type of file per dataset (image, audio, or video) and "
                  "4CAT will be able to run various processors on these media collections. "
                  "\n\nFor collections **larger than a few hundred**, please upload a single "
                  "ZIP file. More than ~500 files will fail (and a ZIP file will also load much faster).")

    options = {}
    options["intro"] = {
        "type": UserInput.OPTION_INFO,
        "help": intro_text
    }
    options["data_upload"] = {
        "type": UserInput.OPTION_FILE,
        "multiple": True,
        "help": "Files"
    }
    return options
Get processor options
This method by default returns the class's "options" attribute, or an empty dictionary. It can be redefined by processors that need more fine-grained options, e.g. in cases where the availability of options is partially determined by the parent dataset's parameters.
Parameters
- config: Configuration reader (context-aware)
- DataSet parent_dataset: An object representing the dataset that the processor would be run on
@staticmethod
def validate_query(query, request, config):
    """
    Step 1: Validate query and files

    Confirms that the uploaded files exist and that the media type is valid.

    :param dict query:  Query parameters, from client-side.
    :param request:  Flask request
    :param ConfigManager|None config:  Configuration reader (context-aware)
    """
    # do we have uploaded files?
    bad_files = []
    seen_types = set()
    all_files = 0

    uploaded_files = request.files.getlist("option-data_upload")  # if multiple files are uploaded; the first validate_query only has a single file
    single_zip_file = uploaded_files and len(uploaded_files) == 1 and uploaded_files[0].filename.lower().endswith(".zip")

    # option-data_upload-entries is passed the first time the form is submitted if the user has uploaded a zip file
    if "option-data_upload-entries" in request.form or single_zip_file:
        # we have a zip file!
        try:
            if single_zip_file:
                # we have a single uploaded zip file
                # i.e. the query has already been validated (else we would have
                # -entries and no file) and we can get the file info from the
                # zip file itself
                uploaded_files[0].seek(0)
                zip_file_data = BytesIO(uploaded_files[0].read())
                with zipfile.ZipFile(zip_file_data, "r") as uploaded_zip:
                    files = [{"filename": f} for f in uploaded_zip.namelist()]
            else:
                # validating - get file names from entries field
                files = json.loads(request.form["option-data_upload-entries"])

            # ignore known metadata files
            files = [f for f in files if not (
                f["filename"].split("/")[-1].startswith(".")
                or f["filename"].endswith(".log")
                or f["filename"].startswith("__MACOSX")  # macOS resource fork directory
                or f["filename"].split("/")[-1].startswith("__MACOSX")
                or f["filename"].endswith(".DS_Store")
                or f["filename"].endswith("/")  # sub-directory
            )]

            # figure out if we have mixed media types
            for file in files:
                # case-insensitive, consistent with the separate-files branch below
                if not query.get("frontend-confirm") and file["filename"].lower().endswith(".svg"):
                    raise QueryNeedsExplicitConfirmationException("SVG files may not be processed correctly by some 4CAT processors. Are you sure you want to proceed?")
                try:
                    file_type = mimetypes.guess_type(file["filename"])[0].split("/")[0]
                    seen_types.add(file_type)
                    all_files += 1
                except (AttributeError, TypeError):
                    # guess_type returned None - extension not recognised
                    bad_files.append(file["filename"])

        except (ValueError, zipfile.BadZipfile):
            raise QueryParametersException("Cannot read zip file - it may be encrypted or corrupted and cannot "
                                           "be uploaded to 4CAT.")

    elif "option-data_upload" not in request.files:
        raise QueryParametersException("No files were offered for upload.")

    elif len(uploaded_files) < 1:
        raise QueryParametersException("No files were offered for upload.")

    else:
        # we just have a bunch of separate files
        # Check file types to ensure all are same type of media
        for file in uploaded_files:
            # Allow metadata files and log files to be uploaded
            if file.filename == ".metadata.json" or file.filename.endswith(".log"):
                continue

            # Warn on SVG files
            if not query.get("frontend-confirm") and file.filename.lower().endswith(".svg"):
                raise QueryNeedsExplicitConfirmationException(
                    "SVG files may not be processed correctly by some 4CAT processors. Are you sure you want to proceed?")

            # when uploading multiple files, we don't want zips
            if file.filename.lower().endswith(".zip"):
                raise QueryParametersException("When uploading media in a zip archive, please upload exactly one "
                                               "zip file; 4CAT cannot combine multiple separate zip archives.")

            # Guess mime type from filename; we only have partial files at this point
            mime_type = mimetypes.guess_type(file.filename)[0]
            if mime_type is None:
                bad_files.append(file.filename)
                continue

            mime_type = mime_type.split('/')[0]
            if mime_type not in SearchMedia.accepted_file_types:
                raise QueryParametersException(f"This data source only accepts "
                                               f"{andify(SearchMedia.accepted_file_types)} files; "
                                               f"'{file.filename}' was detected as {mime_type}, which 4CAT cannot "
                                               f"process.")

            seen_types.add(mime_type)
            all_files += 1

    # we need to at least be able to recognise the extension to know we can
    # do something with the file...
    if bad_files:
        separator = "\n- "
        raise QueryParametersException("The type of the following files cannot be determined; rename or remove "
                                       f"them:{separator}{separator.join(bad_files)}")

    # this is not fool-proof, but uncommon extensions are less likely to work
    # anyway and the user can still choose to proceed
    if len(seen_types) > 1:
        raise QueryParametersException(
            f"The zip file contains files of multiple media types ({andify(seen_types)}). 4CAT processors require "
            "files of a single type to work properly. Please re-upload only a single type of media to proceed."
        )

    return {
        "time": time.time(),
        "media_type": seen_types.pop() if seen_types else None,  # first validation check may not have seen any valid files
        "num_files": all_files,
    }
Step 1: Validate query and files
Confirms that the uploaded files exist and that the media type is valid.
Parameters
- dict query: Query parameters, from client-side.
- request: Flask request
- ConfigManager|None config: Configuration reader (context-aware)
@staticmethod
def after_create(query, dataset, request):
    """
    Step 2: Hook to execute after the dataset for this source has been created

    In this case, save the files in a zip archive.

    :param dict query:  Sanitised query parameters
    :param DataSet dataset:  Dataset created for this query
    :param request:  Flask request submitted for its creation
    """
    mime_type = query.get("media_type")
    saved_files = 0
    skipped_files = []
    with zipfile.ZipFile(dataset.get_results_path(), "w", compression=zipfile.ZIP_STORED) as new_zip_archive:
        for file in request.files.getlist("option-data_upload"):
            # Check if file is zip archive
            file_mime_type = mimetypes.guess_type(file.filename)[0]
            if file_mime_type == "application/zip":
                # Save inner files from zip archive to new zip archive with all files
                file.seek(0)
                zip_file_data = BytesIO(file.read())
                with zipfile.ZipFile(zip_file_data, "r") as inner_zip_archive:
                    for inner_file in inner_zip_archive.infolist():
                        if inner_file.is_dir():
                            continue

                        guessed_file_mime_type = mimetypes.guess_type(inner_file.filename)
                        if guessed_file_mime_type[0]:
                            mime_type = guessed_file_mime_type[0].split('/')[0]

                        clean_file_name = inner_file.filename.split("/")[-1]
                        # metadata and log files are always kept; all other
                        # files must be a recognised, accepted media type
                        is_metadata = clean_file_name == ".metadata.json" or clean_file_name.endswith(".log")
                        if (not is_metadata and (
                                not guessed_file_mime_type[0]
                                or mime_type not in SearchMedia.accepted_file_types
                        )) or clean_file_name.startswith("__MACOSX") \
                                or inner_file.filename.startswith("__MACOSX"):
                            skipped_files.append(inner_file.filename)
                            continue

                        # save inner file from the uploaded zip archive to the new zip with all files
                        new_filename = SearchMedia.get_safe_filename(inner_file.filename, new_zip_archive)
                        new_zip_archive.writestr(new_filename, inner_zip_archive.read(inner_file))

                        # metadata/log files do not count as media files
                        if new_filename != ".metadata.json" and not new_filename.endswith(".log"):
                            saved_files += 1
                continue

            new_filename = SearchMedia.get_safe_filename(file.filename, new_zip_archive)
            with new_zip_archive.open(new_filename, mode='w') as dest_file:
                file.seek(0)
                while True:
                    chunk = file.read(1024)
                    if len(chunk) == 0:
                        break
                    dest_file.write(chunk)

            # metadata/log files do not count as media files
            if new_filename != ".metadata.json" and not new_filename.endswith(".log"):
                saved_files += 1

    # update the number of files in the dataset
    dataset.num_files = saved_files
    dataset.media_type = mime_type
    # TODO: use job's 'details' to save data to share/log?
    if skipped_files:
        # todo: this now doesn't actually get logged because the log is
        # re-initialised after after_create runs?
        dataset.log("The following files in the uploaded zip archive were skipped because they were not recognised "
                    "as media files:\n- " + "\n- ".join(skipped_files))
Step 2: Hook to execute after the dataset for this source has been created
In this case, save the files in a zip archive.
Parameters
- dict query: Sanitised query parameters
- DataSet dataset: Dataset created for this query
- request: Flask request submitted for its creation
def process(self):
    """
    Step 3: Ummmm, we kinda did everything
    """
    # The upload itself happened in after_create; here we only scan image
    # datasets for SVG files and mark the dataset as finished.
    svg_warning = 0
    if self.parameters.get("media_type") == "image":
        results_path = self.dataset.get_results_path()
        for archived_file in self.iterate_archive_contents(results_path):
            if archived_file.suffix != ".svg":
                continue
            if not svg_warning:
                self.dataset.log("SVG files may not be processed correctly by some 4CAT processors.")
            self.dataset.log(f"SVG file detected: {archived_file.name}")
            svg_warning += 1
    self.dataset.update_status(f"Uploaded {self.parameters.get('num_files')} files of type {self.parameters.get('media_type')}{'' if svg_warning == 0 else f' ({svg_warning} SVG files; see log)'}", is_final=True)
    self.dataset.finish(self.parameters.get("num_files"))
Step 3: Ummmm, we kinda did everything
@staticmethod
def get_safe_filename(filename, zip_archive=None):
    """
    Sanitise a file name for storage in the result archive.

    Strips disallowed characters and, if an archive is given, makes the
    name unique within it by appending an incrementing index before the
    extension (so mime type detection keeps working). The original code
    compounded suffixes ("file_1", then "file_1_2") and appended after
    the extension, which broke later mime type guessing.

    :param str filename:  Original file name
    :param zipfile.ZipFile zip_archive:  Archive to deduplicate against
    :return str:  Safe, unique file name
    """
    new_filename = SearchMedia.disallowed_characters.sub("", filename)
    if zip_archive:
        # check if file is already in zip archive
        existing_names = set(zip_archive.namelist())
        stem, dot, suffix = new_filename.rpartition(".")
        index = 1
        while new_filename in existing_names:
            if dot:
                new_filename = f"{stem}_{index}.{suffix}"
            else:
                new_filename = f"{suffix}_{index}"
            index += 1

    return new_filename
Inherited Members
- backend.lib.worker.BasicWorker
- BasicWorker
- INTERRUPT_NONE
- INTERRUPT_RETRY
- INTERRUPT_CANCEL
- queue
- log
- manager
- interrupted
- modules
- init_time
- name
- run
- clean_up
- request_interrupt
- is_4cat_class
- backend.lib.processor.BasicProcessor
- db
- job
- dataset
- owner
- source_dataset
- source_file
- config
- is_running_in_preset
- filepath
- work
- after_process
- remove_files
- abort
- iterate_proxied_requests
- push_proxied_request
- flush_proxied_requests
- iterate_archive_contents
- unpack_archive_contents
- extract_archived_file_by_name
- write_csv_items_and_finish
- write_archive_and_finish
- create_standalone
- save_annotations
- map_item_method_available
- get_mapped_item
- is_filter
- get_status
- is_top_dataset
- is_from_collector
- get_extension
- is_rankable
- exclude_followup_processors
- is_4cat_processor