Edit on GitHub

datasources.media_import.import_media

  1import re
  2import json
  3import time
  4import zipfile
  5import mimetypes
  6from io import BytesIO
  7
  8from backend.lib.processor import BasicProcessor
  9from common.lib.exceptions import QueryParametersException, QueryNeedsExplicitConfirmationException
 10from common.lib.user_input import UserInput
 11from common.lib.helpers import andify
 12
 13class SearchMedia(BasicProcessor):
 14    type = "media-import-search"  # job ID
 15    category = "Search"  # category
 16    title = "Upload Media"  # title displayed in UI
 17    description = "Upload your own audio, video, or image files to be used as a dataset"  # description displayed in UI
 18    extension = "zip"  # extension of result file, used internally and in UI
 19    is_local = False  # Whether this datasource is locally scraped
 20    is_static = False  # Whether this datasource is still updated
 21
 22    max_workers = 1
 23
 24    disallowed_characters = re.compile(r"[^a-zA-Z0-9._+-]")
 25    accepted_file_types = ["audio", "video", "image"]
 26
 27    @classmethod
 28    def get_options(cls, parent_dataset=None, config=None):
 29        # approximate number of files that can be uploaded in a single request rounded to 100
 30        #max_files_approx = int((math.floor(config.get('flask.max_form_parts', 1000) - 50) / 100)) * 100
 31
 32        return {
 33            "intro": {
 34                "type": UserInput.OPTION_INFO,
 35                # Large numbers of files fail possibly due to Flask request size limit (not file size)
 36                "help": "Upload media files to make them be available for further analysis. "
 37                        "Please include only one type of file per dataset (image, audio, or video) and "
 38                        "4CAT will be able to run various processors on these media collections. "
 39                        "\n\nFor collections **larger than a few hundred**, please upload a single "
 40                        "ZIP file. More than ~500 files will fail (and a ZIP file will also load much faster)."
 41            },
 42            "data_upload": {
 43                "type": UserInput.OPTION_FILE,
 44                "multiple": True,
 45                "help": "Files"
 46            },
 47        }
 48
 49    @staticmethod
 50    def validate_query(query, request, config):
 51        """
 52        Step 1: Validate query and files
 53
 54        Confirms that the uploaded files exist and that the media type is valid.
 55
 56        :param dict query:  Query parameters, from client-side.
 57        :param request:  Flask request
 58        :param ConfigManager|None config:  Configuration reader (context-aware)
 59        """
 60        # do we have uploaded files?
 61        bad_files = []
 62        seen_types = set()
 63        all_files = 0
 64
 65        uploaded_files = request.files.getlist("option-data_upload") # if multiple files are uploaded; the first validate_query only has a single file
 66        single_zip_file = uploaded_files and len(uploaded_files) == 1 and uploaded_files[0].filename.lower().endswith(".zip")
 67
 68        # option-data_upload-entries is passed the first time the form is submitted if the user has uploaded a zip file
 69        if "option-data_upload-entries" in request.form or single_zip_file:
 70            # we have a zip file!
 71            try:
 72                if single_zip_file:
 73                    # we have a single uploaded zip file
 74                    # i.e. the query has already been validated (else we would have
 75                    # -entries and no file) and we can get the file info from the
 76                    # zip file itself
 77                    uploaded_files[0].seek(0)
 78                    zip_file_data = BytesIO(uploaded_files[0].read())
 79                    with zipfile.ZipFile(zip_file_data, "r") as uploaded_zip:
 80                        files = [{"filename": f} for f in uploaded_zip.namelist()]
 81                else:
 82                    # validating - get file names from entries field
 83                    files = json.loads(request.form["option-data_upload-entries"])
 84
 85                # ignore known metadata files
 86                files = [f for f in files if not (
 87                        f["filename"].split("/")[-1].startswith(".")
 88                        or f["filename"].endswith(".log")
 89                        or f["filename"].split("/")[-1].startswith("__MACOSX")
 90                        or f["filename"].endswith(".DS_Store")
 91                        or f["filename"].endswith("/")  # sub-directory
 92                )]
 93
 94                # figure out if we have mixed media types
 95                seen_types = set()
 96                for file in files:
 97                    if not query.get("frontend-confirm") and file["filename"].endswith(".svg"):
 98                        raise QueryNeedsExplicitConfirmationException("SVG files may not be processed correctly by some 4CAT processors. Are you sure you want to proceed?")
 99                    try:
100                        file_type = mimetypes.guess_type(file["filename"])[0].split("/")[0]
101                        seen_types.add(file_type)
102                        all_files += 1
103                    except (AttributeError, TypeError):
104                        bad_files.append(file["filename"])
105
106            except (ValueError, zipfile.BadZipfile):
107                raise QueryParametersException("Cannot read zip file - it may be encrypted or corrupted and cannot "
108                                               "be uploaded to 4CAT.")
109
110        elif "option-data_upload" not in request.files:
111            raise QueryParametersException("No files were offered for upload.")
112
113        elif len(uploaded_files) < 1:
114            raise QueryParametersException("No files were offered for upload.")
115
116        else:
117            # we just have a bunch of separate files
118            # Check file types to ensure all are same type of media
119            for file in uploaded_files:
120                # Allow metadata files and log files to be uploaded
121                if file.filename == ".metadata.json" or file.filename.endswith(".log"):
122                    continue
123
124                # Warn on SVG files
125                if not query.get("frontend-confirm") and file.filename.lower().endswith(".svg"):
126                    raise QueryNeedsExplicitConfirmationException(
127                        "SVG files may not be processed correctly by some 4CAT processors. Are you sure you want to proceed?")
128
129                # when uploading multiple files, we don't want zips
130                if file.filename.lower().endswith(".zip"):
131                    raise QueryParametersException("When uploading media in a zip archive, please upload exactly one "
132                                                   "zip file; 4CAT cannot combine multiple separate zip archives.")
133
134                # Guess mime type from filename; we only have partial files at this point
135                mime_type = mimetypes.guess_type(file.filename)[0]
136                if mime_type is None:
137                    bad_files.append(file.filename)
138                    continue
139
140                mime_type = mime_type.split('/')[0]
141                if mime_type not in SearchMedia.accepted_file_types:
142                    raise QueryParametersException(f"This data source only accepts "
143                                                   f"{andify(SearchMedia.accepted_file_types)} files; "
144                                                   f"'{file.filename}' was detected as {mime_type}, which 4CAT cannot "
145                                                   f"process.")
146
147                seen_types.add(mime_type)
148                all_files += 1
149
150        # we need to at least be able to recognise the extension to know we can
151        # do something with the file...
152        if bad_files:
153            separator = "\n- "
154            raise QueryParametersException("The type of the following files cannot be determined; rename them or "
155                                           f"remove them from the archive or rename them\n{separator.join(bad_files)}")
156
157        # this is not fool-proof, but uncommon extensions are less likely to work
158        # anyway and the user can still choose to proceed
159        if len(set(seen_types)) > 1:
160            raise QueryParametersException(
161                f"The zip file contains files of multiple media types ({andify(seen_types)}). 4CAT processors require "
162                "files of a single type to work properly. Please re-upload only a single type of media to proceed."
163            )
164
165        return {
166            "time": time.time(),
167            "media_type": seen_types.pop() if seen_types else None, # first validation check may not have seen any valid files
168            "num_files": all_files,
169        }
170
171    @staticmethod
172    def after_create(query, dataset, request):
173        """
174        Step 2: Hook to execute after the dataset for this source has been created
175
176        In this case, save the files in a zip archive.
177
178        :param dict query:  Sanitised query parameters
179        :param DataSet dataset:  Dataset created for this query
180        :param request:  Flask request submitted for its creation
181        """
182        mime_type = query.get("media_type")
183        saved_files = 0
184        skipped_files = []
185        with zipfile.ZipFile(dataset.get_results_path(), "w", compression=zipfile.ZIP_STORED) as new_zip_archive:
186            for file in request.files.getlist("option-data_upload"):
187                # Check if file is zip archive
188                file_mime_type = mimetypes.guess_type(file.filename)[0]
189                if file_mime_type is not None and file_mime_type.split('/')[0] == "application" and \
190                        file_mime_type.split('/')[1] == "zip":
191                    # Save inner files from zip archive to new zip archive with all files
192                    file.seek(0)
193                    zip_file_data = BytesIO(file.read())
194                    with zipfile.ZipFile(zip_file_data, "r") as inner_zip_archive:
195                        for inner_file in inner_zip_archive.infolist():
196                            if inner_file.is_dir():
197                                continue
198
199                            guessed_file_mime_type = mimetypes.guess_type(inner_file.filename)
200                            if guessed_file_mime_type[0]:
201                                mime_type = guessed_file_mime_type[0].split('/')[0]
202
203                            # skip useless metadata files
204                            # also skip files not recognised as media files
205                            clean_file_name = inner_file.filename.split("/")[-1]
206                            if not guessed_file_mime_type[0] or (
207                                    mime_type not in SearchMedia.accepted_file_types
208                                    and not clean_file_name.endswith(".log")
209                                    and not clean_file_name == ".metadata.json"
210                            ) or clean_file_name.startswith("__MACOSX") \
211                              or inner_file.filename.startswith("__MACOSX"):
212                                print(f"skipping {clean_file_name} ({guessed_file_mime_type})")
213                                skipped_files.append(inner_file.filename)
214                                continue
215
216                            # save inner file from the uploaded zip archive to the new zip with all files
217                            new_filename = SearchMedia.get_safe_filename(inner_file.filename, new_zip_archive)
218                            new_zip_archive.writestr(new_filename, inner_zip_archive.read(inner_file))
219
220                            if not new_filename == ".metadata.json" or not new_filename.endswith(".log"):
221                                saved_files += 1
222                    continue
223
224                new_filename = SearchMedia.get_safe_filename(file.filename, new_zip_archive)
225                with new_zip_archive.open(new_filename, mode='w') as dest_file:
226                    file.seek(0)
227                    while True:
228                        chunk = file.read(1024)
229                        if len(chunk) == 0:
230                            break
231                        dest_file.write(chunk)
232
233                if not new_filename == ".metadata.json" or not new_filename.endswith(".log"):
234                    saved_files += 1
235
236        # update the number of files in the dataset
237        dataset.num_files = saved_files
238        dataset.media_type = mime_type
239        # TODO: use job's 'details' to save data to share/log?
240        if skipped_files:
241            # todo: this now doesn't actually get logged because the log is
242            # re-initialised after after_create runs?
243            dataset.log("The following files in the uploaded zip archive were skipped because they were not recognised"
244                        "as media files:" + "\n  -".join(skipped_files))
245
246    def process(self):
247        """
248        Step 3: Ummmm, we kinda did everything
249        """
250        # Check for SVG files
251        svg_warning = 0
252        if self.parameters.get("media_type") == "image":
253            for file in self.iterate_archive_contents(self.dataset.get_results_path()):
254                if file.suffix == ".svg":
255                    if svg_warning == 0:
256                        self.dataset.log("SVG files may not be processed correctly by some 4CAT processors.")
257                    self.dataset.log(f"SVG file detected: {file.name}")
258                    svg_warning += 1
259        self.dataset.update_status(f"Uploaded {self.parameters.get('num_files')} files of type {self.parameters.get('media_type')}{'' if svg_warning == 0 else f' ({svg_warning} SVG files; see log)'}", is_final=True)
260        self.dataset.finish(self.parameters.get("num_files"))
261
262    @staticmethod
263    def get_safe_filename(filename, zip_archive=None):
264        new_filename = SearchMedia.disallowed_characters.sub("", filename)
265        if zip_archive:
266            # check if file is in zip archive
267            index = 1
268            while new_filename in zip_archive.namelist():
269                new_filename = new_filename + "_" + str(index)
270                index += 1
271
272        return new_filename
class SearchMedia(BasicProcessor):
    """
    Upload media files (audio, video, or images) as a 4CAT dataset.

    Accepts either a batch of individual media files or a single zip archive
    containing them; the resulting dataset is always a zip archive of the
    uploaded media.
    """
    type = "media-import-search"  # job ID
    category = "Search"  # category
    title = "Upload Media"  # title displayed in UI
    description = "Upload your own audio, video, or image files to be used as a dataset"  # description displayed in UI
    extension = "zip"  # extension of result file, used internally and in UI
    is_local = False  # Whether this datasource is locally scraped
    is_static = False  # Whether this datasource is still updated

    max_workers = 1

    # characters stripped from filenames before they are stored in the archive
    disallowed_characters = re.compile(r"[^a-zA-Z0-9._+-]")
    # top-level mime types this datasource accepts
    accepted_file_types = ["audio", "video", "image"]

    @classmethod
    def get_options(cls, parent_dataset=None, config=None):
        """
        Get form options for this datasource.

        :param DataSet parent_dataset:  Unused for this datasource
        :param config:  Configuration reader (context-aware)
        :return dict:  Form option definitions, keyed by option name
        """
        # approximate number of files that can be uploaded in a single request rounded to 100
        #max_files_approx = int((math.floor(config.get('flask.max_form_parts', 1000) - 50) / 100)) * 100

        return {
            "intro": {
                "type": UserInput.OPTION_INFO,
                # Large numbers of files fail possibly due to Flask request size limit (not file size)
                "help": "Upload media files to make them be available for further analysis. "
                        "Please include only one type of file per dataset (image, audio, or video) and "
                        "4CAT will be able to run various processors on these media collections. "
                        "\n\nFor collections **larger than a few hundred**, please upload a single "
                        "ZIP file. More than ~500 files will fail (and a ZIP file will also load much faster)."
            },
            "data_upload": {
                "type": UserInput.OPTION_FILE,
                "multiple": True,
                "help": "Files"
            },
        }

    @staticmethod
    def validate_query(query, request, config):
        """
        Step 1: Validate query and files

        Confirms that the uploaded files exist and that the media type is valid.

        :param dict query:  Query parameters, from client-side.
        :param request:  Flask request
        :param ConfigManager|None config:  Configuration reader (context-aware)
        :return dict:  Sanitised parameters (timestamp, detected media type, file count)
        :raises QueryParametersException:  If no files were uploaded, a zip cannot
            be read, or files are of mixed/unrecognisable types
        :raises QueryNeedsExplicitConfirmationException:  If SVG files are present
            and the user has not yet confirmed
        """
        # do we have uploaded files?
        bad_files = []
        seen_types = set()
        all_files = 0

        uploaded_files = request.files.getlist("option-data_upload")  # if multiple files are uploaded; the first validate_query only has a single file
        single_zip_file = uploaded_files and len(uploaded_files) == 1 and uploaded_files[0].filename.lower().endswith(".zip")

        # option-data_upload-entries is passed the first time the form is submitted if the user has uploaded a zip file
        if "option-data_upload-entries" in request.form or single_zip_file:
            # we have a zip file!
            try:
                if single_zip_file:
                    # we have a single uploaded zip file
                    # i.e. the query has already been validated (else we would have
                    # -entries and no file) and we can get the file info from the
                    # zip file itself
                    uploaded_files[0].seek(0)
                    zip_file_data = BytesIO(uploaded_files[0].read())
                    with zipfile.ZipFile(zip_file_data, "r") as uploaded_zip:
                        files = [{"filename": f} for f in uploaded_zip.namelist()]
                else:
                    # validating - get file names from entries field
                    files = json.loads(request.form["option-data_upload-entries"])

                # ignore known metadata files
                files = [f for f in files if not (
                        f["filename"].split("/")[-1].startswith(".")
                        or f["filename"].endswith(".log")
                        or f["filename"].split("/")[-1].startswith("__MACOSX")
                        or f["filename"].endswith(".DS_Store")
                        or f["filename"].endswith("/")  # sub-directory
                )]

                # figure out if we have mixed media types
                for file in files:
                    # lower-cased so e.g. ".SVG" is caught too, consistent with
                    # the loose-file branch below
                    if not query.get("frontend-confirm") and file["filename"].lower().endswith(".svg"):
                        raise QueryNeedsExplicitConfirmationException("SVG files may not be processed correctly by some 4CAT processors. Are you sure you want to proceed?")
                    try:
                        file_type = mimetypes.guess_type(file["filename"])[0].split("/")[0]
                        seen_types.add(file_type)
                        all_files += 1
                    except (AttributeError, TypeError):
                        # guess_type() returned None - extension not recognised
                        bad_files.append(file["filename"])

            except (ValueError, zipfile.BadZipfile):
                raise QueryParametersException("Cannot read zip file - it may be encrypted or corrupted and cannot "
                                               "be uploaded to 4CAT.")

        elif "option-data_upload" not in request.files:
            raise QueryParametersException("No files were offered for upload.")

        elif len(uploaded_files) < 1:
            raise QueryParametersException("No files were offered for upload.")

        else:
            # we just have a bunch of separate files
            # Check file types to ensure all are same type of media
            for file in uploaded_files:
                # Allow metadata files and log files to be uploaded
                if file.filename == ".metadata.json" or file.filename.endswith(".log"):
                    continue

                # Warn on SVG files
                if not query.get("frontend-confirm") and file.filename.lower().endswith(".svg"):
                    raise QueryNeedsExplicitConfirmationException(
                        "SVG files may not be processed correctly by some 4CAT processors. Are you sure you want to proceed?")

                # when uploading multiple files, we don't want zips
                if file.filename.lower().endswith(".zip"):
                    raise QueryParametersException("When uploading media in a zip archive, please upload exactly one "
                                                   "zip file; 4CAT cannot combine multiple separate zip archives.")

                # Guess mime type from filename; we only have partial files at this point
                mime_type = mimetypes.guess_type(file.filename)[0]
                if mime_type is None:
                    bad_files.append(file.filename)
                    continue

                mime_type = mime_type.split('/')[0]
                if mime_type not in SearchMedia.accepted_file_types:
                    raise QueryParametersException(f"This data source only accepts "
                                                   f"{andify(SearchMedia.accepted_file_types)} files; "
                                                   f"'{file.filename}' was detected as {mime_type}, which 4CAT cannot "
                                                   f"process.")

                seen_types.add(mime_type)
                all_files += 1

        # we need to at least be able to recognise the extension to know we can
        # do something with the file...
        if bad_files:
            separator = "\n- "
            raise QueryParametersException("The type of the following files cannot be determined; rename or remove "
                                           f"them:{separator}{separator.join(bad_files)}")

        # this is not fool-proof, but uncommon extensions are less likely to work
        # anyway and the user can still choose to proceed
        if len(seen_types) > 1:
            raise QueryParametersException(
                f"The uploaded files contain multiple media types ({andify(seen_types)}). 4CAT processors require "
                "files of a single type to work properly. Please re-upload only a single type of media to proceed."
            )

        return {
            "time": time.time(),
            "media_type": seen_types.pop() if seen_types else None,  # first validation check may not have seen any valid files
            "num_files": all_files,
        }

    @staticmethod
    def after_create(query, dataset, request):
        """
        Step 2: Hook to execute after the dataset for this source has been created

        In this case, save the files in a zip archive.

        :param dict query:  Sanitised query parameters
        :param DataSet dataset:  Dataset created for this query
        :param request:  Flask request submitted for its creation
        """
        mime_type = query.get("media_type")
        saved_files = 0
        skipped_files = []
        with zipfile.ZipFile(dataset.get_results_path(), "w", compression=zipfile.ZIP_STORED) as new_zip_archive:
            for file in request.files.getlist("option-data_upload"):
                # Check if file is zip archive
                file_mime_type = mimetypes.guess_type(file.filename)[0]
                if file_mime_type == "application/zip":
                    # Save inner files from zip archive to new zip archive with all files
                    file.seek(0)
                    zip_file_data = BytesIO(file.read())
                    with zipfile.ZipFile(zip_file_data, "r") as inner_zip_archive:
                        for inner_file in inner_zip_archive.infolist():
                            if inner_file.is_dir():
                                continue

                            guessed_file_mime_type = mimetypes.guess_type(inner_file.filename)
                            if guessed_file_mime_type[0]:
                                mime_type = guessed_file_mime_type[0].split('/')[0]

                            # skip useless metadata files
                            # also skip files not recognised as media files
                            clean_file_name = inner_file.filename.split("/")[-1]
                            if not guessed_file_mime_type[0] or (
                                    mime_type not in SearchMedia.accepted_file_types
                                    and not clean_file_name.endswith(".log")
                                    and not clean_file_name == ".metadata.json"
                            ) or clean_file_name.startswith("__MACOSX") \
                              or inner_file.filename.startswith("__MACOSX"):
                                skipped_files.append(inner_file.filename)
                                continue

                            # save inner file from the uploaded zip archive to the new zip with all files
                            new_filename = SearchMedia.get_safe_filename(inner_file.filename, new_zip_archive)
                            new_zip_archive.writestr(new_filename, inner_zip_archive.read(inner_file))

                            # only actual media files count towards the dataset size
                            if new_filename != ".metadata.json" and not new_filename.endswith(".log"):
                                saved_files += 1
                    continue

                new_filename = SearchMedia.get_safe_filename(file.filename, new_zip_archive)
                with new_zip_archive.open(new_filename, mode='w') as dest_file:
                    file.seek(0)
                    # copy in chunks so large uploads need not fit in memory
                    while chunk := file.read(1024):
                        dest_file.write(chunk)

                # only actual media files count towards the dataset size
                if new_filename != ".metadata.json" and not new_filename.endswith(".log"):
                    saved_files += 1

        # update the number of files in the dataset
        dataset.num_files = saved_files
        dataset.media_type = mime_type
        # TODO: use job's 'details' to save data to share/log?
        if skipped_files:
            # todo: this now doesn't actually get logged because the log is
            # re-initialised after after_create runs?
            dataset.log("The following files in the uploaded zip archive were skipped because they were not recognised "
                        "as media files:\n  - " + "\n  - ".join(skipped_files))

    def process(self):
        """
        Step 3: Ummmm, we kinda did everything

        The files were already stored in the result archive by after_create();
        this only logs SVG warnings and marks the dataset as finished.
        """
        # Check for SVG files
        svg_warning = 0
        if self.parameters.get("media_type") == "image":
            for file in self.iterate_archive_contents(self.dataset.get_results_path()):
                # lower-cased so e.g. ".SVG" is caught too
                if file.suffix.lower() == ".svg":
                    if svg_warning == 0:
                        self.dataset.log("SVG files may not be processed correctly by some 4CAT processors.")
                    self.dataset.log(f"SVG file detected: {file.name}")
                    svg_warning += 1
        self.dataset.update_status(f"Uploaded {self.parameters.get('num_files')} files of type {self.parameters.get('media_type')}{'' if svg_warning == 0 else f' ({svg_warning} SVG files; see log)'}", is_final=True)
        self.dataset.finish(self.parameters.get("num_files"))

    @staticmethod
    def get_safe_filename(filename, zip_archive=None):
        """
        Sanitise a filename and make it unique within a zip archive.

        Strips disallowed characters and, when the name collides with an
        existing archive member, inserts a numeric counter *before* the file
        extension (e.g. "photo.jpg" -> "photo_1.jpg") so the extension - and
        therefore later mime type detection - is preserved.

        :param str filename:  Original filename
        :param zipfile.ZipFile zip_archive:  Archive to deduplicate against, if any
        :return str:  Safe, unique filename
        """
        new_filename = SearchMedia.disallowed_characters.sub("", filename)
        if zip_archive:
            # check if file is in zip archive; deduplicate with a counter
            existing = set(zip_archive.namelist())
            stem, dot, extension = new_filename.rpartition(".")
            candidate = new_filename
            index = 1
            while candidate in existing:
                candidate = f"{stem}_{index}{dot}{extension}" if dot else f"{new_filename}_{index}"
                index += 1
            new_filename = candidate

        return new_filename

Abstract processor class

A processor takes a finished dataset as input and processes its result in some way, with another dataset set as output. The input thus is a file, and the output (usually) as well. In other words, the result of a processor can be used as input for another processor (though whether and when this is useful is another question).

To determine whether a processor can process a given dataset, you can define an `is_compatible_with(cls, module=None, config=None) -> bool` class method, which takes the module (e.g. a dataset) as an argument and returns a bool indicating whether this processor is considered compatible with it. For example:

@classmethod
def is_compatible_with(cls, module=None, config=None):
    return module.type == "linguistic-features"
type = 'media-import-search'
category = 'Search'
title = 'Upload Media'
description = 'Upload your own audio, video, or image files to be used as a dataset'
extension = 'zip'
is_local = False
is_static = False
max_workers = 1
disallowed_characters = re.compile('[^a-zA-Z0-9._+-]')
accepted_file_types = ['audio', 'video', 'image']
@classmethod
def get_options(cls, parent_dataset=None, config=None):
28    @classmethod
29    def get_options(cls, parent_dataset=None, config=None):
30        # approximate number of files that can be uploaded in a single request rounded to 100
31        #max_files_approx = int((math.floor(config.get('flask.max_form_parts', 1000) - 50) / 100)) * 100
32
33        return {
34            "intro": {
35                "type": UserInput.OPTION_INFO,
36                # Large numbers of files fail possibly due to Flask request size limit (not file size)
37                "help": "Upload media files to make them be available for further analysis. "
38                        "Please include only one type of file per dataset (image, audio, or video) and "
39                        "4CAT will be able to run various processors on these media collections. "
40                        "\n\nFor collections **larger than a few hundred**, please upload a single "
41                        "ZIP file. More than ~500 files will fail (and a ZIP file will also load much faster)."
42            },
43            "data_upload": {
44                "type": UserInput.OPTION_FILE,
45                "multiple": True,
46                "help": "Files"
47            },
48        }

Get processor options

This method by default returns the class's "options" attribute, or an empty dictionary. It can be redefined by processors that need more fine-grained options, e.g. in cases where the availability of options is partially determined by the parent dataset's parameters.

Parameters
  • config: Configuration reader (context-aware)
  • DataSet parent_dataset: An object representing the dataset that the processor would be run on
@staticmethod
def validate_query(query, request, config):
    """
    Step 1: Validate query and files

    Confirms that the uploaded files exist and that the media type is valid.

    :param dict query:  Query parameters, from client-side.
    :param request:  Flask request
    :param ConfigManager|None config:  Configuration reader (context-aware)
    :return dict:  Sanitised query parameters (timestamp, media type, file count)
    :raises QueryParametersException:  if no files were offered, a file's type
        cannot be determined, media types are mixed, or a zip cannot be read
    :raises QueryNeedsExplicitConfirmationException:  if SVG files are present
        and the user has not yet confirmed
    """
    # do we have uploaded files?
    bad_files = []
    seen_types = set()
    all_files = 0

    # if multiple files are uploaded, the first validate_query only has a single file
    uploaded_files = request.files.getlist("option-data_upload")
    single_zip_file = uploaded_files and len(uploaded_files) == 1 and uploaded_files[0].filename.lower().endswith(".zip")

    # option-data_upload-entries is passed the first time the form is submitted if the user has uploaded a zip file
    if "option-data_upload-entries" in request.form or single_zip_file:
        # we have a zip file!
        try:
            if single_zip_file:
                # we have a single uploaded zip file
                # i.e. the query has already been validated (else we would have
                # -entries and no file) and we can get the file info from the
                # zip file itself
                uploaded_files[0].seek(0)
                zip_file_data = BytesIO(uploaded_files[0].read())
                with zipfile.ZipFile(zip_file_data, "r") as uploaded_zip:
                    files = [{"filename": f} for f in uploaded_zip.namelist()]
            else:
                # validating - get file names from entries field
                files = json.loads(request.form["option-data_upload-entries"])

            # ignore known metadata files and directory entries
            files = [f for f in files if not (
                    f["filename"].split("/")[-1].startswith(".")
                    or f["filename"].endswith(".log")
                    or f["filename"].split("/")[-1].startswith("__MACOSX")
                    or f["filename"].endswith(".DS_Store")
                    or f["filename"].endswith("/")  # sub-directory
            )]

            # figure out if we have mixed media types
            for file in files:
                if not query.get("frontend-confirm") and file["filename"].endswith(".svg"):
                    raise QueryNeedsExplicitConfirmationException("SVG files may not be processed correctly by some 4CAT processors. Are you sure you want to proceed?")
                try:
                    file_type = mimetypes.guess_type(file["filename"])[0].split("/")[0]
                    seen_types.add(file_type)
                    all_files += 1
                except (AttributeError, TypeError):
                    # guess_type() returned (None, ...) - extension unknown
                    bad_files.append(file["filename"])

        except (ValueError, zipfile.BadZipfile):
            raise QueryParametersException("Cannot read zip file - it may be encrypted or corrupted and cannot "
                                           "be uploaded to 4CAT.")

    elif "option-data_upload" not in request.files:
        raise QueryParametersException("No files were offered for upload.")

    elif len(uploaded_files) < 1:
        raise QueryParametersException("No files were offered for upload.")

    else:
        # we just have a bunch of separate files
        # Check file types to ensure all are same type of media
        for file in uploaded_files:
            # Allow metadata files and log files to be uploaded
            if file.filename == ".metadata.json" or file.filename.endswith(".log"):
                continue

            # Warn on SVG files
            if not query.get("frontend-confirm") and file.filename.lower().endswith(".svg"):
                raise QueryNeedsExplicitConfirmationException(
                    "SVG files may not be processed correctly by some 4CAT processors. Are you sure you want to proceed?")

            # when uploading multiple files, we don't want zips
            if file.filename.lower().endswith(".zip"):
                raise QueryParametersException("When uploading media in a zip archive, please upload exactly one "
                                               "zip file; 4CAT cannot combine multiple separate zip archives.")

            # Guess mime type from filename; we only have partial files at this point
            mime_type = mimetypes.guess_type(file.filename)[0]
            if mime_type is None:
                bad_files.append(file.filename)
                continue

            mime_type = mime_type.split('/')[0]
            if mime_type not in SearchMedia.accepted_file_types:
                raise QueryParametersException(f"This data source only accepts "
                                               f"{andify(SearchMedia.accepted_file_types)} files; "
                                               f"'{file.filename}' was detected as {mime_type}, which 4CAT cannot "
                                               f"process.")

            seen_types.add(mime_type)
            all_files += 1

    # we need to at least be able to recognise the extension to know we can
    # do something with the file...
    if bad_files:
        separator = "\n- "
        raise QueryParametersException("The type of the following files cannot be determined; rename them or "
                                       f"remove them from the archive:{separator}{separator.join(bad_files)}")

    # this is not fool-proof, but uncommon extensions are less likely to work
    # anyway and the user can still choose to proceed
    if len(seen_types) > 1:
        raise QueryParametersException(
            f"The zip file contains files of multiple media types ({andify(seen_types)}). 4CAT processors require "
            "files of a single type to work properly. Please re-upload only a single type of media to proceed."
        )

    return {
        "time": time.time(),
        "media_type": seen_types.pop() if seen_types else None,  # first validation check may not have seen any valid files
        "num_files": all_files,
    }

Step 1: Validate query and files

Confirms that the uploaded files exist and that the media type is valid.

Parameters
  • dict query: Query parameters, from client-side.
  • request: Flask request
  • ConfigManager|None config: Configuration reader (context-aware)
@staticmethod
def after_create(query, dataset, request):
    """
    Step 2: Hook to execute after the dataset for this source has been created

    In this case, save the files in a zip archive. If a zip archive was
    uploaded, its media files are re-packed into the result archive; metadata
    and log files are kept but not counted towards the number of media files.

    :param dict query:  Sanitised query parameters
    :param DataSet dataset:  Dataset created for this query
    :param request:  Flask request submitted for its creation
    """
    mime_type = query.get("media_type")
    saved_files = 0
    skipped_files = []
    with zipfile.ZipFile(dataset.get_results_path(), "w", compression=zipfile.ZIP_STORED) as new_zip_archive:
        for file in request.files.getlist("option-data_upload"):
            # Check if file is zip archive
            file_mime_type = mimetypes.guess_type(file.filename)[0]
            if file_mime_type is not None and file_mime_type.split('/')[0] == "application" and \
                    file_mime_type.split('/')[1] == "zip":
                # Save inner files from zip archive to new zip archive with all files
                file.seek(0)
                zip_file_data = BytesIO(file.read())
                with zipfile.ZipFile(zip_file_data, "r") as inner_zip_archive:
                    for inner_file in inner_zip_archive.infolist():
                        if inner_file.is_dir():
                            continue

                        guessed_file_mime_type = mimetypes.guess_type(inner_file.filename)
                        if guessed_file_mime_type[0]:
                            mime_type = guessed_file_mime_type[0].split('/')[0]

                        # skip useless metadata files
                        # also skip files not recognised as media files
                        clean_file_name = inner_file.filename.split("/")[-1]
                        if not guessed_file_mime_type[0] or (
                                mime_type not in SearchMedia.accepted_file_types
                                and not clean_file_name.endswith(".log")
                                and not clean_file_name == ".metadata.json"
                        ) or clean_file_name.startswith("__MACOSX") \
                          or inner_file.filename.startswith("__MACOSX"):
                            skipped_files.append(inner_file.filename)
                            continue

                        # save inner file from the uploaded zip archive to the new zip with all files
                        new_filename = SearchMedia.get_safe_filename(inner_file.filename, new_zip_archive)
                        new_zip_archive.writestr(new_filename, inner_zip_archive.read(inner_file))

                        # metadata and log files are stored but do not count as media files
                        # (original used `or` here, which was always True and over-counted)
                        if new_filename != ".metadata.json" and not new_filename.endswith(".log"):
                            saved_files += 1
                continue

            # a plain (non-zip) uploaded file: copy it into the archive in chunks
            new_filename = SearchMedia.get_safe_filename(file.filename, new_zip_archive)
            with new_zip_archive.open(new_filename, mode='w') as dest_file:
                file.seek(0)
                while True:
                    chunk = file.read(1024)
                    if len(chunk) == 0:
                        break
                    dest_file.write(chunk)

            # metadata and log files are stored but do not count as media files
            if new_filename != ".metadata.json" and not new_filename.endswith(".log"):
                saved_files += 1

    # update the number of files in the dataset
    dataset.num_files = saved_files
    dataset.media_type = mime_type
    # TODO: use job's 'details' to save data to share/log?
    if skipped_files:
        # todo: this now doesn't actually get logged because the log is
        # re-initialised after after_create runs?
        dataset.log("The following files in the uploaded zip archive were skipped because they were not recognised "
                    "as media files:\n  - " + "\n  - ".join(skipped_files))

Step 2: Hook to execute after the dataset for this source has been created

In this case, save the files in a zip archive.

Parameters
  • dict query: Sanitised query parameters
  • DataSet dataset: Dataset created for this query
  • request: Flask request submitted for its creation
def process(self):
    """
    Step 3: Ummmm, we kinda did everything

    The archive was already written in after_create; here we only scan image
    datasets for SVG files (which some processors cannot handle), log them,
    and finalise the dataset.
    """
    # Check for SVG files
    svg_count = 0
    if self.parameters.get("media_type") == "image":
        for item in self.iterate_archive_contents(self.dataset.get_results_path()):
            if item.suffix != ".svg":
                continue
            if not svg_count:
                # warn once, then list each offending file
                self.dataset.log("SVG files may not be processed correctly by some 4CAT processors.")
            self.dataset.log(f"SVG file detected: {item.name}")
            svg_count += 1

    svg_note = f" ({svg_count} SVG files; see log)" if svg_count else ""
    status = f"Uploaded {self.parameters.get('num_files')} files of type {self.parameters.get('media_type')}{svg_note}"
    self.dataset.update_status(status, is_final=True)
    self.dataset.finish(self.parameters.get("num_files"))

Step 3: Ummmm, we kinda did everything

@staticmethod
def get_safe_filename(filename, zip_archive=None):
    """
    Sanitise a filename and make it unique within a zip archive

    Strips disallowed characters from the filename. If a zip archive is
    given and the name already exists in it, a counter is inserted before
    the file extension (``photo.jpg`` -> ``photo_1.jpg``) so the extension
    - which 4CAT uses for mime-type detection - is preserved. (The previous
    implementation appended the counter after the extension, producing
    names like ``photo.jpg_1``.)

    :param str filename:  Original filename
    :param zipfile.ZipFile zip_archive:  Archive to check for collisions, if any
    :return str:  Safe, unique filename
    """
    new_filename = SearchMedia.disallowed_characters.sub("", filename)
    if zip_archive:
        # check if file is already in the zip archive; hoist namelist() into
        # a set so each collision check is O(1)
        existing_names = set(zip_archive.namelist())
        stem, dot, extension = new_filename.rpartition(".")
        candidate = new_filename
        index = 1
        while candidate in existing_names:
            if dot:
                candidate = f"{stem}_{index}.{extension}"
            else:
                candidate = f"{new_filename}_{index}"
            index += 1
        new_filename = candidate

    return new_filename