Edit on GitHub

datasources.media_import.import_media

  1import re
  2import math
  3import json
  4import time
  5import zipfile
  6import mimetypes
  7from io import BytesIO
  8
  9from backend.lib.processor import BasicProcessor
 10from common.config_manager import config
 11from common.lib.exceptions import QueryParametersException, QueryNeedsExplicitConfirmationException
 12from common.lib.user_input import UserInput
 13from common.lib.helpers import andify
 14
 15# approximate number of files that can be uploaded in a single request rounded to 100
 16# todo: bring this back into get_options after merging #455
 17# max_files_approx = int((math.floor(config.get('flask.max_form_parts', 1000)-50)/100)) * 100
 18
 19class SearchMedia(BasicProcessor):
 20    type = "media-import-search"  # job ID
 21    category = "Search"  # category
 22    title = "Upload Media"  # title displayed in UI
 23    description = "Upload your own audio, video, or image files to be used as a dataset"  # description displayed in UI
 24    extension = "zip"  # extension of result file, used internally and in UI
 25    is_local = False  # Whether this datasource is locally scraped
 26    is_static = False  # Whether this datasource is still updated
 27
 28    max_workers = 1
 29
 30    disallowed_characters = re.compile(r"[^a-zA-Z0-9._+-]")
 31    accepted_file_types = ["audio", "video", "image"]
 32
 33    @classmethod
 34    def get_options(cls, parent_dataset=None, user=None):
 35        return {
 36            "intro": {
 37                "type": UserInput.OPTION_INFO,
 38                # Large numbers of files fail possibly due to Flask request size limit (not file size)
 39                "help": "Upload media files to make them be available for further analysis. "
 40                        "Please include only one type of file per dataset (image, audio, or video) and "
 41                        "4CAT will be able to run various processors on these media collections. "
 42                        f"\n\nFor collections **larger than a few hundred**, please upload a single "
 43                        f"ZIP file. More than ~500 files will fail (and a ZIP file will also load much faster)."
 44            },
 45            "data_upload": {
 46                "type": UserInput.OPTION_FILE,
 47                "multiple": True,
 48                "help": "Files"
 49            },
 50        }
 51
 52    @staticmethod
 53    def validate_query(query, request, user):
 54        """
 55        Step 1: Validate query and files
 56
 57        Confirms that the uploaded files exist and that the media type is valid.
 58
 59        :param dict query:  Query parameters, from client-side.
 60        :param request:  Flask request
 61        :param User user:  User object of user who has submitted the query
 62        :return dict:  Safe query parameters
 63        """
 64        # do we have uploaded files?
 65        bad_files = []
 66        seen_types = set()
 67        all_files = 0
 68
 69        uploaded_files = request.files.getlist("option-data_upload") # if multiple files are uploaded; the first validate_query only has a single file
 70        single_zip_file = uploaded_files and len(uploaded_files) == 1 and uploaded_files[0].filename.lower().endswith(".zip")
 71
 72        # option-data_upload-entries is passed the first time the form is submitted if the user has uploaded a zip file
 73        if "option-data_upload-entries" in request.form or single_zip_file:
 74            # we have a zip file!
 75            try:
 76                if single_zip_file:
 77                    # we have a single uploaded zip file
 78                    # i.e. the query has already been validated (else we would have
 79                    # -entries and no file) and we can get the file info from the
 80                    # zip file itself
 81                    uploaded_files[0].seek(0)
 82                    zip_file_data = BytesIO(uploaded_files[0].read())
 83                    with zipfile.ZipFile(zip_file_data, "r") as uploaded_zip:
 84                        files = [{"filename": f} for f in uploaded_zip.namelist()]
 85                else:
 86                    # validating - get file names from entries field
 87                    files = json.loads(request.form["option-data_upload-entries"])
 88
 89                # ignore known metadata files
 90                files = [f for f in files if not (
 91                        f["filename"].split("/")[-1].startswith(".")
 92                        or f["filename"].endswith(".log")
 93                        or f["filename"].split("/")[-1].startswith("__MACOSX")
 94                        or f["filename"].endswith(".DS_Store")
 95                        or f["filename"].endswith("/")  # sub-directory
 96                )]
 97
 98                # figure out if we have mixed media types
 99                seen_types = set()
100                for file in files:
101                    if not query.get("frontend-confirm") and file["filename"].endswith(".svg"):
102                        raise QueryNeedsExplicitConfirmationException("SVG files may not be processed correctly by some 4CAT processors. Are you sure you want to proceed?")
103                    try:
104                        file_type = mimetypes.guess_type(file["filename"])[0].split("/")[0]
105                        seen_types.add(file_type)
106                        all_files += 1
107                    except (AttributeError, TypeError):
108                        bad_files.append(file["filename"])
109
110            except (ValueError, zipfile.BadZipfile) as e:
111                raise QueryParametersException("Cannot read zip file - it may be encrypted or corrupted and cannot "
112                                               "be uploaded to 4CAT.")
113
114        elif "option-data_upload" not in request.files:
115            raise QueryParametersException("No files were offered for upload.")
116
117        elif len(uploaded_files) < 1:
118            raise QueryParametersException("No files were offered for upload.")
119
120        else:
121            # we just have a bunch of separate files
122            # Check file types to ensure all are same type of media
123            for file in uploaded_files:
124                # Allow metadata files and log files to be uploaded
125                if file.filename == ".metadata.json" or file.filename.endswith(".log"):
126                    continue
127
128                # Warn on SVG files
129                if not query.get("frontend-confirm") and file.filename.lower().endswith(".svg"):
130                    raise QueryNeedsExplicitConfirmationException(
131                        "SVG files may not be processed correctly by some 4CAT processors. Are you sure you want to proceed?")
132
133                # when uploading multiple files, we don't want zips
134                if file.filename.lower().endswith(".zip"):
135                    raise QueryParametersException("When uploading media in a zip archive, please upload exactly one "
136                                                   "zip file; 4CAT cannot combine multiple separate zip archives.")
137
138                # Guess mime type from filename; we only have partial files at this point
139                mime_type = mimetypes.guess_type(file.filename)[0]
140                if mime_type is None:
141                    bad_files.append(file.filename)
142                    continue
143
144                mime_type = mime_type.split('/')[0]
145                if mime_type not in SearchMedia.accepted_file_types:
146                    raise QueryParametersException(f"This data source only accepts "
147                                                   f"{andify(SearchMedia.accepted_file_types)} files; "
148                                                   f"'{file.filename}' was detected as {mime_type}, which 4CAT cannot "
149                                                   f"process.")
150
151                seen_types.add(mime_type)
152                all_files += 1
153
154        # we need to at least be able to recognise the extension to know we can
155        # do something with the file...
156        if bad_files:
157            separator = "\n- "
158            raise QueryParametersException("The type of the following files cannot be determined; rename them or "
159                                           f"remove them from the archive or rename them\n{separator.join(bad_files)}")
160
161        # this is not fool-proof, but uncommon extensions are less likely to work
162        # anyway and the user can still choose to proceed
163        if len(set(seen_types)) > 1:
164            raise QueryParametersException(
165                f"The zip file contains files of multiple media types ({andify(seen_types)}). 4CAT processors require "
166                "files of a single type to work properly. Please re-upload only a single type of media to proceed."
167            )
168
169        return {
170            "time": time.time(),
171            "media_type": seen_types.pop() if seen_types else None, # first validation check may not have seen any valid files
172            "num_files": all_files,
173        }
174
175    @staticmethod
176    def after_create(query, dataset, request):
177        """
178        Step 2: Hook to execute after the dataset for this source has been created
179
180        In this case, save the files in a zip archive.
181
182        :param dict query:  Sanitised query parameters
183        :param DataSet dataset:  Dataset created for this query
184        :param request:  Flask request submitted for its creation
185        """
186        mime_type = query.get("media_type")
187        saved_files = 0
188        skipped_files = []
189        with zipfile.ZipFile(dataset.get_results_path(), "w", compression=zipfile.ZIP_STORED) as new_zip_archive:
190            for file in request.files.getlist("option-data_upload"):
191                # Check if file is zip archive
192                file_mime_type = mimetypes.guess_type(file.filename)[0]
193                if file_mime_type is not None and file_mime_type.split('/')[0] == "application" and \
194                        file_mime_type.split('/')[1] == "zip":
195                    # Save inner files from zip archive to new zip archive with all files
196                    file.seek(0)
197                    zip_file_data = BytesIO(file.read())
198                    with zipfile.ZipFile(zip_file_data, "r") as inner_zip_archive:
199                        for inner_file in inner_zip_archive.infolist():
200                            if inner_file.is_dir():
201                                continue
202
203                            guessed_file_mime_type = mimetypes.guess_type(inner_file.filename)
204                            if guessed_file_mime_type[0]:
205                                mime_type = guessed_file_mime_type[0].split('/')[0]
206
207                            # skip useless metadata files
208                            # also skip files not recognised as media files
209                            clean_file_name = inner_file.filename.split("/")[-1]
210                            if not guessed_file_mime_type[0] or (
211                                    mime_type not in SearchMedia.accepted_file_types
212                                    and not clean_file_name.endswith(".log")
213                                    and not clean_file_name == ".metadata.json"
214                            ) or clean_file_name.startswith("__MACOSX") \
215                              or inner_file.filename.startswith("__MACOSX"):
216                                print(f"skipping {clean_file_name} ({guessed_file_mime_type})")
217                                skipped_files.append(inner_file.filename)
218                                continue
219
220                            # save inner file from the uploaded zip archive to the new zip with all files
221                            new_filename = SearchMedia.get_safe_filename(inner_file.filename, new_zip_archive)
222                            new_zip_archive.writestr(new_filename, inner_zip_archive.read(inner_file))
223
224                            if not new_filename == ".metadata.json" or not new_filename.endswith(".log"):
225                                saved_files += 1
226                    continue
227
228                new_filename = SearchMedia.get_safe_filename(file.filename, new_zip_archive)
229                with new_zip_archive.open(new_filename, mode='w') as dest_file:
230                    file.seek(0)
231                    while True:
232                        chunk = file.read(1024)
233                        if len(chunk) == 0:
234                            break
235                        dest_file.write(chunk)
236
237                if not new_filename == ".metadata.json" or not new_filename.endswith(".log"):
238                    saved_files += 1
239
240        # update the number of files in the dataset
241        dataset.num_files = saved_files
242        dataset.media_type = mime_type
243        # TODO: use job's 'details' to save data to share/log?
244        if skipped_files:
245            # todo: this now doesn't actually get logged because the log is
246            # re-initialised after after_create runs?
247            dataset.log("The following files in the uploaded zip archive were skipped because they were not recognised"
248                        "as media files:" + "\n  -".join(skipped_files))
249
250    def process(self):
251        """
252        Step 3: Ummmm, we kinda did everything
253        """
254        # Check for SVG files
255        svg_warning = 0
256        if self.parameters.get("media_type") == "image":
257            for file in self.iterate_archive_contents(self.dataset.get_results_path()):
258                if file.suffix == ".svg":
259                    if svg_warning == 0:
260                        self.dataset.log("SVG files may not be processed correctly by some 4CAT processors.")
261                    self.dataset.log(f"SVG file detected: {file.name}")
262                    svg_warning += 1
263        self.dataset.update_status(f"Uploaded {self.parameters.get('num_files')} files of type {self.parameters.get('media_type')}{'' if svg_warning == 0 else f' ({svg_warning} SVG files; see log)'}", is_final=True)
264        self.dataset.finish(self.parameters.get("num_files"))
265
266    @staticmethod
267    def get_safe_filename(filename, zip_archive=None):
268        new_filename = SearchMedia.disallowed_characters.sub("", filename)
269        if zip_archive:
270            # check if file is in zip archive
271            index = 1
272            while new_filename in zip_archive.namelist():
273                new_filename = new_filename + "_" + str(index)
274                index += 1
275
276        return new_filename
class SearchMedia(backend.lib.processor.BasicProcessor):
 20class SearchMedia(BasicProcessor):
 21    type = "media-import-search"  # job ID
 22    category = "Search"  # category
 23    title = "Upload Media"  # title displayed in UI
 24    description = "Upload your own audio, video, or image files to be used as a dataset"  # description displayed in UI
 25    extension = "zip"  # extension of result file, used internally and in UI
 26    is_local = False  # Whether this datasource is locally scraped
 27    is_static = False  # Whether this datasource is still updated
 28
 29    max_workers = 1
 30
 31    disallowed_characters = re.compile(r"[^a-zA-Z0-9._+-]")
 32    accepted_file_types = ["audio", "video", "image"]
 33
 34    @classmethod
 35    def get_options(cls, parent_dataset=None, user=None):
 36        return {
 37            "intro": {
 38                "type": UserInput.OPTION_INFO,
 39                # Large numbers of files fail possibly due to Flask request size limit (not file size)
 40                "help": "Upload media files to make them be available for further analysis. "
 41                        "Please include only one type of file per dataset (image, audio, or video) and "
 42                        "4CAT will be able to run various processors on these media collections. "
 43                        f"\n\nFor collections **larger than a few hundred**, please upload a single "
 44                        f"ZIP file. More than ~500 files will fail (and a ZIP file will also load much faster)."
 45            },
 46            "data_upload": {
 47                "type": UserInput.OPTION_FILE,
 48                "multiple": True,
 49                "help": "Files"
 50            },
 51        }
 52
 53    @staticmethod
 54    def validate_query(query, request, user):
 55        """
 56        Step 1: Validate query and files
 57
 58        Confirms that the uploaded files exist and that the media type is valid.
 59
 60        :param dict query:  Query parameters, from client-side.
 61        :param request:  Flask request
 62        :param User user:  User object of user who has submitted the query
 63        :return dict:  Safe query parameters
 64        """
 65        # do we have uploaded files?
 66        bad_files = []
 67        seen_types = set()
 68        all_files = 0
 69
 70        uploaded_files = request.files.getlist("option-data_upload") # if multiple files are uploaded; the first validate_query only has a single file
 71        single_zip_file = uploaded_files and len(uploaded_files) == 1 and uploaded_files[0].filename.lower().endswith(".zip")
 72
 73        # option-data_upload-entries is passed the first time the form is submitted if the user has uploaded a zip file
 74        if "option-data_upload-entries" in request.form or single_zip_file:
 75            # we have a zip file!
 76            try:
 77                if single_zip_file:
 78                    # we have a single uploaded zip file
 79                    # i.e. the query has already been validated (else we would have
 80                    # -entries and no file) and we can get the file info from the
 81                    # zip file itself
 82                    uploaded_files[0].seek(0)
 83                    zip_file_data = BytesIO(uploaded_files[0].read())
 84                    with zipfile.ZipFile(zip_file_data, "r") as uploaded_zip:
 85                        files = [{"filename": f} for f in uploaded_zip.namelist()]
 86                else:
 87                    # validating - get file names from entries field
 88                    files = json.loads(request.form["option-data_upload-entries"])
 89
 90                # ignore known metadata files
 91                files = [f for f in files if not (
 92                        f["filename"].split("/")[-1].startswith(".")
 93                        or f["filename"].endswith(".log")
 94                        or f["filename"].split("/")[-1].startswith("__MACOSX")
 95                        or f["filename"].endswith(".DS_Store")
 96                        or f["filename"].endswith("/")  # sub-directory
 97                )]
 98
 99                # figure out if we have mixed media types
100                seen_types = set()
101                for file in files:
102                    if not query.get("frontend-confirm") and file["filename"].endswith(".svg"):
103                        raise QueryNeedsExplicitConfirmationException("SVG files may not be processed correctly by some 4CAT processors. Are you sure you want to proceed?")
104                    try:
105                        file_type = mimetypes.guess_type(file["filename"])[0].split("/")[0]
106                        seen_types.add(file_type)
107                        all_files += 1
108                    except (AttributeError, TypeError):
109                        bad_files.append(file["filename"])
110
111            except (ValueError, zipfile.BadZipfile) as e:
112                raise QueryParametersException("Cannot read zip file - it may be encrypted or corrupted and cannot "
113                                               "be uploaded to 4CAT.")
114
115        elif "option-data_upload" not in request.files:
116            raise QueryParametersException("No files were offered for upload.")
117
118        elif len(uploaded_files) < 1:
119            raise QueryParametersException("No files were offered for upload.")
120
121        else:
122            # we just have a bunch of separate files
123            # Check file types to ensure all are same type of media
124            for file in uploaded_files:
125                # Allow metadata files and log files to be uploaded
126                if file.filename == ".metadata.json" or file.filename.endswith(".log"):
127                    continue
128
129                # Warn on SVG files
130                if not query.get("frontend-confirm") and file.filename.lower().endswith(".svg"):
131                    raise QueryNeedsExplicitConfirmationException(
132                        "SVG files may not be processed correctly by some 4CAT processors. Are you sure you want to proceed?")
133
134                # when uploading multiple files, we don't want zips
135                if file.filename.lower().endswith(".zip"):
136                    raise QueryParametersException("When uploading media in a zip archive, please upload exactly one "
137                                                   "zip file; 4CAT cannot combine multiple separate zip archives.")
138
139                # Guess mime type from filename; we only have partial files at this point
140                mime_type = mimetypes.guess_type(file.filename)[0]
141                if mime_type is None:
142                    bad_files.append(file.filename)
143                    continue
144
145                mime_type = mime_type.split('/')[0]
146                if mime_type not in SearchMedia.accepted_file_types:
147                    raise QueryParametersException(f"This data source only accepts "
148                                                   f"{andify(SearchMedia.accepted_file_types)} files; "
149                                                   f"'{file.filename}' was detected as {mime_type}, which 4CAT cannot "
150                                                   f"process.")
151
152                seen_types.add(mime_type)
153                all_files += 1
154
155        # we need to at least be able to recognise the extension to know we can
156        # do something with the file...
157        if bad_files:
158            separator = "\n- "
159            raise QueryParametersException("The type of the following files cannot be determined; rename them or "
160                                           f"remove them from the archive or rename them\n{separator.join(bad_files)}")
161
162        # this is not fool-proof, but uncommon extensions are less likely to work
163        # anyway and the user can still choose to proceed
164        if len(set(seen_types)) > 1:
165            raise QueryParametersException(
166                f"The zip file contains files of multiple media types ({andify(seen_types)}). 4CAT processors require "
167                "files of a single type to work properly. Please re-upload only a single type of media to proceed."
168            )
169
170        return {
171            "time": time.time(),
172            "media_type": seen_types.pop() if seen_types else None, # first validation check may not have seen any valid files
173            "num_files": all_files,
174        }
175
176    @staticmethod
177    def after_create(query, dataset, request):
178        """
179        Step 2: Hook to execute after the dataset for this source has been created
180
181        In this case, save the files in a zip archive.
182
183        :param dict query:  Sanitised query parameters
184        :param DataSet dataset:  Dataset created for this query
185        :param request:  Flask request submitted for its creation
186        """
187        mime_type = query.get("media_type")
188        saved_files = 0
189        skipped_files = []
190        with zipfile.ZipFile(dataset.get_results_path(), "w", compression=zipfile.ZIP_STORED) as new_zip_archive:
191            for file in request.files.getlist("option-data_upload"):
192                # Check if file is zip archive
193                file_mime_type = mimetypes.guess_type(file.filename)[0]
194                if file_mime_type is not None and file_mime_type.split('/')[0] == "application" and \
195                        file_mime_type.split('/')[1] == "zip":
196                    # Save inner files from zip archive to new zip archive with all files
197                    file.seek(0)
198                    zip_file_data = BytesIO(file.read())
199                    with zipfile.ZipFile(zip_file_data, "r") as inner_zip_archive:
200                        for inner_file in inner_zip_archive.infolist():
201                            if inner_file.is_dir():
202                                continue
203
204                            guessed_file_mime_type = mimetypes.guess_type(inner_file.filename)
205                            if guessed_file_mime_type[0]:
206                                mime_type = guessed_file_mime_type[0].split('/')[0]
207
208                            # skip useless metadata files
209                            # also skip files not recognised as media files
210                            clean_file_name = inner_file.filename.split("/")[-1]
211                            if not guessed_file_mime_type[0] or (
212                                    mime_type not in SearchMedia.accepted_file_types
213                                    and not clean_file_name.endswith(".log")
214                                    and not clean_file_name == ".metadata.json"
215                            ) or clean_file_name.startswith("__MACOSX") \
216                              or inner_file.filename.startswith("__MACOSX"):
217                                print(f"skipping {clean_file_name} ({guessed_file_mime_type})")
218                                skipped_files.append(inner_file.filename)
219                                continue
220
221                            # save inner file from the uploaded zip archive to the new zip with all files
222                            new_filename = SearchMedia.get_safe_filename(inner_file.filename, new_zip_archive)
223                            new_zip_archive.writestr(new_filename, inner_zip_archive.read(inner_file))
224
225                            if not new_filename == ".metadata.json" or not new_filename.endswith(".log"):
226                                saved_files += 1
227                    continue
228
229                new_filename = SearchMedia.get_safe_filename(file.filename, new_zip_archive)
230                with new_zip_archive.open(new_filename, mode='w') as dest_file:
231                    file.seek(0)
232                    while True:
233                        chunk = file.read(1024)
234                        if len(chunk) == 0:
235                            break
236                        dest_file.write(chunk)
237
238                if not new_filename == ".metadata.json" or not new_filename.endswith(".log"):
239                    saved_files += 1
240
241        # update the number of files in the dataset
242        dataset.num_files = saved_files
243        dataset.media_type = mime_type
244        # TODO: use job's 'details' to save data to share/log?
245        if skipped_files:
246            # todo: this now doesn't actually get logged because the log is
247            # re-initialised after after_create runs?
248            dataset.log("The following files in the uploaded zip archive were skipped because they were not recognised"
249                        "as media files:" + "\n  -".join(skipped_files))
250
251    def process(self):
252        """
253        Step 3: Ummmm, we kinda did everything
254        """
255        # Check for SVG files
256        svg_warning = 0
257        if self.parameters.get("media_type") == "image":
258            for file in self.iterate_archive_contents(self.dataset.get_results_path()):
259                if file.suffix == ".svg":
260                    if svg_warning == 0:
261                        self.dataset.log("SVG files may not be processed correctly by some 4CAT processors.")
262                    self.dataset.log(f"SVG file detected: {file.name}")
263                    svg_warning += 1
264        self.dataset.update_status(f"Uploaded {self.parameters.get('num_files')} files of type {self.parameters.get('media_type')}{'' if svg_warning == 0 else f' ({svg_warning} SVG files; see log)'}", is_final=True)
265        self.dataset.finish(self.parameters.get("num_files"))
266
267    @staticmethod
268    def get_safe_filename(filename, zip_archive=None):
269        new_filename = SearchMedia.disallowed_characters.sub("", filename)
270        if zip_archive:
271            # check if file is in zip archive
272            index = 1
273            while new_filename in zip_archive.namelist():
274                new_filename = new_filename + "_" + str(index)
275                index += 1
276
277        return new_filename

Abstract processor class

A processor takes a finished dataset as input and processes its result in some way, with another dataset set as output. The input thus is a file, and the output (usually) as well. In other words, the result of a processor can be used as input for another processor (though whether and when this is useful is another question).

To determine whether a processor can process a given dataset, you can define an `is_compatible_with(module=None, user=None) -> bool` class method, which takes a module as argument and returns a bool that determines whether this processor is considered compatible with that module. For example:


@classmethod
def is_compatible_with(cls, module=None, user=None):
    return module.type == "linguistic-features"

type = 'media-import-search'
category = 'Search'
title = 'Upload Media'
description = 'Upload your own audio, video, or image files to be used as a dataset'
extension = 'zip'
is_local = False
is_static = False
max_workers = 1
disallowed_characters = re.compile('[^a-zA-Z0-9._+-]')
accepted_file_types = ['audio', 'video', 'image']
@classmethod
def get_options(cls, parent_dataset=None, user=None):
34    @classmethod
35    def get_options(cls, parent_dataset=None, user=None):
36        return {
37            "intro": {
38                "type": UserInput.OPTION_INFO,
39                # Large numbers of files fail possibly due to Flask request size limit (not file size)
40                "help": "Upload media files to make them be available for further analysis. "
41                        "Please include only one type of file per dataset (image, audio, or video) and "
42                        "4CAT will be able to run various processors on these media collections. "
43                        f"\n\nFor collections **larger than a few hundred**, please upload a single "
44                        f"ZIP file. More than ~500 files will fail (and a ZIP file will also load much faster)."
45            },
46            "data_upload": {
47                "type": UserInput.OPTION_FILE,
48                "multiple": True,
49                "help": "Files"
50            },
51        }

Get processor options

This method by default returns the class's "options" attribute, or an empty dictionary. It can be redefined by processors that need more fine-grained options, e.g. in cases where the availability of options is partially determined by the parent dataset's parameters.

Parameters
  • DataSet parent_dataset: An object representing the dataset that the processor would be run on
  • User user: Flask user the options will be displayed for, in case they are requested for display in the 4CAT web interface. This can be used to show some options only to privileged users.
@staticmethod
def validate_query(query, request, user):
    """
    Step 1: Validate query and files

    Confirms that the uploaded files exist and that the media type is valid.

    :param dict query:  Query parameters, from client-side.
    :param request:  Flask request
    :param User user:  User object of user who has submitted the query
    :return dict:  Safe query parameters
    """
    # do we have uploaded files?
    bad_files = []
    seen_types = set()
    all_files = 0

    # if multiple files are uploaded, the first validate_query only has a single file
    uploaded_files = request.files.getlist("option-data_upload")
    single_zip_file = uploaded_files and len(uploaded_files) == 1 and uploaded_files[0].filename.lower().endswith(".zip")

    # option-data_upload-entries is passed the first time the form is submitted if the user has uploaded a zip file
    if "option-data_upload-entries" in request.form or single_zip_file:
        # we have a zip file!
        try:
            if single_zip_file:
                # we have a single uploaded zip file
                # i.e. the query has already been validated (else we would have
                # -entries and no file) and we can get the file info from the
                # zip file itself
                uploaded_files[0].seek(0)
                zip_file_data = BytesIO(uploaded_files[0].read())
                with zipfile.ZipFile(zip_file_data, "r") as uploaded_zip:
                    files = [{"filename": f} for f in uploaded_zip.namelist()]
            else:
                # validating - get file names from entries field
                files = json.loads(request.form["option-data_upload-entries"])

            # ignore known metadata files, hidden files and sub-directories
            files = [f for f in files if not (
                    f["filename"].split("/")[-1].startswith(".")
                    or f["filename"].endswith(".log")
                    or f["filename"].split("/")[-1].startswith("__MACOSX")
                    or f["filename"].endswith(".DS_Store")
                    or f["filename"].endswith("/")  # sub-directory
            )]

            # figure out if we have mixed media types
            seen_types = set()
            for file in files:
                if not query.get("frontend-confirm") and file["filename"].endswith(".svg"):
                    raise QueryNeedsExplicitConfirmationException("SVG files may not be processed correctly by some 4CAT processors. Are you sure you want to proceed?")
                try:
                    file_type = mimetypes.guess_type(file["filename"])[0].split("/")[0]
                    seen_types.add(file_type)
                    all_files += 1
                except (AttributeError, TypeError):
                    # guess_type() returned None, i.e. the type is unknown
                    bad_files.append(file["filename"])

        except (ValueError, zipfile.BadZipfile):
            raise QueryParametersException("Cannot read zip file - it may be encrypted or corrupted and cannot "
                                           "be uploaded to 4CAT.")

    elif "option-data_upload" not in request.files:
        raise QueryParametersException("No files were offered for upload.")

    elif len(uploaded_files) < 1:
        raise QueryParametersException("No files were offered for upload.")

    else:
        # we just have a bunch of separate files
        # Check file types to ensure all are same type of media
        for file in uploaded_files:
            # Allow metadata files and log files to be uploaded
            if file.filename == ".metadata.json" or file.filename.endswith(".log"):
                continue

            # Warn on SVG files
            if not query.get("frontend-confirm") and file.filename.lower().endswith(".svg"):
                raise QueryNeedsExplicitConfirmationException(
                    "SVG files may not be processed correctly by some 4CAT processors. Are you sure you want to proceed?")

            # when uploading multiple files, we don't want zips
            if file.filename.lower().endswith(".zip"):
                raise QueryParametersException("When uploading media in a zip archive, please upload exactly one "
                                               "zip file; 4CAT cannot combine multiple separate zip archives.")

            # Guess mime type from filename; we only have partial files at this point
            mime_type = mimetypes.guess_type(file.filename)[0]
            if mime_type is None:
                bad_files.append(file.filename)
                continue

            mime_type = mime_type.split('/')[0]
            if mime_type not in SearchMedia.accepted_file_types:
                raise QueryParametersException(f"This data source only accepts "
                                               f"{andify(SearchMedia.accepted_file_types)} files; "
                                               f"'{file.filename}' was detected as {mime_type}, which 4CAT cannot "
                                               f"process.")

            seen_types.add(mime_type)
            all_files += 1

    # we need to at least be able to recognise the extension to know we can
    # do something with the file...
    # (fixed: the message previously said "rename them" twice)
    if bad_files:
        separator = "\n- "
        raise QueryParametersException("The type of the following files cannot be determined; rename them or "
                                       f"remove them from the archive\n{separator.join(bad_files)}")

    # this is not fool-proof, but uncommon extensions are less likely to work
    # anyway and the user can still choose to proceed
    if len(seen_types) > 1:
        raise QueryParametersException(
            f"The zip file contains files of multiple media types ({andify(seen_types)}). 4CAT processors require "
            "files of a single type to work properly. Please re-upload only a single type of media to proceed."
        )

    return {
        "time": time.time(),
        # the first validation pass may not have seen any valid files yet
        "media_type": seen_types.pop() if seen_types else None,
        "num_files": all_files,
    }

Step 1: Validate query and files

Confirms that the uploaded files exist and that the media type is valid.

Parameters
  • dict query: Query parameters, from client-side.
  • request: Flask request
  • User user: User object of user who has submitted the query
Returns

Safe query parameters

@staticmethod
def after_create(query, dataset, request):
    """
    Step 2: Hook to execute after the dataset for this source has been created

    In this case, save the files in a zip archive.

    :param dict query:  Sanitised query parameters
    :param DataSet dataset:  Dataset created for this query
    :param request:  Flask request submitted for its creation
    """
    mime_type = query.get("media_type")
    saved_files = 0
    skipped_files = []
    with zipfile.ZipFile(dataset.get_results_path(), "w", compression=zipfile.ZIP_STORED) as new_zip_archive:
        for file in request.files.getlist("option-data_upload"):
            # Check if file is zip archive
            file_mime_type = mimetypes.guess_type(file.filename)[0]
            if file_mime_type == "application/zip":
                # Save inner files from zip archive to new zip archive with all files
                file.seek(0)
                zip_file_data = BytesIO(file.read())
                with zipfile.ZipFile(zip_file_data, "r") as inner_zip_archive:
                    for inner_file in inner_zip_archive.infolist():
                        if inner_file.is_dir():
                            continue

                        guessed_file_mime_type = mimetypes.guess_type(inner_file.filename)
                        if guessed_file_mime_type[0]:
                            mime_type = guessed_file_mime_type[0].split('/')[0]

                        # skip useless metadata files
                        # also skip files not recognised as media files
                        clean_file_name = inner_file.filename.split("/")[-1]
                        if not guessed_file_mime_type[0] or (
                                mime_type not in SearchMedia.accepted_file_types
                                and not clean_file_name.endswith(".log")
                                and not clean_file_name == ".metadata.json"
                        ) or clean_file_name.startswith("__MACOSX") \
                          or inner_file.filename.startswith("__MACOSX"):
                            skipped_files.append(inner_file.filename)
                            continue

                        # save inner file from the uploaded zip archive to the new zip with all files
                        new_filename = SearchMedia.get_safe_filename(inner_file.filename, new_zip_archive)
                        new_zip_archive.writestr(new_filename, inner_zip_archive.read(inner_file))

                        # metadata and log files are stored but do not count as media
                        # (fixed: was "not A or not B", which is always true)
                        if not (new_filename == ".metadata.json" or new_filename.endswith(".log")):
                            saved_files += 1
                continue

            # a plain (non-zip) uploaded file: copy it into the archive in
            # chunks so large uploads need not fit in memory at once
            new_filename = SearchMedia.get_safe_filename(file.filename, new_zip_archive)
            with new_zip_archive.open(new_filename, mode='w') as dest_file:
                file.seek(0)
                while True:
                    chunk = file.read(1024)
                    if len(chunk) == 0:
                        break
                    dest_file.write(chunk)

            # same counting rule as above
            if not (new_filename == ".metadata.json" or new_filename.endswith(".log")):
                saved_files += 1

    # update the number of files in the dataset
    dataset.num_files = saved_files
    dataset.media_type = mime_type
    # TODO: use job's 'details' to save data to share/log?
    if skipped_files:
        # todo: this now doesn't actually get logged because the log is
        # re-initialised after after_create runs?
        dataset.log("The following files in the uploaded zip archive were skipped because they were not recognised "
                    "as media files:\n  - " + "\n  - ".join(skipped_files))

Step 2: Hook to execute after the dataset for this source has been created

In this case, save the files in a zip archive.

Parameters
  • dict query: Sanitised query parameters
  • DataSet dataset: Dataset created for this query
  • request: Flask request submitted for its creation
def process(self):
    """
    Step 3: finalise the dataset

    All real work happened during validation and `after_create`; here we only
    warn about SVG files in image datasets and mark the dataset as finished.
    """
    # Check for SVG files, which some processors cannot handle
    svg_count = 0
    if self.parameters.get("media_type") == "image":
        for archived_file in self.iterate_archive_contents(self.dataset.get_results_path()):
            if archived_file.suffix != ".svg":
                continue
            if not svg_count:
                # log the general warning only once
                self.dataset.log("SVG files may not be processed correctly by some 4CAT processors.")
            self.dataset.log(f"SVG file detected: {archived_file.name}")
            svg_count += 1

    svg_note = f" ({svg_count} SVG files; see log)" if svg_count else ""
    self.dataset.update_status(
        f"Uploaded {self.parameters.get('num_files')} files of type {self.parameters.get('media_type')}{svg_note}",
        is_final=True)
    self.dataset.finish(self.parameters.get("num_files"))

Step 3: Ummmm, we kinda did everything

@staticmethod
def get_safe_filename(filename, zip_archive=None):
    """
    Sanitise a filename and optionally deduplicate it against a zip archive.

    Disallowed characters (anything outside [a-zA-Z0-9._+-], including path
    separators) are stripped. If a zip archive is given and the sanitised name
    already exists in it, a numeric counter is inserted *before* the file
    extension ("photo.jpg" -> "photo_1.jpg"). Previously the counter was
    appended after the extension ("photo.jpg_1"), which broke MIME type
    detection elsewhere in this datasource, and repeated collisions compounded
    suffixes ("photo.jpg_1_2").

    :param str filename:  Original filename (may contain zip path components)
    :param zipfile.ZipFile zip_archive:  Archive to check for name collisions
    :return str:  Safe, unique filename
    """
    new_filename = SearchMedia.disallowed_characters.sub("", filename)
    if zip_archive:
        # split off the extension so the counter can go before it; rpartition
        # returns an empty separator when there is no dot at all
        stem, dot, extension = new_filename.rpartition(".")
        if not dot:
            stem, extension = new_filename, ""
        index = 1
        existing_names = zip_archive.namelist()
        while new_filename in existing_names:
            # rebuild from the stem each time so counters do not compound
            new_filename = f"{stem}_{index}.{extension}" if dot else f"{stem}_{index}"
            index += 1

    return new_filename