Edit on GitHub

common.lib.module_loader

Load modules and datasources dynamically

  1"""
  2Load modules and datasources dynamically
  3"""
  4from pathlib import Path
  5import importlib
  6import inspect
  7import pickle
  8import sys
  9import re
 10import os
 11
 12
 13class ModuleCollector:
 14    """
 15    Collects all modular appendages of 4CAT
 16
 17    On init, an object of this class collects all datasources and workers that
 18    have been added to this 4CAT installation. The metadata of these is then
 19    stored for later access.
 20
 21    Datasources are found in the "datasources" folder in root. Workers are
 22    found in datasource folders or the default "processors" and
 23    "backend/workers" folder. All these folders are scanned for both
 24    processors and workers (processors being a specific kind of worker).
 25    """
 26    ignore = []
 27    missing_modules = {}
 28    log_buffer = None
 29    config = None
 30
 31    PROCESSOR = 1
 32    WORKER = 2
 33
 34    workers = {}
 35    processors = {}
 36    datasources = {}
 37
 38    def __init__(self, config, write_cache=False):
 39        """
 40        Load data sources and workers
 41
 42        Datasources are loaded first so that the datasource folders may be
 43        scanned for workers subsequently.
 44
 45        :param config:  Configuration manager, shared with the rest of the
 46        context
 47        :param bool write_cache:  Write modules to cache file?
 48        """
 49        # this can be flushed later once the logger is available
 50        self.log_buffer = ""
 51        self.config = config
 52
 53        self.load_datasources()
 54        self.load_modules()
 55
 56        # now we know all workers, we can add some extra metadata to the
 57        # datasources, e.g. whether they have an associated search worker
 58        self.expand_datasources()
 59
 60        # cache module-defined config options for use by the config manager
 61        if write_cache:
 62            module_config = {}
 63            for worker in self.workers.values():
 64                if hasattr(worker, "config") and type(worker.config) is dict:
 65                    module_config.update(worker.config)
 66
 67            with config.get("PATH_ROOT").joinpath("config/module_config.bin").open("wb") as outfile:
 68                pickle.dump(module_config, outfile)
 69
 70        # load from cache
 71        self.config.load_user_settings()
 72
 73    @staticmethod
 74    def is_4cat_class(object, only_processors=False):
 75        """
 76        Determine if a module member is a worker class we can use
 77        """
 78        if inspect.isclass(object):
 79            if object.__name__ in("BasicProcessor", "BasicWorker") or inspect.isabstract(object):
 80                # ignore abstract and base classes
 81                return False
 82
 83            if hasattr(object, "is_4cat_class"):
 84                if only_processors:
 85                    if hasattr(object, "is_4cat_processor"):
 86                        return object.is_4cat_processor()
 87                    else:
 88                        return False
 89                else:
 90                    return object.is_4cat_class()
 91
 92        return False
 93
 94    def load_modules(self):
 95        """
 96        Load modules
 97
 98        Modules are workers and (as a subset of workers) postprocessors. These
 99        are found by importing any python files found in the given locations,
100        and looking for relevant classes within those python files, that extend
101        `BasicProcessor` or `BasicWorker` and are not abstract.
102        """
103        # look for workers and processors in pre-defined folders and datasources
104
105        extension_path = Path(self.config.get('PATH_ROOT'), "extensions")
106
107        paths = [Path(self.config.get('PATH_ROOT'), "processors"),
108                 Path(self.config.get('PATH_ROOT'), "backend", "workers"),
109                 extension_path,
110                 *[self.datasources[datasource]["path"] for datasource in self.datasources]] # extension datasources will be here and the above line...
111
112        root_match = re.compile(r"^%s" % re.escape(str(self.config.get('PATH_ROOT'))))
113        root_path = Path(self.config.get('PATH_ROOT'))
114
115        for folder in paths:
116            # loop through folders, and files in those folders, recursively
117            is_extension = extension_path in folder.parents or folder == extension_path
118            for file in folder.rglob("*.py"):
119                # determine module name for file
120                # reduce path to be relative to 4CAT root
121                module_name = ".".join(file.parts[len(root_path.parts):-1] + (file.stem,))
122
123                # check if we've already loaded this module
124                if module_name in self.ignore:
125                    continue
126
127                if module_name in sys.modules:
128                    # This skips processors/datasources that were loaded by others and may not yet be captured
129                    pass
130
131                # try importing
132                try:
133                    module = importlib.import_module(module_name)
134                except (SyntaxError, ImportError) as e:
135                    # this is fine, just ignore this data source and give a heads up
136                    self.ignore.append(module_name)
137                    key_name = e.name if hasattr(e, "name") else module_name
138                    if key_name not in self.missing_modules:
139                        self.missing_modules[key_name] = [module_name]
140                    else:
141                        self.missing_modules[key_name].append(module_name)
142                    continue
143
144                # see if module contains the right type of content by looping
145                # through all of its members
146                components = inspect.getmembers(module, predicate=self.is_4cat_class)
147                for component in components:
148                    if component[1].type in self.workers:
149                        # already indexed
150                        continue
151
152                    # extract data that is useful for the scheduler and other
153                    # parts of 4CAT
154                    relative_path = root_match.sub("", str(file))
155
156                    self.workers[component[1].type] = component[1]
157                    self.workers[component[1].type].filepath = relative_path
158                    self.workers[component[1].type].is_extension = is_extension
159
160                    # we can't use issubclass() because for that we would need
161                    # to import BasicProcessor, which would lead to a circular
162                    # import
163                    if self.is_4cat_class(component[1], only_processors=True):
164                        self.processors[component[1].type] = self.workers[component[1].type]
165
166        # sort by category for more convenient display in interfaces
167        sorted_processors = {id: self.processors[id] for id in
168                             sorted(self.processors)}
169        categorised_processors = {id: sorted_processors[id] for id in
170                                  sorted(sorted_processors,
171                                         key=lambda item: "0" if sorted_processors[item].category == "Presets" else
172                                         sorted_processors[item].category)}
173
174        # Give a heads-up if not all modules were installed properly
175        if self.missing_modules:
176            warning = "Warning: Not all modules could be found, which might cause data sources and modules to not " \
177                      "function.\nMissing modules:\n"
178            for missing_module, processor_list in self.missing_modules.items():
179                warning += "\t%s (for %s)\n" % (missing_module, ", ".join(processor_list))
180
181            self.log_buffer += warning
182
183        self.processors = categorised_processors
184
185    def load_datasources(self):
186        """
187        Load datasources
188
189        This looks for folders within the datasource root folder that contain
190        an `__init__.py` defining an `init_datasource` function and a
191        `DATASOURCE` constant. The latter is taken as the ID for this
192        datasource.
193        """
194        def _load_datasource(subdirectory, expiration):
195            """
196            Load a single datasource
197            """
198            # determine module name (path relative to 4CAT root w/ periods)
199            module_name = ".".join(subdirectory.relative_to(Path(self.config.get("PATH_ROOT"))).parts)
200            try:
201                datasource = importlib.import_module(module_name)
202            except ImportError as e:
203                self.log_buffer += "Could not import %s: %s\n" % (module_name, e)
204                return
205
206            if not hasattr(datasource, "init_datasource") or not hasattr(datasource, "DATASOURCE"):
207                self.log_buffer += "Could not load datasource %s: missing init_datasource or DATASOURCE\n" % subdirectory
208                return
209
210            datasource_id = datasource.DATASOURCE
211
212            self.datasources[datasource_id] = {
213                "expire-datasets": expiration.get(datasource_id, None),
214                "path": subdirectory,
215                "name": datasource.NAME if hasattr(datasource, "NAME") else datasource_id,
216                "id": subdirectory.parts[-1],
217                "init": datasource.init_datasource,
218                "config": {} if not hasattr(datasource, "config") else datasource.config
219            }
220
221        expiration = self.config.get("datasources.expiration", {})
222
223        # Load 4CAT core datasources
224        expiration = self.config.get("datasources.expiration", {})
225        for subdirectory in Path(self.config.get('PATH_ROOT'), "datasources").iterdir():
226            if subdirectory.is_dir():
227                _load_datasource(subdirectory, expiration)
228
229        # Load extension datasources
230        # os.walk is used to allow for the possibility of multiple extensions, with nested "datasources" folders
231        for root, dirs, files in os.walk(Path(self.config.get('PATH_ROOT'), "extensions"), followlinks=True):
232            if "datasources" in dirs:
233                for subdirectory in Path(root, "datasources").iterdir():
234                    if subdirectory.is_dir():
235                        _load_datasource(subdirectory, expiration)
236
237        sorted_datasources = {datasource_id: self.datasources[datasource_id] for datasource_id in
238                              sorted(self.datasources, key=lambda id: self.datasources[id]["name"])}
239        self.datasources = sorted_datasources
240
241    def expand_datasources(self):
242        """
243        Expand datasource metadata
244
245        Some datasource metadata can only be known after all workers have been
246        loaded, e.g. whether there is a search worker for the datasource. This
247        function takes care of populating those values.
248        """
249        for datasource_id in self.datasources:
250            worker = self.workers.get("%s-search" % datasource_id)
251            self.datasources[datasource_id]["has_worker"] = bool(worker)
252            self.datasources[datasource_id]["has_options"] = self.datasources[datasource_id]["has_worker"] and \
253                                                             bool(self.workers[
254                                                                      "%s-search" % datasource_id].get_options(
255                                                                 config=self.config))
256            self.datasources[datasource_id]["importable"] = worker and hasattr(worker, "is_from_zeeschuimer") and worker.is_from_zeeschuimer
257
258    def load_worker_class(self, worker):
259        """
260        Get class for worker
261
262        This import worker modules on-demand, so the code is only loaded if a
263        worker that needs the code is actually queued and run
264
265        :return:  Worker class for the given worker metadata
266        """
267        module = worker["module"]
268        if module not in sys.modules:
269            importlib.import_module(module)
270
271        return getattr(sys.modules[module], worker["class_name"])
class ModuleCollector:
    """
    Collects all modular appendages of 4CAT

    On init, an object of this class collects all datasources and workers that
    have been added to this 4CAT installation. The metadata of these is then
    stored for later access.

    Datasources are found in the "datasources" folder in root. Workers are
    found in datasource folders or the default "processors" and
    "backend/workers" folder. All these folders are scanned for both
    processors and workers (processors being a specific kind of worker).
    """
    # NOTE: these class-level attributes are kept for backwards compatibility,
    # but __init__ rebinds fresh containers on each instance, because mutable
    # class attributes are shared by all instances of the class
    ignore = []
    missing_modules = {}
    log_buffer = None
    config = None

    PROCESSOR = 1
    WORKER = 2

    workers = {}
    processors = {}
    datasources = {}

    def __init__(self, config, write_cache=False):
        """
        Load data sources and workers

        Datasources are loaded first so that the datasource folders may be
        scanned for workers subsequently.

        :param config:  Configuration manager, shared with the rest of the
        context
        :param bool write_cache:  Write modules to cache file?
        """
        # this can be flushed later once the logger is available
        self.log_buffer = ""
        self.config = config

        # fresh per-instance containers (see class-level note above) so that
        # two collectors never share worker/datasource state
        self.ignore = []
        self.missing_modules = {}
        self.workers = {}
        self.processors = {}
        self.datasources = {}

        self.load_datasources()
        self.load_modules()

        # now we know all workers, we can add some extra metadata to the
        # datasources, e.g. whether they have an associated search worker
        self.expand_datasources()

        # cache module-defined config options for use by the config manager
        if write_cache:
            module_config = {}
            for worker in self.workers.values():
                if hasattr(worker, "config") and type(worker.config) is dict:
                    module_config.update(worker.config)

            with config.get("PATH_ROOT").joinpath("config/module_config.bin").open("wb") as outfile:
                pickle.dump(module_config, outfile)

        # load from cache
        self.config.load_user_settings()

    @staticmethod
    def is_4cat_class(object, only_processors=False):
        """
        Determine if a module member is a worker class we can use

        :param object:  Module member to inspect (parameter name kept for
        backwards compatibility, even though it shadows the builtin)
        :param bool only_processors:  Only accept processors, not other
        workers?
        :return bool:  Whether the member is a usable 4CAT worker class
        """
        if inspect.isclass(object):
            if object.__name__ in ("BasicProcessor", "BasicWorker") or inspect.isabstract(object):
                # ignore abstract and base classes
                return False

            if hasattr(object, "is_4cat_class"):
                if only_processors:
                    if hasattr(object, "is_4cat_processor"):
                        return object.is_4cat_processor()
                    else:
                        return False
                else:
                    return object.is_4cat_class()

        return False

    def load_modules(self):
        """
        Load modules

        Modules are workers and (as a subset of workers) processors. These
        are found by importing any python files found in the given locations,
        and looking for relevant classes within those python files, that extend
        `BasicProcessor` or `BasicWorker` and are not abstract.
        """
        # look for workers and processors in pre-defined folders and
        # datasources (extension datasources are covered both by the
        # extension path and by their own datasource path)
        extension_path = Path(self.config.get('PATH_ROOT'), "extensions")

        paths = [Path(self.config.get('PATH_ROOT'), "processors"),
                 Path(self.config.get('PATH_ROOT'), "backend", "workers"),
                 extension_path,
                 *[self.datasources[datasource]["path"] for datasource in self.datasources]]

        root_match = re.compile(r"^%s" % re.escape(str(self.config.get('PATH_ROOT'))))
        root_path = Path(self.config.get('PATH_ROOT'))

        for folder in paths:
            # loop through folders, and files in those folders, recursively
            is_extension = extension_path in folder.parents or folder == extension_path
            for file in folder.rglob("*.py"):
                # determine module name for file
                # reduce path to be relative to 4CAT root
                module_name = ".".join(file.parts[len(root_path.parts):-1] + (file.stem,))

                # skip modules that failed to import earlier
                if module_name in self.ignore:
                    continue

                # try importing; modules that were already imported elsewhere
                # are returned from sys.modules by import_module, so they are
                # still scanned for workers below
                try:
                    module = importlib.import_module(module_name)
                except (SyntaxError, ImportError) as e:
                    # this is fine, just ignore this data source and give a
                    # heads up; SyntaxError has no "name" attribute and
                    # ImportError.name may be None, so fall back to the
                    # module name we tried to import
                    self.ignore.append(module_name)
                    key_name = getattr(e, "name", None) or module_name
                    self.missing_modules.setdefault(key_name, []).append(module_name)
                    continue

                # see if module contains the right type of content by looping
                # through all of its members
                components = inspect.getmembers(module, predicate=self.is_4cat_class)
                for name, worker_class in components:
                    if worker_class.type in self.workers:
                        # already indexed
                        continue

                    # extract data that is useful for the scheduler and other
                    # parts of 4CAT
                    relative_path = root_match.sub("", str(file))

                    self.workers[worker_class.type] = worker_class
                    worker_class.filepath = relative_path
                    worker_class.is_extension = is_extension

                    # we can't use issubclass() because for that we would need
                    # to import BasicProcessor, which would lead to a circular
                    # import
                    if self.is_4cat_class(worker_class, only_processors=True):
                        self.processors[worker_class.type] = worker_class

        # sort by category for more convenient display in interfaces, with
        # the "Presets" category always sorted first
        sorted_processors = {id: self.processors[id] for id in
                             sorted(self.processors)}
        categorised_processors = {id: sorted_processors[id] for id in
                                  sorted(sorted_processors,
                                         key=lambda item: "0" if sorted_processors[item].category == "Presets" else
                                         sorted_processors[item].category)}

        # Give a heads-up if not all modules were installed properly
        if self.missing_modules:
            warning = "Warning: Not all modules could be found, which might cause data sources and modules to not " \
                      "function.\nMissing modules:\n"
            for missing_module, processor_list in self.missing_modules.items():
                warning += "\t%s (for %s)\n" % (missing_module, ", ".join(processor_list))

            self.log_buffer += warning

        self.processors = categorised_processors

    def load_datasources(self):
        """
        Load datasources

        This looks for folders within the datasource root folder that contain
        an `__init__.py` defining an `init_datasource` function and a
        `DATASOURCE` constant. The latter is taken as the ID for this
        datasource.
        """
        def _load_datasource(subdirectory, expiration):
            """
            Load a single datasource from the given folder

            :param subdirectory:  Folder containing the datasource
            :param dict expiration:  Expiration settings, keyed by
            datasource ID
            """
            # determine module name (path relative to 4CAT root w/ periods)
            module_name = ".".join(subdirectory.relative_to(Path(self.config.get("PATH_ROOT"))).parts)
            try:
                datasource = importlib.import_module(module_name)
            except ImportError as e:
                self.log_buffer += "Could not import %s: %s\n" % (module_name, e)
                return

            if not hasattr(datasource, "init_datasource") or not hasattr(datasource, "DATASOURCE"):
                self.log_buffer += "Could not load datasource %s: missing init_datasource or DATASOURCE\n" % subdirectory
                return

            datasource_id = datasource.DATASOURCE

            self.datasources[datasource_id] = {
                "expire-datasets": expiration.get(datasource_id, None),
                "path": subdirectory,
                "name": datasource.NAME if hasattr(datasource, "NAME") else datasource_id,
                "id": subdirectory.parts[-1],
                "init": datasource.init_datasource,
                "config": {} if not hasattr(datasource, "config") else datasource.config
            }

        expiration = self.config.get("datasources.expiration", {})

        # Load 4CAT core datasources
        for subdirectory in Path(self.config.get('PATH_ROOT'), "datasources").iterdir():
            if subdirectory.is_dir():
                _load_datasource(subdirectory, expiration)

        # Load extension datasources
        # os.walk is used to allow for the possibility of multiple extensions,
        # with nested "datasources" folders
        for root, dirs, files in os.walk(Path(self.config.get('PATH_ROOT'), "extensions"), followlinks=True):
            if "datasources" in dirs:
                for subdirectory in Path(root, "datasources").iterdir():
                    if subdirectory.is_dir():
                        _load_datasource(subdirectory, expiration)

        # sort datasources alphabetically by display name
        sorted_datasources = {datasource_id: self.datasources[datasource_id] for datasource_id in
                              sorted(self.datasources, key=lambda id: self.datasources[id]["name"])}
        self.datasources = sorted_datasources

    def expand_datasources(self):
        """
        Expand datasource metadata

        Some datasource metadata can only be known after all workers have been
        loaded, e.g. whether there is a search worker for the datasource. This
        function takes care of populating those values.
        """
        for datasource_id in self.datasources:
            # reuse the looked-up worker instead of a second dict lookup
            worker = self.workers.get("%s-search" % datasource_id)
            self.datasources[datasource_id]["has_worker"] = bool(worker)
            self.datasources[datasource_id]["has_options"] = bool(worker) and \
                bool(worker.get_options(config=self.config))
            self.datasources[datasource_id]["importable"] = bool(
                worker and hasattr(worker, "is_from_zeeschuimer") and worker.is_from_zeeschuimer)

    def load_worker_class(self, worker):
        """
        Get class for worker

        This imports worker modules on-demand, so the code is only loaded if a
        worker that needs the code is actually queued and run

        :param dict worker:  Worker metadata, with "module" and "class_name"
        keys
        :return:  Worker class for the given worker metadata
        """
        module = worker["module"]
        if module not in sys.modules:
            importlib.import_module(module)

        return getattr(sys.modules[module], worker["class_name"])

Collects all modular appendages of 4CAT

On init, an object of this class collects all datasources and workers that have been added to this 4CAT installation. The metadata of these is then stored for later access.

Datasources are found in the "datasources" folder in root. Workers are found in datasource folders or the default "processors" and "backend/workers" folder. All these folders are scanned for both processors and workers (processors being a specific kind of worker).

ModuleCollector(config, write_cache=False)
39    def __init__(self, config, write_cache=False):
40        """
41        Load data sources and workers
42
43        Datasources are loaded first so that the datasource folders may be
44        scanned for workers subsequently.
45
46        :param config:  Configuration manager, shared with the rest of the
47        context
48        :param bool write_cache:  Write modules to cache file?
49        """
50        # this can be flushed later once the logger is available
51        self.log_buffer = ""
52        self.config = config
53
54        self.load_datasources()
55        self.load_modules()
56
57        # now we know all workers, we can add some extra metadata to the
58        # datasources, e.g. whether they have an associated search worker
59        self.expand_datasources()
60
61        # cache module-defined config options for use by the config manager
62        if write_cache:
63            module_config = {}
64            for worker in self.workers.values():
65                if hasattr(worker, "config") and type(worker.config) is dict:
66                    module_config.update(worker.config)
67
68            with config.get("PATH_ROOT").joinpath("config/module_config.bin").open("wb") as outfile:
69                pickle.dump(module_config, outfile)
70
71        # load from cache
72        self.config.load_user_settings()

Load data sources and workers

Datasources are loaded first so that the datasource folders may be scanned for workers subsequently.

Parameters
  • config: Configuration manager, shared with the rest of the context
  • bool write_cache: Write modules to cache file?
ignore = []
missing_modules = {}
log_buffer = None
config = None
PROCESSOR = 1
WORKER = 2
workers = {}
processors = {}
datasources = {}
@staticmethod
def is_4cat_class(object, only_processors=False):
74    @staticmethod
75    def is_4cat_class(object, only_processors=False):
76        """
77        Determine if a module member is a worker class we can use
78        """
79        if inspect.isclass(object):
80            if object.__name__ in ("BasicProcessor", "BasicWorker") or inspect.isabstract(object):
81                # ignore abstract and base classes
82                return False
83
84            if hasattr(object, "is_4cat_class"):
85                if only_processors:
86                    if hasattr(object, "is_4cat_processor"):
87                        return object.is_4cat_processor()
88                    else:
89                        return False
90                else:
91                    return object.is_4cat_class()
92
93        return False

Determine if a module member is a worker class we can use

def load_modules(self):
 95    def load_modules(self):
 96        """
 97        Load modules
 98
 99        Modules are workers and (as a subset of workers) postprocessors. These
100        are found by importing any python files found in the given locations,
101        and looking for relevant classes within those python files, that extend
102        `BasicProcessor` or `BasicWorker` and are not abstract.
103        """
104        # look for workers and processors in pre-defined folders and datasources
105
106        extension_path = Path(self.config.get('PATH_ROOT'), "extensions")
107
108        paths = [Path(self.config.get('PATH_ROOT'), "processors"),
109                 Path(self.config.get('PATH_ROOT'), "backend", "workers"),
110                 extension_path,
111                 *[self.datasources[datasource]["path"] for datasource in self.datasources]] # extension datasources will be here and the above line...
112
113        root_match = re.compile(r"^%s" % re.escape(str(self.config.get('PATH_ROOT'))))
114        root_path = Path(self.config.get('PATH_ROOT'))
115
116        for folder in paths:
117            # loop through folders, and files in those folders, recursively
118            is_extension = extension_path in folder.parents or folder == extension_path
119            for file in folder.rglob("*.py"):
120                # determine module name for file
121                # reduce path to be relative to 4CAT root
122                module_name = ".".join(file.parts[len(root_path.parts):-1] + (file.stem,))
123
124                # check if we've already loaded this module
125                if module_name in self.ignore:
126                    continue
127
128                if module_name in sys.modules:
129                    # This skips processors/datasources that were loaded by others and may not yet be captured
130                    pass
131
132                # try importing
133                try:
134                    module = importlib.import_module(module_name)
135                except (SyntaxError, ImportError) as e:
136                    # this is fine, just ignore this data source and give a heads up
137                    self.ignore.append(module_name)
138                    key_name = e.name if hasattr(e, "name") else module_name
139                    if key_name not in self.missing_modules:
140                        self.missing_modules[key_name] = [module_name]
141                    else:
142                        self.missing_modules[key_name].append(module_name)
143                    continue
144
145                # see if module contains the right type of content by looping
146                # through all of its members
147                components = inspect.getmembers(module, predicate=self.is_4cat_class)
148                for component in components:
149                    if component[1].type in self.workers:
150                        # already indexed
151                        continue
152
153                    # extract data that is useful for the scheduler and other
154                    # parts of 4CAT
155                    relative_path = root_match.sub("", str(file))
156
157                    self.workers[component[1].type] = component[1]
158                    self.workers[component[1].type].filepath = relative_path
159                    self.workers[component[1].type].is_extension = is_extension
160
161                    # we can't use issubclass() because for that we would need
162                    # to import BasicProcessor, which would lead to a circular
163                    # import
164                    if self.is_4cat_class(component[1], only_processors=True):
165                        self.processors[component[1].type] = self.workers[component[1].type]
166
167        # sort by category for more convenient display in interfaces
168        sorted_processors = {id: self.processors[id] for id in
169                             sorted(self.processors)}
170        categorised_processors = {id: sorted_processors[id] for id in
171                                  sorted(sorted_processors,
172                                         key=lambda item: "0" if sorted_processors[item].category == "Presets" else
173                                         sorted_processors[item].category)}
174
175        # Give a heads-up if not all modules were installed properly
176        if self.missing_modules:
177            warning = "Warning: Not all modules could be found, which might cause data sources and modules to not " \
178                      "function.\nMissing modules:\n"
179            for missing_module, processor_list in self.missing_modules.items():
180                warning += "\t%s (for %s)\n" % (missing_module, ", ".join(processor_list))
181
182            self.log_buffer += warning
183
184        self.processors = categorised_processors

Load modules

Modules are workers and (as a subset of workers) postprocessors. These are found by importing any python files found in the given locations, and looking for relevant classes within those python files, that extend BasicProcessor or BasicWorker and are not abstract.

def load_datasources(self):
186    def load_datasources(self):
187        """
188        Load datasources
189
190        This looks for folders within the datasource root folder that contain
191        an `__init__.py` defining an `init_datasource` function and a
192        `DATASOURCE` constant. The latter is taken as the ID for this
193        datasource.
194        """
195        def _load_datasource(subdirectory, expiration):
196            """
197            Load a single datasource
198            """
199            # determine module name (path relative to 4CAT root w/ periods)
200            module_name = ".".join(subdirectory.relative_to(Path(self.config.get("PATH_ROOT"))).parts)
201            try:
202                datasource = importlib.import_module(module_name)
203            except ImportError as e:
204                self.log_buffer += "Could not import %s: %s\n" % (module_name, e)
205                return
206
207            if not hasattr(datasource, "init_datasource") or not hasattr(datasource, "DATASOURCE"):
208                self.log_buffer += "Could not load datasource %s: missing init_datasource or DATASOURCE\n" % subdirectory
209                return
210
211            datasource_id = datasource.DATASOURCE
212
213            self.datasources[datasource_id] = {
214                "expire-datasets": expiration.get(datasource_id, None),
215                "path": subdirectory,
216                "name": datasource.NAME if hasattr(datasource, "NAME") else datasource_id,
217                "id": subdirectory.parts[-1],
218                "init": datasource.init_datasource,
219                "config": {} if not hasattr(datasource, "config") else datasource.config
220            }
221
222        expiration = self.config.get("datasources.expiration", {})
223
224        # Load 4CAT core datasources
225        expiration = self.config.get("datasources.expiration", {})
226        for subdirectory in Path(self.config.get('PATH_ROOT'), "datasources").iterdir():
227            if subdirectory.is_dir():
228                _load_datasource(subdirectory, expiration)
229
230        # Load extension datasources
231        # os.walk is used to allow for the possibility of multiple extensions, with nested "datasources" folders
232        for root, dirs, files in os.walk(Path(self.config.get('PATH_ROOT'), "extensions"), followlinks=True):
233            if "datasources" in dirs:
234                for subdirectory in Path(root, "datasources").iterdir():
235                    if subdirectory.is_dir():
236                        _load_datasource(subdirectory, expiration)
237
238        sorted_datasources = {datasource_id: self.datasources[datasource_id] for datasource_id in
239                              sorted(self.datasources, key=lambda id: self.datasources[id]["name"])}
240        self.datasources = sorted_datasources

Load datasources

This looks for folders within the datasource root folder that contain an __init__.py defining an init_datasource function and a DATASOURCE constant. The latter is taken as the ID for this datasource.

def expand_datasources(self):
242    def expand_datasources(self):
243        """
244        Expand datasource metadata
245
246        Some datasource metadata can only be known after all workers have been
247        loaded, e.g. whether there is a search worker for the datasource. This
248        function takes care of populating those values.
249        """
250        for datasource_id in self.datasources:
251            worker = self.workers.get("%s-search" % datasource_id)
252            self.datasources[datasource_id]["has_worker"] = bool(worker)
253            self.datasources[datasource_id]["has_options"] = self.datasources[datasource_id]["has_worker"] and \
254                                                             bool(self.workers[
255                                                                      "%s-search" % datasource_id].get_options(
256                                                                 config=self.config))
257            self.datasources[datasource_id]["importable"] = worker and hasattr(worker, "is_from_zeeschuimer") and worker.is_from_zeeschuimer

Expand datasource metadata

Some datasource metadata can only be known after all workers have been loaded, e.g. whether there is a search worker for the datasource. This function takes care of populating those values.

def load_worker_class(self, worker):
    """
    Get the class implementing a worker

    Imports worker modules on demand, so code is only loaded once a worker
    that actually needs it is queued and run.

    :param dict worker:  Worker metadata, with "module" and "class_name" keys
    :return:  Worker class for the given worker metadata
    """
    module_name = worker["module"]

    # import lazily - skip if the module has been loaded before
    if module_name not in sys.modules:
        importlib.import_module(module_name)

    worker_module = sys.modules[module_name]
    return getattr(worker_module, worker["class_name"])

Get class for worker

This imports worker modules on demand, so the code is only loaded if a worker that needs it is actually queued and run.

Returns

Worker class for the given worker metadata