Edit on GitHub

common.lib.module_loader

Load modules and datasources dynamically

View Source

  1"""
  2Load modules and datasources dynamically
  3"""
  4from pathlib import Path
  5import importlib
  6import inspect
  7import pickle
  8import sys
  9import re
 10import os
 11
 12from common.config_manager import config
 13
 14
 15class ModuleCollector:
 16    """
 17    Collects all modular appendages of 4CAT
 18
 19    On init, an object of this class collects all datasources and workers that
 20    have been added to this 4CAT installation. The metadata of these is then
 21    stored for later access.
 22
 23    Datasources are found in the "datasources" folder in root. Workers are
 24    found in datasource folders or the default "processors" and
 25    "backend/workers" folder. All these folders are scanned for both
 26    processors and workers (processors being a specific kind of worker).
 27    """
 28    ignore = []
 29    missing_modules = {}
 30    log_buffer = None
 31
 32    PROCESSOR = 1
 33    WORKER = 2
 34
 35    workers = {}
 36    processors = {}
 37    datasources = {}
 38
 39    def __init__(self, write_config=False):
 40        """
 41        Load data sources and workers
 42
 43        Datasources are loaded first so that the datasource folders may be
 44        scanned for workers subsequently.
 45        """
 46        # this can be flushed later once the logger is available
 47        self.log_buffer = ""
 48
 49        self.load_datasources()
 50        self.load_modules()
 51
 52        # now we know all workers, we can add some extra metadata to the
 53        # datasources, e.g. whether they have an associated search worker
 54        self.expand_datasources()
 55
 56        # cache module-defined config options for use by the config manager
 57        if write_config:
 58            module_config = {}
 59            for worker in self.workers.values():
 60                if hasattr(worker, "config") and type(worker.config) is dict:
 61                    module_config.update(worker.config)
 62
 63            with config.get("PATH_ROOT").joinpath("config/module_config.bin").open("wb") as outfile:
 64                pickle.dump(module_config, outfile)
 65
 66        # load from cache
 67        config.load_user_settings()
 68
 69    @staticmethod
 70    def is_4cat_class(object, only_processors=False):
 71        """
 72        Determine if a module member is a worker class we can use
 73        """
 74        if inspect.isclass(object):
 75            if object.__name__ in("BasicProcessor", "BasicWorker") or inspect.isabstract(object):
 76                # ignore abstract and base classes
 77                return False
 78
 79            if hasattr(object, "is_4cat_class"):
 80                if only_processors:
 81                    if hasattr(object, "is_4cat_processor"):
 82                        return object.is_4cat_processor()
 83                    else:
 84                        return False
 85                else:
 86                    return object.is_4cat_class()
 87
 88        return False
 89
 90    def load_modules(self):
 91        """
 92        Load modules
 93
 94        Modules are workers and (as a subset of workers) postprocessors. These
 95        are found by importing any python files found in the given locations,
 96        and looking for relevant classes within those python files, that extend
 97        `BasicProcessor` or `BasicWorker` and are not abstract.
 98        """
 99        # look for workers and processors in pre-defined folders and datasources
100
101        extension_path = Path(config.get('PATH_ROOT'), "extensions")
102
103        paths = [Path(config.get('PATH_ROOT'), "processors"),
104                 Path(config.get('PATH_ROOT'), "backend", "workers"),
105                 extension_path,
106                 *[self.datasources[datasource]["path"] for datasource in self.datasources]] # extension datasources will be here and the above line...
107
108        root_match = re.compile(r"^%s" % re.escape(str(config.get('PATH_ROOT'))))
109        root_path = Path(config.get('PATH_ROOT'))
110
111        for folder in paths:
112            # loop through folders, and files in those folders, recursively
113            is_extension = extension_path in folder.parents or folder == extension_path
114            for file in folder.rglob("*.py"):
115                # determine module name for file
116                # reduce path to be relative to 4CAT root
117                module_name = ".".join(file.parts[len(root_path.parts):-1] + (file.stem,))
118
119                # check if we've already loaded this module
120                if module_name in self.ignore:
121                    continue
122
123                if module_name in sys.modules:
124                    # This skips processors/datasources that were loaded by others and may not yet be captured
125                    pass
126
127                # try importing
128                try:
129                    module = importlib.import_module(module_name)
130                except (SyntaxError, ImportError) as e:
131                    # this is fine, just ignore this data source and give a heads up
132                    self.ignore.append(module_name)
133                    key_name = e.name if hasattr(e, "name") else module_name
134                    if key_name not in self.missing_modules:
135                        self.missing_modules[key_name] = [module_name]
136                    else:
137                        self.missing_modules[key_name].append(module_name)
138                    continue
139
140                # see if module contains the right type of content by looping
141                # through all of its members
142                components = inspect.getmembers(module, predicate=self.is_4cat_class)
143                for component in components:
144                    if component[1].type in self.workers:
145                        # already indexed
146                        continue
147
148                    # extract data that is useful for the scheduler and other
149                    # parts of 4CAT
150                    relative_path = root_match.sub("", str(file))
151
152                    self.workers[component[1].type] = component[1]
153                    self.workers[component[1].type].filepath = relative_path
154                    self.workers[component[1].type].is_extension = is_extension
155
156                    # we can't use issubclass() because for that we would need
157                    # to import BasicProcessor, which would lead to a circular
158                    # import
159                    if self.is_4cat_class(component[1], only_processors=True):
160                        self.processors[component[1].type] = self.workers[component[1].type]
161
162        # sort by category for more convenient display in interfaces
163        sorted_processors = {id: self.processors[id] for id in
164                             sorted(self.processors)}
165        categorised_processors = {id: sorted_processors[id] for id in
166                                  sorted(sorted_processors,
167                                         key=lambda item: "0" if sorted_processors[item].category == "Presets" else
168                                         sorted_processors[item].category)}
169
170        # Give a heads-up if not all modules were installed properly
171        if self.missing_modules:
172            warning = "Warning: Not all modules could be found, which might cause data sources and modules to not " \
173                      "function.\nMissing modules:\n"
174            for missing_module, processor_list in self.missing_modules.items():
175                warning += "\t%s (for %s)\n" % (missing_module, ", ".join(processor_list))
176
177            self.log_buffer += warning
178
179        self.processors = categorised_processors
180
181    def load_datasources(self):
182        """
183        Load datasources
184
185        This looks for folders within the datasource root folder that contain
186        an `__init__.py` defining an `init_datasource` function and a
187        `DATASOURCE` constant. The latter is taken as the ID for this
188        datasource.
189        """
190        def _load_datasource(subdirectory):
191            """
192            Load a single datasource
193            """
194            # determine module name (path relative to 4CAT root w/ periods)
195            module_name = ".".join(subdirectory.relative_to(Path(config.get("PATH_ROOT"))).parts)
196            try:
197                datasource = importlib.import_module(module_name)
198            except ImportError as e:
199                self.log_buffer += "Could not import %s: %s\n" % (module_name, e)
200                return
201
202            if not hasattr(datasource, "init_datasource") or not hasattr(datasource, "DATASOURCE"):
203                self.log_buffer += "Could not load datasource %s: missing init_datasource or DATASOURCE\n" % subdirectory
204                return
205
206            datasource_id = datasource.DATASOURCE
207
208            self.datasources[datasource_id] = {
209                "expire-datasets": config.get("datasources.expiration", {}).get(datasource_id, None),
210                "path": subdirectory,
211                "name": datasource.NAME if hasattr(datasource, "NAME") else datasource_id,
212                "id": subdirectory.parts[-1],
213                "init": datasource.init_datasource,
214                "config": {} if not hasattr(datasource, "config") else datasource.config
215            }
216
217        # Load 4CAT core datasources
218        for subdirectory in Path(config.get('PATH_ROOT'), "datasources").iterdir():
219            if subdirectory.is_dir():
220                _load_datasource(subdirectory)
221
222        # Load extension datasources
223        # os.walk is used to allow for the possibility of multiple extensions, with nested "datasources" folders
224        for root, dirs, files in os.walk(Path(config.get('PATH_ROOT'), "extensions"), followlinks=True):
225            if "datasources" in dirs:
226                for subdirectory in Path(root, "datasources").iterdir():
227                    if subdirectory.is_dir():
228                        _load_datasource(subdirectory)
229
230        sorted_datasources = {datasource_id: self.datasources[datasource_id] for datasource_id in
231                              sorted(self.datasources, key=lambda id: self.datasources[id]["name"])}
232        self.datasources = sorted_datasources
233
234    def expand_datasources(self):
235        """
236        Expand datasource metadata
237
238        Some datasource metadata can only be known after all workers have been
239        loaded, e.g. whether there is a search worker for the datasource. This
240        function takes care of populating those values.
241        """
242        for datasource_id in self.datasources:
243            worker = self.workers.get("%s-search" % datasource_id)
244            self.datasources[datasource_id]["has_worker"] = bool(worker)
245            self.datasources[datasource_id]["has_options"] = self.datasources[datasource_id]["has_worker"] and \
246                                                             bool(self.workers["%s-search" % datasource_id].get_options())
247            self.datasources[datasource_id]["importable"] = worker and hasattr(worker, "is_from_zeeschuimer") and worker.is_from_zeeschuimer
248
249    def load_worker_class(self, worker):
250        """
251        Get class for worker
252
253        This import worker modules on-demand, so the code is only loaded if a
254        worker that needs the code is actually queued and run
255
256        :return:  Worker class for the given worker metadata
257        """
258        module = worker["module"]
259        if module not in sys.modules:
260            importlib.import_module(module)
261
262        return getattr(sys.modules[module], worker["class_name"])

class ModuleCollector: View Source

 16class ModuleCollector:
 17    """
 18    Collects all modular appendages of 4CAT
 19
 20    On init, an object of this class collects all datasources and workers that
 21    have been added to this 4CAT installation. The metadata of these is then
 22    stored for later access.
 23
 24    Datasources are found in the "datasources" folder in root. Workers are
 25    found in datasource folders or the default "processors" and
 26    "backend/workers" folder. All these folders are scanned for both
 27    processors and workers (processors being a specific kind of worker).
 28    """
 29    ignore = []
 30    missing_modules = {}
 31    log_buffer = None
 32
 33    PROCESSOR = 1
 34    WORKER = 2
 35
 36    workers = {}
 37    processors = {}
 38    datasources = {}
 39
 40    def __init__(self, write_config=False):
 41        """
 42        Load data sources and workers
 43
 44        Datasources are loaded first so that the datasource folders may be
 45        scanned for workers subsequently.
 46        """
 47        # this can be flushed later once the logger is available
 48        self.log_buffer = ""
 49
 50        self.load_datasources()
 51        self.load_modules()
 52
 53        # now we know all workers, we can add some extra metadata to the
 54        # datasources, e.g. whether they have an associated search worker
 55        self.expand_datasources()
 56
 57        # cache module-defined config options for use by the config manager
 58        if write_config:
 59            module_config = {}
 60            for worker in self.workers.values():
 61                if hasattr(worker, "config") and type(worker.config) is dict:
 62                    module_config.update(worker.config)
 63
 64            with config.get("PATH_ROOT").joinpath("config/module_config.bin").open("wb") as outfile:
 65                pickle.dump(module_config, outfile)
 66
 67        # load from cache
 68        config.load_user_settings()
 69
 70    @staticmethod
 71    def is_4cat_class(object, only_processors=False):
 72        """
 73        Determine if a module member is a worker class we can use
 74        """
 75        if inspect.isclass(object):
 76            if object.__name__ in("BasicProcessor", "BasicWorker") or inspect.isabstract(object):
 77                # ignore abstract and base classes
 78                return False
 79
 80            if hasattr(object, "is_4cat_class"):
 81                if only_processors:
 82                    if hasattr(object, "is_4cat_processor"):
 83                        return object.is_4cat_processor()
 84                    else:
 85                        return False
 86                else:
 87                    return object.is_4cat_class()
 88
 89        return False
 90
 91    def load_modules(self):
 92        """
 93        Load modules
 94
 95        Modules are workers and (as a subset of workers) postprocessors. These
 96        are found by importing any python files found in the given locations,
 97        and looking for relevant classes within those python files, that extend
 98        `BasicProcessor` or `BasicWorker` and are not abstract.
 99        """
100        # look for workers and processors in pre-defined folders and datasources
101
102        extension_path = Path(config.get('PATH_ROOT'), "extensions")
103
104        paths = [Path(config.get('PATH_ROOT'), "processors"),
105                 Path(config.get('PATH_ROOT'), "backend", "workers"),
106                 extension_path,
107                 *[self.datasources[datasource]["path"] for datasource in self.datasources]] # extension datasources will be here and the above line...
108
109        root_match = re.compile(r"^%s" % re.escape(str(config.get('PATH_ROOT'))))
110        root_path = Path(config.get('PATH_ROOT'))
111
112        for folder in paths:
113            # loop through folders, and files in those folders, recursively
114            is_extension = extension_path in folder.parents or folder == extension_path
115            for file in folder.rglob("*.py"):
116                # determine module name for file
117                # reduce path to be relative to 4CAT root
118                module_name = ".".join(file.parts[len(root_path.parts):-1] + (file.stem,))
119
120                # check if we've already loaded this module
121                if module_name in self.ignore:
122                    continue
123
124                if module_name in sys.modules:
125                    # This skips processors/datasources that were loaded by others and may not yet be captured
126                    pass
127
128                # try importing
129                try:
130                    module = importlib.import_module(module_name)
131                except (SyntaxError, ImportError) as e:
132                    # this is fine, just ignore this data source and give a heads up
133                    self.ignore.append(module_name)
134                    key_name = e.name if hasattr(e, "name") else module_name
135                    if key_name not in self.missing_modules:
136                        self.missing_modules[key_name] = [module_name]
137                    else:
138                        self.missing_modules[key_name].append(module_name)
139                    continue
140
141                # see if module contains the right type of content by looping
142                # through all of its members
143                components = inspect.getmembers(module, predicate=self.is_4cat_class)
144                for component in components:
145                    if component[1].type in self.workers:
146                        # already indexed
147                        continue
148
149                    # extract data that is useful for the scheduler and other
150                    # parts of 4CAT
151                    relative_path = root_match.sub("", str(file))
152
153                    self.workers[component[1].type] = component[1]
154                    self.workers[component[1].type].filepath = relative_path
155                    self.workers[component[1].type].is_extension = is_extension
156
157                    # we can't use issubclass() because for that we would need
158                    # to import BasicProcessor, which would lead to a circular
159                    # import
160                    if self.is_4cat_class(component[1], only_processors=True):
161                        self.processors[component[1].type] = self.workers[component[1].type]
162
163        # sort by category for more convenient display in interfaces
164        sorted_processors = {id: self.processors[id] for id in
165                             sorted(self.processors)}
166        categorised_processors = {id: sorted_processors[id] for id in
167                                  sorted(sorted_processors,
168                                         key=lambda item: "0" if sorted_processors[item].category == "Presets" else
169                                         sorted_processors[item].category)}
170
171        # Give a heads-up if not all modules were installed properly
172        if self.missing_modules:
173            warning = "Warning: Not all modules could be found, which might cause data sources and modules to not " \
174                      "function.\nMissing modules:\n"
175            for missing_module, processor_list in self.missing_modules.items():
176                warning += "\t%s (for %s)\n" % (missing_module, ", ".join(processor_list))
177
178            self.log_buffer += warning
179
180        self.processors = categorised_processors
181
182    def load_datasources(self):
183        """
184        Load datasources
185
186        This looks for folders within the datasource root folder that contain
187        an `__init__.py` defining an `init_datasource` function and a
188        `DATASOURCE` constant. The latter is taken as the ID for this
189        datasource.
190        """
191        def _load_datasource(subdirectory):
192            """
193            Load a single datasource
194            """
195            # determine module name (path relative to 4CAT root w/ periods)
196            module_name = ".".join(subdirectory.relative_to(Path(config.get("PATH_ROOT"))).parts)
197            try:
198                datasource = importlib.import_module(module_name)
199            except ImportError as e:
200                self.log_buffer += "Could not import %s: %s\n" % (module_name, e)
201                return
202
203            if not hasattr(datasource, "init_datasource") or not hasattr(datasource, "DATASOURCE"):
204                self.log_buffer += "Could not load datasource %s: missing init_datasource or DATASOURCE\n" % subdirectory
205                return
206
207            datasource_id = datasource.DATASOURCE
208
209            self.datasources[datasource_id] = {
210                "expire-datasets": config.get("datasources.expiration", {}).get(datasource_id, None),
211                "path": subdirectory,
212                "name": datasource.NAME if hasattr(datasource, "NAME") else datasource_id,
213                "id": subdirectory.parts[-1],
214                "init": datasource.init_datasource,
215                "config": {} if not hasattr(datasource, "config") else datasource.config
216            }
217
218        # Load 4CAT core datasources
219        for subdirectory in Path(config.get('PATH_ROOT'), "datasources").iterdir():
220            if subdirectory.is_dir():
221                _load_datasource(subdirectory)
222
223        # Load extension datasources
224        # os.walk is used to allow for the possibility of multiple extensions, with nested "datasources" folders
225        for root, dirs, files in os.walk(Path(config.get('PATH_ROOT'), "extensions"), followlinks=True):
226            if "datasources" in dirs:
227                for subdirectory in Path(root, "datasources").iterdir():
228                    if subdirectory.is_dir():
229                        _load_datasource(subdirectory)
230
231        sorted_datasources = {datasource_id: self.datasources[datasource_id] for datasource_id in
232                              sorted(self.datasources, key=lambda id: self.datasources[id]["name"])}
233        self.datasources = sorted_datasources
234
235    def expand_datasources(self):
236        """
237        Expand datasource metadata
238
239        Some datasource metadata can only be known after all workers have been
240        loaded, e.g. whether there is a search worker for the datasource. This
241        function takes care of populating those values.
242        """
243        for datasource_id in self.datasources:
244            worker = self.workers.get("%s-search" % datasource_id)
245            self.datasources[datasource_id]["has_worker"] = bool(worker)
246            self.datasources[datasource_id]["has_options"] = self.datasources[datasource_id]["has_worker"] and \
247                                                             bool(self.workers["%s-search" % datasource_id].get_options())
248            self.datasources[datasource_id]["importable"] = worker and hasattr(worker, "is_from_zeeschuimer") and worker.is_from_zeeschuimer
249
250    def load_worker_class(self, worker):
251        """
252        Get class for worker
253
254        This import worker modules on-demand, so the code is only loaded if a
255        worker that needs the code is actually queued and run
256
257        :return:  Worker class for the given worker metadata
258        """
259        module = worker["module"]
260        if module not in sys.modules:
261            importlib.import_module(module)
262
263        return getattr(sys.modules[module], worker["class_name"])

Collects all modular appendages of 4CAT

On init, an object of this class collects all datasources and workers that have been added to this 4CAT installation. The metadata of these is then stored for later access.

Datasources are found in the "datasources" folder in root. Workers are found in datasource folders or the default "processors" and "backend/workers" folder. All these folders are scanned for both processors and workers (processors being a specific kind of worker).

ModuleCollector(write_config=False) View Source

40    def __init__(self, write_config=False):
41        """
42        Load data sources and workers
43
44        Datasources are loaded first so that the datasource folders may be
45        scanned for workers subsequently.
46        """
47        # this can be flushed later once the logger is available
48        self.log_buffer = ""
49
50        self.load_datasources()
51        self.load_modules()
52
53        # now we know all workers, we can add some extra metadata to the
54        # datasources, e.g. whether they have an associated search worker
55        self.expand_datasources()
56
57        # cache module-defined config options for use by the config manager
58        if write_config:
59            module_config = {}
60            for worker in self.workers.values():
61                if hasattr(worker, "config") and type(worker.config) is dict:
62                    module_config.update(worker.config)
63
64            with config.get("PATH_ROOT").joinpath("config/module_config.bin").open("wb") as outfile:
65                pickle.dump(module_config, outfile)
66
67        # load from cache
68        config.load_user_settings()

Load data sources and workers

Datasources are loaded first so that the datasource folders may be scanned for workers subsequently.

ignore = []

missing_modules = {}

log_buffer = None

PROCESSOR = 1

WORKER = 2

workers = {}

processors = {}

datasources = {}

@staticmethod

def is_4cat_class(object, only_processors=False): View Source

70    @staticmethod
71    def is_4cat_class(object, only_processors=False):
72        """
73        Determine if a module member is a worker class we can use
74        """
75        if inspect.isclass(object):
76            if object.__name__ in("BasicProcessor", "BasicWorker") or inspect.isabstract(object):
77                # ignore abstract and base classes
78                return False
79
80            if hasattr(object, "is_4cat_class"):
81                if only_processors:
82                    if hasattr(object, "is_4cat_processor"):
83                        return object.is_4cat_processor()
84                    else:
85                        return False
86                else:
87                    return object.is_4cat_class()
88
89        return False

Determine if a module member is a worker class we can use

def load_modules(self): View Source

 91    def load_modules(self):
 92        """
 93        Load modules
 94
 95        Modules are workers and (as a subset of workers) postprocessors. These
 96        are found by importing any python files found in the given locations,
 97        and looking for relevant classes within those python files, that extend
 98        `BasicProcessor` or `BasicWorker` and are not abstract.
 99        """
100        # look for workers and processors in pre-defined folders and datasources
101
102        extension_path = Path(config.get('PATH_ROOT'), "extensions")
103
104        paths = [Path(config.get('PATH_ROOT'), "processors"),
105                 Path(config.get('PATH_ROOT'), "backend", "workers"),
106                 extension_path,
107                 *[self.datasources[datasource]["path"] for datasource in self.datasources]] # extension datasources will be here and the above line...
108
109        root_match = re.compile(r"^%s" % re.escape(str(config.get('PATH_ROOT'))))
110        root_path = Path(config.get('PATH_ROOT'))
111
112        for folder in paths:
113            # loop through folders, and files in those folders, recursively
114            is_extension = extension_path in folder.parents or folder == extension_path
115            for file in folder.rglob("*.py"):
116                # determine module name for file
117                # reduce path to be relative to 4CAT root
118                module_name = ".".join(file.parts[len(root_path.parts):-1] + (file.stem,))
119
120                # check if we've already loaded this module
121                if module_name in self.ignore:
122                    continue
123
124                if module_name in sys.modules:
125                    # This skips processors/datasources that were loaded by others and may not yet be captured
126                    pass
127
128                # try importing
129                try:
130                    module = importlib.import_module(module_name)
131                except (SyntaxError, ImportError) as e:
132                    # this is fine, just ignore this data source and give a heads up
133                    self.ignore.append(module_name)
134                    key_name = e.name if hasattr(e, "name") else module_name
135                    if key_name not in self.missing_modules:
136                        self.missing_modules[key_name] = [module_name]
137                    else:
138                        self.missing_modules[key_name].append(module_name)
139                    continue
140
141                # see if module contains the right type of content by looping
142                # through all of its members
143                components = inspect.getmembers(module, predicate=self.is_4cat_class)
144                for component in components:
145                    if component[1].type in self.workers:
146                        # already indexed
147                        continue
148
149                    # extract data that is useful for the scheduler and other
150                    # parts of 4CAT
151                    relative_path = root_match.sub("", str(file))
152
153                    self.workers[component[1].type] = component[1]
154                    self.workers[component[1].type].filepath = relative_path
155                    self.workers[component[1].type].is_extension = is_extension
156
157                    # we can't use issubclass() because for that we would need
158                    # to import BasicProcessor, which would lead to a circular
159                    # import
160                    if self.is_4cat_class(component[1], only_processors=True):
161                        self.processors[component[1].type] = self.workers[component[1].type]
162
163        # sort by category for more convenient display in interfaces
164        sorted_processors = {id: self.processors[id] for id in
165                             sorted(self.processors)}
166        categorised_processors = {id: sorted_processors[id] for id in
167                                  sorted(sorted_processors,
168                                         key=lambda item: "0" if sorted_processors[item].category == "Presets" else
169                                         sorted_processors[item].category)}
170
171        # Give a heads-up if not all modules were installed properly
172        if self.missing_modules:
173            warning = "Warning: Not all modules could be found, which might cause data sources and modules to not " \
174                      "function.\nMissing modules:\n"
175            for missing_module, processor_list in self.missing_modules.items():
176                warning += "\t%s (for %s)\n" % (missing_module, ", ".join(processor_list))
177
178            self.log_buffer += warning
179
180        self.processors = categorised_processors

Load modules

Modules are workers and (as a subset of workers) postprocessors. These are found by importing any python files found in the given locations, and looking for relevant classes within those python files, that extend BasicProcessor or BasicWorker and are not abstract.

def load_datasources(self): View Source

182    def load_datasources(self):
183        """
184        Load datasources
185
186        This looks for folders within the datasource root folder that contain
187        an `__init__.py` defining an `init_datasource` function and a
188        `DATASOURCE` constant. The latter is taken as the ID for this
189        datasource.
190        """
191        def _load_datasource(subdirectory):
192            """
193            Load a single datasource
194            """
195            # determine module name (path relative to 4CAT root w/ periods)
196            module_name = ".".join(subdirectory.relative_to(Path(config.get("PATH_ROOT"))).parts)
197            try:
198                datasource = importlib.import_module(module_name)
199            except ImportError as e:
200                self.log_buffer += "Could not import %s: %s\n" % (module_name, e)
201                return
202
203            if not hasattr(datasource, "init_datasource") or not hasattr(datasource, "DATASOURCE"):
204                self.log_buffer += "Could not load datasource %s: missing init_datasource or DATASOURCE\n" % subdirectory
205                return
206
207            datasource_id = datasource.DATASOURCE
208
209            self.datasources[datasource_id] = {
210                "expire-datasets": config.get("datasources.expiration", {}).get(datasource_id, None),
211                "path": subdirectory,
212                "name": datasource.NAME if hasattr(datasource, "NAME") else datasource_id,
213                "id": subdirectory.parts[-1],
214                "init": datasource.init_datasource,
215                "config": {} if not hasattr(datasource, "config") else datasource.config
216            }
217
218        # Load 4CAT core datasources
219        for subdirectory in Path(config.get('PATH_ROOT'), "datasources").iterdir():
220            if subdirectory.is_dir():
221                _load_datasource(subdirectory)
222
223        # Load extension datasources
224        # os.walk is used to allow for the possibility of multiple extensions, with nested "datasources" folders
225        for root, dirs, files in os.walk(Path(config.get('PATH_ROOT'), "extensions"), followlinks=True):
226            if "datasources" in dirs:
227                for subdirectory in Path(root, "datasources").iterdir():
228                    if subdirectory.is_dir():
229                        _load_datasource(subdirectory)
230
231        sorted_datasources = {datasource_id: self.datasources[datasource_id] for datasource_id in
232                              sorted(self.datasources, key=lambda id: self.datasources[id]["name"])}
233        self.datasources = sorted_datasources

Load datasources

This looks for folders within the datasource root folder that contain an __init__.py defining an init_datasource function and a DATASOURCE constant. The latter is taken as the ID for this datasource.

def expand_datasources(self): View Source

235    def expand_datasources(self):
236        """
237        Expand datasource metadata
238
239        Some datasource metadata can only be known after all workers have been
240        loaded, e.g. whether there is a search worker for the datasource. This
241        function takes care of populating those values.
242        """
243        for datasource_id in self.datasources:
244            worker = self.workers.get("%s-search" % datasource_id)
245            self.datasources[datasource_id]["has_worker"] = bool(worker)
246            self.datasources[datasource_id]["has_options"] = self.datasources[datasource_id]["has_worker"] and \
247                                                             bool(self.workers["%s-search" % datasource_id].get_options())
248            self.datasources[datasource_id]["importable"] = worker and hasattr(worker, "is_from_zeeschuimer") and worker.is_from_zeeschuimer

Expand datasource metadata

Some datasource metadata can only be known after all workers have been loaded, e.g. whether there is a search worker for the datasource. This function takes care of populating those values.

def load_worker_class(self, worker): View Source

250    def load_worker_class(self, worker):
251        """
252        Get class for worker
253
254        This import worker modules on-demand, so the code is only loaded if a
255        worker that needs the code is actually queued and run
256
257        :return:  Worker class for the given worker metadata
258        """
259        module = worker["module"]
260        if module not in sys.modules:
261            importlib.import_module(module)
262
263        return getattr(sys.modules[module], worker["class_name"])

Get class for worker

This import worker modules on-demand, so the code is only loaded if a worker that needs the code is actually queued and run

Returns

Worker class for the given worker metadata