common.lib.module_loader
Load modules and datasources dynamically
1""" 2Load modules and datasources dynamically 3""" 4from pathlib import Path 5import importlib 6import inspect 7import pickle 8import sys 9import re 10import os 11 12from common.config_manager import config 13 14 15class ModuleCollector: 16 """ 17 Collects all modular appendages of 4CAT 18 19 On init, an object of this class collects all datasources and workers that 20 have been added to this 4CAT installation. The metadata of these is then 21 stored for later access. 22 23 Datasources are found in the "datasources" folder in root. Workers are 24 found in datasource folders or the default "processors" and 25 "backend/workers" folder. All these folders are scanned for both 26 processors and workers (processors being a specific kind of worker). 27 """ 28 ignore = [] 29 missing_modules = {} 30 log_buffer = None 31 32 PROCESSOR = 1 33 WORKER = 2 34 35 workers = {} 36 processors = {} 37 datasources = {} 38 39 def __init__(self, write_config=False): 40 """ 41 Load data sources and workers 42 43 Datasources are loaded first so that the datasource folders may be 44 scanned for workers subsequently. 45 """ 46 # this can be flushed later once the logger is available 47 self.log_buffer = "" 48 49 self.load_datasources() 50 self.load_modules() 51 52 # now we know all workers, we can add some extra metadata to the 53 # datasources, e.g. whether they have an associated search worker 54 self.expand_datasources() 55 56 # cache module-defined config options for use by the config manager 57 if write_config: 58 module_config = {} 59 for worker in self.workers.values(): 60 if hasattr(worker, "config") and type(worker.config) is dict: 61 module_config.update(worker.config) 62 63 with config.get("PATH_ROOT").joinpath("config/module_config.bin").open("wb") as outfile: 64 pickle.dump(module_config, outfile) 65 66 # load from cache 67 config.load_user_settings() 68 69 @staticmethod 70 def is_4cat_class(object, only_processors=False): 71 """ 72 Determine if a module member is a worker class we can use 73 """ 74 if inspect.isclass(object): 75 if object.__name__ in("BasicProcessor", "BasicWorker") or inspect.isabstract(object): 76 # ignore abstract and base classes 77 return False 78 79 if hasattr(object, "is_4cat_class"): 80 if only_processors: 81 if hasattr(object, "is_4cat_processor"): 82 return object.is_4cat_processor() 83 else: 84 return False 85 else: 86 return object.is_4cat_class() 87 88 return False 89 90 def load_modules(self): 91 """ 92 Load modules 93 94 Modules are workers and (as a subset of workers) postprocessors. These 95 are found by importing any python files found in the given locations, 96 and looking for relevant classes within those python files, that extend 97 `BasicProcessor` or `BasicWorker` and are not abstract. 98 """ 99 # look for workers and processors in pre-defined folders and datasources 100 101 extension_path = Path(config.get('PATH_ROOT'), "extensions") 102 103 paths = [Path(config.get('PATH_ROOT'), "processors"), 104 Path(config.get('PATH_ROOT'), "backend", "workers"), 105 extension_path, 106 *[self.datasources[datasource]["path"] for datasource in self.datasources]] # extension datasources will be here and the above line... 
107 108 root_match = re.compile(r"^%s" % re.escape(str(config.get('PATH_ROOT')))) 109 root_path = Path(config.get('PATH_ROOT')) 110 111 for folder in paths: 112 # loop through folders, and files in those folders, recursively 113 is_extension = extension_path in folder.parents or folder == extension_path 114 for file in folder.rglob("*.py"): 115 # determine module name for file 116 # reduce path to be relative to 4CAT root 117 module_name = ".".join(file.parts[len(root_path.parts):-1] + (file.stem,)) 118 119 # check if we've already loaded this module 120 if module_name in self.ignore: 121 continue 122 123 if module_name in sys.modules: 124 # This skips processors/datasources that were loaded by others and may not yet be captured 125 pass 126 127 # try importing 128 try: 129 module = importlib.import_module(module_name) 130 except (SyntaxError, ImportError) as e: 131 # this is fine, just ignore this data source and give a heads up 132 self.ignore.append(module_name) 133 key_name = e.name if hasattr(e, "name") else module_name 134 if key_name not in self.missing_modules: 135 self.missing_modules[key_name] = [module_name] 136 else: 137 self.missing_modules[key_name].append(module_name) 138 continue 139 140 # see if module contains the right type of content by looping 141 # through all of its members 142 components = inspect.getmembers(module, predicate=self.is_4cat_class) 143 for component in components: 144 if component[1].type in self.workers: 145 # already indexed 146 continue 147 148 # extract data that is useful for the scheduler and other 149 # parts of 4CAT 150 relative_path = root_match.sub("", str(file)) 151 152 self.workers[component[1].type] = component[1] 153 self.workers[component[1].type].filepath = relative_path 154 self.workers[component[1].type].is_extension = is_extension 155 156 # we can't use issubclass() because for that we would need 157 # to import BasicProcessor, which would lead to a circular 158 # import 159 if self.is_4cat_class(component[1], only_processors=True): 160 self.processors[component[1].type] = self.workers[component[1].type] 161 162 # sort by category for more convenient display in interfaces 163 sorted_processors = {id: self.processors[id] for id in 164 sorted(self.processors)} 165 categorised_processors = {id: sorted_processors[id] for id in 166 sorted(sorted_processors, 167 key=lambda item: "0" if sorted_processors[item].category == "Presets" else 168 sorted_processors[item].category)} 169 170 # Give a heads-up if not all modules were installed properly 171 if self.missing_modules: 172 warning = "Warning: Not all modules could be found, which might cause data sources and modules to not " \ 173 "function.\nMissing modules:\n" 174 for missing_module, processor_list in self.missing_modules.items(): 175 warning += "\t%s (for %s)\n" % (missing_module, ", ".join(processor_list)) 176 177 self.log_buffer += warning 178 179 self.processors = categorised_processors 180 181 def load_datasources(self): 182 """ 183 Load datasources 184 185 This looks for folders within the datasource root folder that contain 186 an `__init__.py` defining an `init_datasource` function and a 187 `DATASOURCE` constant. The latter is taken as the ID for this 188 datasource. 
189 """ 190 def _load_datasource(subdirectory): 191 """ 192 Load a single datasource 193 """ 194 # determine module name (path relative to 4CAT root w/ periods) 195 module_name = ".".join(subdirectory.relative_to(Path(config.get("PATH_ROOT"))).parts) 196 try: 197 datasource = importlib.import_module(module_name) 198 except ImportError as e: 199 self.log_buffer += "Could not import %s: %s\n" % (module_name, e) 200 return 201 202 if not hasattr(datasource, "init_datasource") or not hasattr(datasource, "DATASOURCE"): 203 self.log_buffer += "Could not load datasource %s: missing init_datasource or DATASOURCE\n" % subdirectory 204 return 205 206 datasource_id = datasource.DATASOURCE 207 208 self.datasources[datasource_id] = { 209 "expire-datasets": config.get("datasources.expiration", {}).get(datasource_id, None), 210 "path": subdirectory, 211 "name": datasource.NAME if hasattr(datasource, "NAME") else datasource_id, 212 "id": subdirectory.parts[-1], 213 "init": datasource.init_datasource, 214 "config": {} if not hasattr(datasource, "config") else datasource.config 215 } 216 217 # Load 4CAT core datasources 218 for subdirectory in Path(config.get('PATH_ROOT'), "datasources").iterdir(): 219 if subdirectory.is_dir(): 220 _load_datasource(subdirectory) 221 222 # Load extension datasources 223 # os.walk is used to allow for the possibility of multiple extensions, with nested "datasources" folders 224 for root, dirs, files in os.walk(Path(config.get('PATH_ROOT'), "extensions"), followlinks=True): 225 if "datasources" in dirs: 226 for subdirectory in Path(root, "datasources").iterdir(): 227 if subdirectory.is_dir(): 228 _load_datasource(subdirectory) 229 230 sorted_datasources = {datasource_id: self.datasources[datasource_id] for datasource_id in 231 sorted(self.datasources, key=lambda id: self.datasources[id]["name"])} 232 self.datasources = sorted_datasources 233 234 def expand_datasources(self): 235 """ 236 Expand datasource metadata 237 238 Some datasource metadata can only be known after all workers have been 239 loaded, e.g. whether there is a search worker for the datasource. This 240 function takes care of populating those values. 241 """ 242 for datasource_id in self.datasources: 243 worker = self.workers.get("%s-search" % datasource_id) 244 self.datasources[datasource_id]["has_worker"] = bool(worker) 245 self.datasources[datasource_id]["has_options"] = self.datasources[datasource_id]["has_worker"] and \ 246 bool(self.workers["%s-search" % datasource_id].get_options()) 247 self.datasources[datasource_id]["importable"] = worker and hasattr(worker, "is_from_zeeschuimer") and worker.is_from_zeeschuimer 248 249 def load_worker_class(self, worker): 250 """ 251 Get class for worker 252 253 This import worker modules on-demand, so the code is only loaded if a 254 worker that needs the code is actually queued and run 255 256 :return: Worker class for the given worker metadata 257 """ 258 module = worker["module"] 259 if module not in sys.modules: 260 importlib.import_module(module) 261 262 return getattr(sys.modules[module], worker["class_name"])
`class ModuleCollector`
Collects all modular appendages of 4CAT
On init, an object of this class collects all datasources and workers that have been added to this 4CAT installation. The metadata of these is then stored for later access.
Datasources are found in the "datasources" folder in root. Workers are found in datasource folders or the default "processors" and "backend/workers" folder. All these folders are scanned for both processors and workers (processors being a specific kind of worker).
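As a usage sketch (assuming a fully configured 4CAT installation, since the collector reads `PATH_ROOT` and other values from `common.config_manager` at import time), the collector is typically instantiated once at startup and its mappings read afterwards:

```python
from common.lib.module_loader import ModuleCollector

# scan datasources, workers and processors once; write_config=True also
# caches module-defined config options for the config manager
modules = ModuleCollector(write_config=True)

print("datasources:", list(modules.datasources))
print("workers:", list(modules.workers))
print("processors:", list(modules.processors))

# import problems are buffered until a logger is available
if modules.log_buffer:
    print(modules.log_buffer)
```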
`def __init__(self, write_config=False)`
Load data sources and workers
Datasources are loaded first so that the datasource folders may be scanned for workers subsequently.
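When `write_config` is set, the collected worker config options are pickled to `config/module_config.bin` under the 4CAT root. A minimal sketch of reading that cache back, mirroring the dump in the source above (`path_root` stands in for whatever `config.get("PATH_ROOT")` resolves to):

```python
import pickle
from pathlib import Path

def read_module_config(path_root):
    """Return the cached module-defined config options, or {} if absent."""
    cache_file = Path(path_root, "config", "module_config.bin")
    if not cache_file.exists():
        return {}
    with cache_file.open("rb") as infile:
        return pickle.load(infile)
```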
`def is_4cat_class(object, only_processors=False)` (static method)
Determine if a module member is a worker class we can use
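To illustrate what the predicate accepts, here is a hypothetical minimal class; the `is_4cat_class`/`is_4cat_processor` classmethods stand in for what `BasicWorker` and `BasicProcessor` normally provide, so the class below is an assumption for demonstration only:

```python
from common.lib.module_loader import ModuleCollector

class ExampleWorker:
    """Hypothetical worker that the predicate would recognise"""
    type = "example-worker"  # the key under which it would be indexed

    @classmethod
    def is_4cat_class(cls):
        # in real code this is inherited from BasicWorker
        return True

    @classmethod
    def is_4cat_processor(cls):
        # in real code this is inherited from BasicProcessor;
        # False marks a plain worker rather than a processor
        return False

# a concrete, non-base class exposing is_4cat_class() passes the check
assert ModuleCollector.is_4cat_class(ExampleWorker)
# with only_processors=True, is_4cat_processor() decides instead
assert not ModuleCollector.is_4cat_class(ExampleWorker, only_processors=True)
```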
`def load_modules(self)`
Load modules

Modules are workers and (as a subset of workers) postprocessors. These are found by importing any Python files found in the given locations and looking for relevant classes within those files that extend `BasicProcessor` or `BasicWorker` and are not abstract.
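The module name used for the import is derived from the file's path relative to the 4CAT root, with separators replaced by periods and the `.py` suffix dropped. A standalone illustration of that derivation, using hypothetical paths:

```python
from pathlib import Path

root_path = Path("/opt/4cat")                         # hypothetical PATH_ROOT
file = Path("/opt/4cat/processors/text/tokenise.py")  # hypothetical worker file

# same expression as in load_modules(): strip the root, drop the ".py"
module_name = ".".join(file.parts[len(root_path.parts):-1] + (file.stem,))
print(module_name)  # -> "processors.text.tokenise"
```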
`def load_datasources(self)`
Load datasources

This looks for folders within the datasource root folder that contain an `__init__.py` defining an `init_datasource` function and a `DATASOURCE` constant. The latter is taken as the ID for this datasource.
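A hypothetical minimal `datasources/example/__init__.py` satisfying these requirements could look as follows. Note that `load_datasources()` only checks that `init_datasource` exists, so the signature below is an assumption; `NAME` and `config` are optional and fall back to the datasource ID and `{}` respectively:

```python
"""
Example datasource (hypothetical)
"""
DATASOURCE = "example"       # used as the datasource ID
NAME = "Example datasource"  # optional human-readable name

def init_datasource(*args, **kwargs):
    """
    Initialise this datasource, e.g. by creating database tables.
    The parameter list is an assumption; only the attribute's
    existence is verified at load time.
    """
    pass
```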
`def expand_datasources(self)`
Expand datasource metadata
Some datasource metadata can only be known after all workers have been loaded, e.g. whether there is a search worker for the datasource. This function takes care of populating those values.
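A sketch of reading the expanded metadata, again assuming a configured installation: `has_worker` signals that a `<datasource id>-search` worker exists, `has_options` that this worker exposes query options, and `importable` that it accepts Zeeschuimer imports.

```python
from common.lib.module_loader import ModuleCollector

modules = ModuleCollector()  # expand_datasources() runs during __init__

for datasource_id, metadata in modules.datasources.items():
    if not metadata["has_worker"]:
        continue  # no "<id>-search" worker, so nothing to query
    print(datasource_id,
          "has options:", metadata["has_options"],
          "zeeschuimer-importable:", metadata["importable"])
```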
`def load_worker_class(self, worker)`
Get class for worker
This imports worker modules on demand, so the code is only loaded if a worker that needs it is actually queued and run.

Returns: the worker class for the given worker metadata.
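The `worker` argument is expected to be a mapping with `module` and `class_name` keys. A hedged usage sketch with hypothetical values (in practice this metadata comes from the worker registry rather than being written by hand):

```python
from common.lib.module_loader import ModuleCollector

modules = ModuleCollector()

worker_metadata = {
    "module": "backend.workers.example_worker",  # hypothetical module path
    "class_name": "ExampleWorker",               # hypothetical class name
}

# imports backend.workers.example_worker only now, on demand, then
# returns the class object (not an instance)
worker_class = modules.load_worker_class(worker_metadata)
```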