"""
Load modules and datasources dynamically
"""
from pathlib import Path
import importlib
import inspect
import pickle
import sys
import re
import os


class ModuleCollector:
    """
    Collects all modular appendages of 4CAT

    On init, an object of this class collects all datasources and workers that
    have been added to this 4CAT installation. The metadata of these is then
    stored for later access.

    Datasources are found in the "datasources" folder in root. Workers are
    found in datasource folders or the default "processors" and
    "backend/workers" folder. All these folders are scanned for both
    processors and workers (processors being a specific kind of worker).
    """
    # class-level defaults, kept for backwards compatibility; the mutable ones
    # are re-initialised per instance in __init__ so that separate
    # ModuleCollector instances do not share state
    ignore = []
    missing_modules = {}
    log_buffer = None
    config = None

    # worker type constants
    PROCESSOR = 1
    WORKER = 2

    workers = {}
    processors = {}
    datasources = {}

    def __init__(self, config, write_cache=False):
        """
        Load data sources and workers

        Datasources are loaded first so that the datasource folders may be
        scanned for workers subsequently.

        :param config: Configuration manager, shared with the rest of the
        context
        :param bool write_cache: Write modules to cache file?
        """
        # fresh mutable state per instance (the class attributes above would
        # otherwise be shared between all ModuleCollector instances)
        self.ignore = []
        self.missing_modules = {}
        self.workers = {}
        self.processors = {}
        self.datasources = {}

        # this can be flushed later once the logger is available
        self.log_buffer = ""
        self.config = config

        self.load_datasources()
        self.load_modules()

        # now we know all workers, we can add some extra metadata to the
        # datasources, e.g. whether they have an associated search worker
        self.expand_datasources()

        # cache module-defined config options for use by the config manager
        if write_cache:
            module_config = {}
            for worker in self.workers.values():
                if hasattr(worker, "config") and type(worker.config) is dict:
                    module_config.update(worker.config)

            with config.get("PATH_ROOT").joinpath("config/module_config.bin").open("wb") as outfile:
                pickle.dump(module_config, outfile)

        # load from cache
        self.config.load_user_settings()

    @staticmethod
    def is_4cat_class(object, only_processors=False):
        """
        Determine if a module member is a worker class we can use

        :param object:  Module member to check
        :param bool only_processors:  Only accept processors, not other workers
        :return bool:  Whether the member is a usable worker/processor class
        """
        if inspect.isclass(object):
            if object.__name__ in ("BasicProcessor", "BasicWorker") or inspect.isabstract(object):
                # ignore abstract and base classes
                return False

            if hasattr(object, "is_4cat_class"):
                if only_processors:
                    if hasattr(object, "is_4cat_processor"):
                        return object.is_4cat_processor()
                    else:
                        return False
                else:
                    return object.is_4cat_class()

        return False

    def load_modules(self):
        """
        Load modules

        Modules are workers and (as a subset of workers) postprocessors. These
        are found by importing any python files found in the given locations,
        and looking for relevant classes within those python files, that extend
        `BasicProcessor` or `BasicWorker` and are not abstract.
        """
        # look for workers and processors in pre-defined folders and datasources
        extension_path = Path(self.config.get('PATH_ROOT'), "extensions")

        paths = [Path(self.config.get('PATH_ROOT'), "processors"),
                 Path(self.config.get('PATH_ROOT'), "backend", "workers"),
                 extension_path,
                 *[self.datasources[datasource]["path"] for datasource in self.datasources]]

        root_match = re.compile(r"^%s" % re.escape(str(self.config.get('PATH_ROOT'))))
        root_path = Path(self.config.get('PATH_ROOT'))

        for folder in paths:
            # loop through folders, and files in those folders, recursively
            is_extension = extension_path in folder.parents or folder == extension_path
            for file in folder.rglob("*.py"):
                # determine module name for file: path relative to the 4CAT
                # root, with periods
                module_name = ".".join(file.parts[len(root_path.parts):-1] + (file.stem,))

                # skip modules that already failed to import earlier
                if module_name in self.ignore:
                    continue

                # try importing
                try:
                    module = importlib.import_module(module_name)
                except (SyntaxError, ImportError) as e:
                    # this is fine, just ignore this data source and give a heads up
                    self.ignore.append(module_name)
                    # e.name may be absent or None (e.g. for SyntaxError); fall
                    # back to the module name so we never key on None
                    key_name = getattr(e, "name", None) or module_name
                    if key_name not in self.missing_modules:
                        self.missing_modules[key_name] = [module_name]
                    else:
                        self.missing_modules[key_name].append(module_name)
                    continue

                # see if module contains the right type of content by looping
                # through all of its members
                components = inspect.getmembers(module, predicate=self.is_4cat_class)
                for component in components:
                    if component[1].type in self.workers:
                        # already indexed
                        continue

                    # extract data that is useful for the scheduler and other
                    # parts of 4CAT
                    relative_path = root_match.sub("", str(file))

                    self.workers[component[1].type] = component[1]
                    self.workers[component[1].type].filepath = relative_path
                    self.workers[component[1].type].is_extension = is_extension

                    # we can't use issubclass() because for that we would need
                    # to import BasicProcessor, which would lead to a circular
                    # import
                    if self.is_4cat_class(component[1], only_processors=True):
                        self.processors[component[1].type] = self.workers[component[1].type]

        # sort by category for more convenient display in interfaces
        sorted_processors = {id: self.processors[id] for id in sorted(self.processors)}
        categorised_processors = {id: sorted_processors[id] for id in
                                  sorted(sorted_processors,
                                         key=lambda item: "0" if sorted_processors[item].category == "Presets" else
                                         sorted_processors[item].category)}

        # Give a heads-up if not all modules were installed properly
        if self.missing_modules:
            warning = "Warning: Not all modules could be found, which might cause data sources and modules to not " \
                      "function.\nMissing modules:\n"
            for missing_module, processor_list in self.missing_modules.items():
                warning += "\t%s (for %s)\n" % (missing_module, ", ".join(processor_list))

            self.log_buffer += warning

        self.processors = categorised_processors

    def load_datasources(self):
        """
        Load datasources

        This looks for folders within the datasource root folder that contain
        an `__init__.py` defining an `init_datasource` function and a
        `DATASOURCE` constant. The latter is taken as the ID for this
        datasource.
        """
        def _load_datasource(subdirectory, expiration):
            """
            Load a single datasource from the given folder
            """
            # determine module name (path relative to 4CAT root w/ periods)
            module_name = ".".join(subdirectory.relative_to(Path(self.config.get("PATH_ROOT"))).parts)
            try:
                datasource = importlib.import_module(module_name)
            except ImportError as e:
                self.log_buffer += "Could not import %s: %s\n" % (module_name, e)
                return

            if not hasattr(datasource, "init_datasource") or not hasattr(datasource, "DATASOURCE"):
                self.log_buffer += "Could not load datasource %s: missing init_datasource or DATASOURCE\n" % subdirectory
                return

            datasource_id = datasource.DATASOURCE

            self.datasources[datasource_id] = {
                "expire-datasets": expiration.get(datasource_id, None),
                "path": subdirectory,
                "name": datasource.NAME if hasattr(datasource, "NAME") else datasource_id,
                "id": subdirectory.parts[-1],
                "init": datasource.init_datasource,
                "config": {} if not hasattr(datasource, "config") else datasource.config
            }

        expiration = self.config.get("datasources.expiration", {})

        # Load 4CAT core datasources
        for subdirectory in Path(self.config.get('PATH_ROOT'), "datasources").iterdir():
            if subdirectory.is_dir():
                _load_datasource(subdirectory, expiration)

        # Load extension datasources
        # os.walk is used to allow for the possibility of multiple extensions,
        # with nested "datasources" folders
        for root, dirs, files in os.walk(Path(self.config.get('PATH_ROOT'), "extensions"), followlinks=True):
            if "datasources" in dirs:
                for subdirectory in Path(root, "datasources").iterdir():
                    if subdirectory.is_dir():
                        _load_datasource(subdirectory, expiration)

        # sort datasources by display name for consistent presentation
        sorted_datasources = {datasource_id: self.datasources[datasource_id] for datasource_id in
                              sorted(self.datasources, key=lambda id: self.datasources[id]["name"])}
        self.datasources = sorted_datasources

    def expand_datasources(self):
        """
        Expand datasource metadata

        Some datasource metadata can only be known after all workers have been
        loaded, e.g. whether there is a search worker for the datasource. This
        function takes care of populating those values.
        """
        for datasource_id in self.datasources:
            worker = self.workers.get("%s-search" % datasource_id)
            self.datasources[datasource_id]["has_worker"] = bool(worker)
            self.datasources[datasource_id]["has_options"] = self.datasources[datasource_id]["has_worker"] and \
                bool(self.workers["%s-search" % datasource_id].get_options(config=self.config))
            # normalise to a proper boolean (a chained `and` would otherwise
            # yield None when there is no worker)
            self.datasources[datasource_id]["importable"] = bool(
                worker and getattr(worker, "is_from_zeeschuimer", False))

    def load_worker_class(self, worker):
        """
        Get class for worker

        This imports worker modules on-demand, so the code is only loaded if a
        worker that needs the code is actually queued and run

        :param worker:  Worker metadata, with "module" and "class_name" keys
        :return: Worker class for the given worker metadata
        """
        module = worker["module"]
        if module not in sys.modules:
            importlib.import_module(module)

        return getattr(sys.modules[module], worker["class_name"])
class ModuleCollector:
    """
    Collects all modular appendages of 4CAT

    On init, an object of this class collects all datasources and workers that
    have been added to this 4CAT installation. The metadata of these is then
    stored for later access.

    Datasources are found in the "datasources" folder in root. Workers are
    found in datasource folders or the default "processors" and
    "backend/workers" folder. All these folders are scanned for both
    processors and workers (processors being a specific kind of worker).
    """
    # class-level defaults, kept for backwards compatibility; the mutable ones
    # are re-initialised per instance in __init__ so that separate
    # ModuleCollector instances do not share state
    ignore = []
    missing_modules = {}
    log_buffer = None
    config = None

    # worker type constants
    PROCESSOR = 1
    WORKER = 2

    workers = {}
    processors = {}
    datasources = {}

    def __init__(self, config, write_cache=False):
        """
        Load data sources and workers

        Datasources are loaded first so that the datasource folders may be
        scanned for workers subsequently.

        :param config: Configuration manager, shared with the rest of the
        context
        :param bool write_cache: Write modules to cache file?
        """
        # fresh mutable state per instance (the class attributes above would
        # otherwise be shared between all ModuleCollector instances)
        self.ignore = []
        self.missing_modules = {}
        self.workers = {}
        self.processors = {}
        self.datasources = {}

        # this can be flushed later once the logger is available
        self.log_buffer = ""
        self.config = config

        self.load_datasources()
        self.load_modules()

        # now we know all workers, we can add some extra metadata to the
        # datasources, e.g. whether they have an associated search worker
        self.expand_datasources()

        # cache module-defined config options for use by the config manager
        if write_cache:
            module_config = {}
            for worker in self.workers.values():
                if hasattr(worker, "config") and type(worker.config) is dict:
                    module_config.update(worker.config)

            with config.get("PATH_ROOT").joinpath("config/module_config.bin").open("wb") as outfile:
                pickle.dump(module_config, outfile)

        # load from cache
        self.config.load_user_settings()

    @staticmethod
    def is_4cat_class(object, only_processors=False):
        """
        Determine if a module member is a worker class we can use

        :param object:  Module member to check
        :param bool only_processors:  Only accept processors, not other workers
        :return bool:  Whether the member is a usable worker/processor class
        """
        if inspect.isclass(object):
            if object.__name__ in ("BasicProcessor", "BasicWorker") or inspect.isabstract(object):
                # ignore abstract and base classes
                return False

            if hasattr(object, "is_4cat_class"):
                if only_processors:
                    if hasattr(object, "is_4cat_processor"):
                        return object.is_4cat_processor()
                    else:
                        return False
                else:
                    return object.is_4cat_class()

        return False

    def load_modules(self):
        """
        Load modules

        Modules are workers and (as a subset of workers) postprocessors. These
        are found by importing any python files found in the given locations,
        and looking for relevant classes within those python files, that extend
        `BasicProcessor` or `BasicWorker` and are not abstract.
        """
        # look for workers and processors in pre-defined folders and datasources
        extension_path = Path(self.config.get('PATH_ROOT'), "extensions")

        paths = [Path(self.config.get('PATH_ROOT'), "processors"),
                 Path(self.config.get('PATH_ROOT'), "backend", "workers"),
                 extension_path,
                 *[self.datasources[datasource]["path"] for datasource in self.datasources]]

        root_match = re.compile(r"^%s" % re.escape(str(self.config.get('PATH_ROOT'))))
        root_path = Path(self.config.get('PATH_ROOT'))

        for folder in paths:
            # loop through folders, and files in those folders, recursively
            is_extension = extension_path in folder.parents or folder == extension_path
            for file in folder.rglob("*.py"):
                # determine module name for file: path relative to the 4CAT
                # root, with periods
                module_name = ".".join(file.parts[len(root_path.parts):-1] + (file.stem,))

                # skip modules that already failed to import earlier
                if module_name in self.ignore:
                    continue

                # try importing
                try:
                    module = importlib.import_module(module_name)
                except (SyntaxError, ImportError) as e:
                    # this is fine, just ignore this data source and give a heads up
                    self.ignore.append(module_name)
                    # e.name may be absent or None (e.g. for SyntaxError); fall
                    # back to the module name so we never key on None
                    key_name = getattr(e, "name", None) or module_name
                    if key_name not in self.missing_modules:
                        self.missing_modules[key_name] = [module_name]
                    else:
                        self.missing_modules[key_name].append(module_name)
                    continue

                # see if module contains the right type of content by looping
                # through all of its members
                components = inspect.getmembers(module, predicate=self.is_4cat_class)
                for component in components:
                    if component[1].type in self.workers:
                        # already indexed
                        continue

                    # extract data that is useful for the scheduler and other
                    # parts of 4CAT
                    relative_path = root_match.sub("", str(file))

                    self.workers[component[1].type] = component[1]
                    self.workers[component[1].type].filepath = relative_path
                    self.workers[component[1].type].is_extension = is_extension

                    # we can't use issubclass() because for that we would need
                    # to import BasicProcessor, which would lead to a circular
                    # import
                    if self.is_4cat_class(component[1], only_processors=True):
                        self.processors[component[1].type] = self.workers[component[1].type]

        # sort by category for more convenient display in interfaces
        sorted_processors = {id: self.processors[id] for id in sorted(self.processors)}
        categorised_processors = {id: sorted_processors[id] for id in
                                  sorted(sorted_processors,
                                         key=lambda item: "0" if sorted_processors[item].category == "Presets" else
                                         sorted_processors[item].category)}

        # Give a heads-up if not all modules were installed properly
        if self.missing_modules:
            warning = "Warning: Not all modules could be found, which might cause data sources and modules to not " \
                      "function.\nMissing modules:\n"
            for missing_module, processor_list in self.missing_modules.items():
                warning += "\t%s (for %s)\n" % (missing_module, ", ".join(processor_list))

            self.log_buffer += warning

        self.processors = categorised_processors

    def load_datasources(self):
        """
        Load datasources

        This looks for folders within the datasource root folder that contain
        an `__init__.py` defining an `init_datasource` function and a
        `DATASOURCE` constant. The latter is taken as the ID for this
        datasource.
        """
        def _load_datasource(subdirectory, expiration):
            """
            Load a single datasource from the given folder
            """
            # determine module name (path relative to 4CAT root w/ periods)
            module_name = ".".join(subdirectory.relative_to(Path(self.config.get("PATH_ROOT"))).parts)
            try:
                datasource = importlib.import_module(module_name)
            except ImportError as e:
                self.log_buffer += "Could not import %s: %s\n" % (module_name, e)
                return

            if not hasattr(datasource, "init_datasource") or not hasattr(datasource, "DATASOURCE"):
                self.log_buffer += "Could not load datasource %s: missing init_datasource or DATASOURCE\n" % subdirectory
                return

            datasource_id = datasource.DATASOURCE

            self.datasources[datasource_id] = {
                "expire-datasets": expiration.get(datasource_id, None),
                "path": subdirectory,
                "name": datasource.NAME if hasattr(datasource, "NAME") else datasource_id,
                "id": subdirectory.parts[-1],
                "init": datasource.init_datasource,
                "config": {} if not hasattr(datasource, "config") else datasource.config
            }

        expiration = self.config.get("datasources.expiration", {})

        # Load 4CAT core datasources
        for subdirectory in Path(self.config.get('PATH_ROOT'), "datasources").iterdir():
            if subdirectory.is_dir():
                _load_datasource(subdirectory, expiration)

        # Load extension datasources
        # os.walk is used to allow for the possibility of multiple extensions,
        # with nested "datasources" folders
        for root, dirs, files in os.walk(Path(self.config.get('PATH_ROOT'), "extensions"), followlinks=True):
            if "datasources" in dirs:
                for subdirectory in Path(root, "datasources").iterdir():
                    if subdirectory.is_dir():
                        _load_datasource(subdirectory, expiration)

        # sort datasources by display name for consistent presentation
        sorted_datasources = {datasource_id: self.datasources[datasource_id] for datasource_id in
                              sorted(self.datasources, key=lambda id: self.datasources[id]["name"])}
        self.datasources = sorted_datasources

    def expand_datasources(self):
        """
        Expand datasource metadata

        Some datasource metadata can only be known after all workers have been
        loaded, e.g. whether there is a search worker for the datasource. This
        function takes care of populating those values.
        """
        for datasource_id in self.datasources:
            worker = self.workers.get("%s-search" % datasource_id)
            self.datasources[datasource_id]["has_worker"] = bool(worker)
            self.datasources[datasource_id]["has_options"] = self.datasources[datasource_id]["has_worker"] and \
                bool(self.workers["%s-search" % datasource_id].get_options(config=self.config))
            # normalise to a proper boolean (a chained `and` would otherwise
            # yield None when there is no worker)
            self.datasources[datasource_id]["importable"] = bool(
                worker and getattr(worker, "is_from_zeeschuimer", False))

    def load_worker_class(self, worker):
        """
        Get class for worker

        This imports worker modules on-demand, so the code is only loaded if a
        worker that needs the code is actually queued and run

        :param worker:  Worker metadata, with "module" and "class_name" keys
        :return: Worker class for the given worker metadata
        """
        module = worker["module"]
        if module not in sys.modules:
            importlib.import_module(module)

        return getattr(sys.modules[module], worker["class_name"])
Collects all modular appendages of 4CAT
On init, an object of this class collects all datasources and workers that have been added to this 4CAT installation. The metadata of these is then stored for later access.
Datasources are found in the "datasources" folder in root. Workers are found in datasource folders or the default "processors" and "backend/workers" folder. All these folders are scanned for both processors and workers (processors being a specific kind of worker).
def __init__(self, config, write_cache=False):
    """
    Load data sources and workers

    Datasources are loaded first so that the datasource folders may be
    scanned for workers subsequently.

    :param config: Configuration manager, shared with the rest of the
    context
    :param bool write_cache: Write modules to cache file?
    """
    self.log_buffer = ""  # flushed later, once a logger is available
    self.config = config

    # datasources first, so their folders can then be scanned for workers
    self.load_datasources()
    self.load_modules()

    # with all workers known, datasource metadata can be completed, e.g.
    # whether a datasource has an associated search worker
    self.expand_datasources()

    if write_cache:
        # cache module-defined config options for use by the config manager
        collected = {}
        for worker in self.workers.values():
            if hasattr(worker, "config") and type(worker.config) is dict:
                collected.update(worker.config)

        cache_file = config.get("PATH_ROOT").joinpath("config/module_config.bin")
        with cache_file.open("wb") as outfile:
            pickle.dump(collected, outfile)

    # load from cache
    self.config.load_user_settings()
Load data sources and workers
Datasources are loaded first so that the datasource folders may be scanned for workers subsequently.
Parameters
- config: Configuration manager, shared with the rest of the context
- bool write_cache: Write modules to cache file?
74 @staticmethod 75 def is_4cat_class(object, only_processors=False): 76 """ 77 Determine if a module member is a worker class we can use 78 """ 79 if inspect.isclass(object): 80 if object.__name__ in("BasicProcessor", "BasicWorker") or inspect.isabstract(object): 81 # ignore abstract and base classes 82 return False 83 84 if hasattr(object, "is_4cat_class"): 85 if only_processors: 86 if hasattr(object, "is_4cat_processor"): 87 return object.is_4cat_processor() 88 else: 89 return False 90 else: 91 return object.is_4cat_class() 92 93 return False
Determine if a module member is a worker class we can use
def load_modules(self):
    """
    Load modules

    Modules are workers and (as a subset of workers) postprocessors. These
    are found by importing any python files found in the given locations,
    and looking for relevant classes within those python files, that extend
    `BasicProcessor` or `BasicWorker` and are not abstract.
    """
    # look for workers and processors in pre-defined folders and datasources
    extension_path = Path(self.config.get('PATH_ROOT'), "extensions")

    paths = [Path(self.config.get('PATH_ROOT'), "processors"),
             Path(self.config.get('PATH_ROOT'), "backend", "workers"),
             extension_path,
             *[self.datasources[datasource]["path"] for datasource in self.datasources]]

    root_match = re.compile(r"^%s" % re.escape(str(self.config.get('PATH_ROOT'))))
    root_path = Path(self.config.get('PATH_ROOT'))

    for folder in paths:
        # loop through folders, and files in those folders, recursively
        is_extension = extension_path in folder.parents or folder == extension_path
        for file in folder.rglob("*.py"):
            # determine module name for file: path relative to 4CAT root,
            # with periods
            module_name = ".".join(file.parts[len(root_path.parts):-1] + (file.stem,))

            # skip modules that already failed to import earlier
            if module_name in self.ignore:
                continue

            # try importing
            try:
                module = importlib.import_module(module_name)
            except (SyntaxError, ImportError) as e:
                # this is fine, just ignore this data source and give a heads up
                self.ignore.append(module_name)
                # e.name may be absent or None (e.g. for SyntaxError); fall
                # back to the module name so we never key on None
                key_name = getattr(e, "name", None) or module_name
                if key_name not in self.missing_modules:
                    self.missing_modules[key_name] = [module_name]
                else:
                    self.missing_modules[key_name].append(module_name)
                continue

            # see if module contains the right type of content by looping
            # through all of its members
            components = inspect.getmembers(module, predicate=self.is_4cat_class)
            for component in components:
                if component[1].type in self.workers:
                    # already indexed
                    continue

                # extract data that is useful for the scheduler and other
                # parts of 4CAT
                relative_path = root_match.sub("", str(file))

                self.workers[component[1].type] = component[1]
                self.workers[component[1].type].filepath = relative_path
                self.workers[component[1].type].is_extension = is_extension

                # we can't use issubclass() because for that we would need
                # to import BasicProcessor, which would lead to a circular
                # import
                if self.is_4cat_class(component[1], only_processors=True):
                    self.processors[component[1].type] = self.workers[component[1].type]

    # sort by category for more convenient display in interfaces
    sorted_processors = {id: self.processors[id] for id in sorted(self.processors)}
    categorised_processors = {id: sorted_processors[id] for id in
                              sorted(sorted_processors,
                                     key=lambda item: "0" if sorted_processors[item].category == "Presets" else
                                     sorted_processors[item].category)}

    # Give a heads-up if not all modules were installed properly
    if self.missing_modules:
        warning = "Warning: Not all modules could be found, which might cause data sources and modules to not " \
                  "function.\nMissing modules:\n"
        for missing_module, processor_list in self.missing_modules.items():
            warning += "\t%s (for %s)\n" % (missing_module, ", ".join(processor_list))

        self.log_buffer += warning

    self.processors = categorised_processors
Load modules
Modules are workers and (as a subset of workers) postprocessors. These are found by importing any Python files found in the given locations, and looking for relevant classes within those files that extend `BasicProcessor` or `BasicWorker` and are not abstract.
def load_datasources(self):
    """
    Load datasources

    This looks for folders within the datasource root folder that contain
    an `__init__.py` defining an `init_datasource` function and a
    `DATASOURCE` constant. The latter is taken as the ID for this
    datasource.
    """
    def _load_datasource(subdirectory, expiration):
        """
        Load a single datasource from the given folder
        """
        # determine module name (path relative to 4CAT root w/ periods)
        module_name = ".".join(subdirectory.relative_to(Path(self.config.get("PATH_ROOT"))).parts)
        try:
            datasource = importlib.import_module(module_name)
        except ImportError as e:
            self.log_buffer += "Could not import %s: %s\n" % (module_name, e)
            return

        if not hasattr(datasource, "init_datasource") or not hasattr(datasource, "DATASOURCE"):
            self.log_buffer += "Could not load datasource %s: missing init_datasource or DATASOURCE\n" % subdirectory
            return

        datasource_id = datasource.DATASOURCE

        self.datasources[datasource_id] = {
            "expire-datasets": expiration.get(datasource_id, None),
            "path": subdirectory,
            "name": datasource.NAME if hasattr(datasource, "NAME") else datasource_id,
            "id": subdirectory.parts[-1],
            "init": datasource.init_datasource,
            "config": {} if not hasattr(datasource, "config") else datasource.config
        }

    # read expiration settings once (previously fetched twice redundantly)
    expiration = self.config.get("datasources.expiration", {})

    # Load 4CAT core datasources
    for subdirectory in Path(self.config.get('PATH_ROOT'), "datasources").iterdir():
        if subdirectory.is_dir():
            _load_datasource(subdirectory, expiration)

    # Load extension datasources
    # os.walk is used to allow for the possibility of multiple extensions,
    # with nested "datasources" folders
    for root, dirs, files in os.walk(Path(self.config.get('PATH_ROOT'), "extensions"), followlinks=True):
        if "datasources" in dirs:
            for subdirectory in Path(root, "datasources").iterdir():
                if subdirectory.is_dir():
                    _load_datasource(subdirectory, expiration)

    # sort datasources by display name for consistent presentation
    sorted_datasources = {datasource_id: self.datasources[datasource_id] for datasource_id in
                          sorted(self.datasources, key=lambda id: self.datasources[id]["name"])}
    self.datasources = sorted_datasources
Load datasources
This looks for folders within the datasource root folder that contain an `__init__.py` defining an `init_datasource` function and a `DATASOURCE` constant. The latter is taken as the ID for this datasource.
def expand_datasources(self):
    """
    Expand datasource metadata

    Some datasource metadata can only be known after all workers have been
    loaded, e.g. whether there is a search worker for the datasource. This
    function takes care of populating those values.
    """
    for datasource_id, metadata in self.datasources.items():
        search_id = "%s-search" % datasource_id
        worker = self.workers.get(search_id)

        metadata["has_worker"] = bool(worker)
        # only query options when a search worker actually exists
        metadata["has_options"] = metadata["has_worker"] and \
            bool(self.workers[search_id].get_options(config=self.config))
        # NOTE: evaluates to None (not False) when there is no worker, as in
        # the original chained-`and` expression
        metadata["importable"] = worker and hasattr(worker, "is_from_zeeschuimer") and worker.is_from_zeeschuimer
Expand datasource metadata
Some datasource metadata can only be known after all workers have been loaded, e.g. whether there is a search worker for the datasource. This function takes care of populating those values.
def load_worker_class(self, worker):
    """
    Get class for worker

    Imports worker modules on-demand, so the code is only loaded when a
    worker that needs it is actually queued and run.

    :param worker:  Worker metadata, with "module" and "class_name" keys
    :return: Worker class for the given worker metadata
    """
    # reuse an already-imported module where possible
    module = sys.modules.get(worker["module"])
    if module is None:
        module = importlib.import_module(worker["module"])

    return getattr(module, worker["class_name"])
Get class for worker
This import worker modules on-demand, so the code is only loaded if a worker that needs the code is actually queued and run
Returns
Worker class for the given worker metadata