common.lib.annotation
Annotation class
"""
Annotation class
"""


import time
import json

from common.lib.database import Database
from common.lib.exceptions import AnnotationException


class Annotation:
    """
    Annotation class

    Annotations are always tied to a dataset, a dataset item (e.g. a csv row),
    an annotation label, and a type ('text', 'multichoice', etc.).

    The annotation record is kept in `self.data`; its fields are mirrored as
    instance attributes. In memory, `metadata` is always a dict and the value
    of a `checkbox` annotation is a list; both are serialised to strings only
    when written to the database.
    """

    # Attributes must be created here to ensure getattr and setattr work properly

    data = None
    db = None

    id = None  # Unique ID for this annotation
    item_id = None  # ID of the item that this annotation was made for, e.g. a post ID.
    field_id = None  # ID for the annotation field
    dataset = None  # Dataset key this annotation is generated from
    timestamp = None  # When this annotation was edited
    timestamp_created = None  # When this annotation was created
    label = None  # Label of annotation
    type = None  # Type of annotation (e.g. `text`)
    options = None  # Possible options
    value = None  # The actual annotation value
    author = None  # Who last edited the annotation
    author_original = None  # Who originally made the annotation
    by_processor = None  # Whether the annotation was made by a processor
    from_dataset = None  # Processor-made dataset key this annotation was generated as part of
    metadata = None  # Misc metadata

    def __init__(self, data=None, annotation_id=None, db=None):
        """
        Instantiate annotation object.

        If an ID is available (`annotation_id` or `data["id"]`), the annotation
        is loaded from the database by that ID. Otherwise it is looked up via
        the dataset/item/field combination in `data`, and created if no such
        record exists. New or changed annotations are written to the database.

        :param data:  Annotation data; should correspond to the annotations table record.
        :param annotation_id:  The ID of an annotation. If given, it retrieves the annotation
        from the database.
        :param db:  Database connection object

        :raises AnnotationException:  If neither valid data nor an ID is given,
        if no database object is given, if the ID does not exist, if a
        required field is missing, or if a field has the wrong type.
        :raises KeyError:  If a new annotation is created from `data` that has
        no `label` key.
        """

        required_fields = ["field_id", "item_id", "dataset"]

        # Must have an ID or data
        if (annotation_id is None and data is None) or (data is not None and not isinstance(data, dict)):
            raise AnnotationException("Annotation() requires either a valid `data` dictionary or ID.")

        if not db:
            raise AnnotationException("Annotation() needs a `db` database object")

        self.db = db

        new_or_updated = False

        # Get the annotation data if the ID is given; if an annotation has
        # an ID, it is guaranteed to be in the database.
        # IDs can both be explicitly given or present in the data dict.
        if annotation_id is not None or "id" in data:
            if data and "id" in data:
                annotation_id = data["id"]
            self.id = annotation_id  # IDs correspond to unique serial numbers in the database.
            current = self.get_by_id(annotation_id)
            if not current:
                raise AnnotationException(
                    "Annotation() requires a valid ID for an existing annotation, %s given" % annotation_id)

        # If an ID is not given, get or create an Annotation object from its data.
        # First check if required fields are present in `data`.
        else:
            for required_field in required_fields:
                if required_field not in data or not data[required_field]:
                    raise AnnotationException("Annotation() requires a %s field" % required_field)

            # Check if this annotation already exists, based on dataset key, item id, and field id.
            current = self.get_by_field(data["dataset"], data["item_id"], data["field_id"])

        # If we were able to retrieve an annotation from the db, it already exists
        if current:
            # Check if we have to overwrite old data with new data
            if data:
                for key, value in data.items():
                    if key == "metadata":
                        # Stored metadata is already decoded; decode the
                        # incoming value too so we compare like with like.
                        value = self._load_metadata(value)

                    # Save unknown fields in metadata
                    if key not in current:
                        current["metadata"][key] = value
                        new_or_updated = True
                    # If values differ, update the value
                    elif current[key] != value:
                        current[key] = value
                        new_or_updated = True

            self.data = current

        # If this is a new annotation, set all the properties.
        else:
            # Keep track of when the annotation was made
            created_timestamp = int(time.time())

            self.data = {
                "dataset": data["dataset"],
                "item_id": data["item_id"],
                "field_id": data["field_id"],
                "timestamp": created_timestamp,
                "timestamp_created": created_timestamp,
                "label": data["label"],
                "type": data.get("type", "text"),
                "options": data.get("options", ""),
                "value": data.get("value", ""),
                "author": data.get("author", ""),
                "author_original": data.get("author", ""),
                "by_processor": data.get("by_processor", False),
                "from_dataset": data.get("from_dataset", ""),
                "metadata": data.get("metadata", {}),
            }
            new_or_updated = True

        # Decode metadata *in the record itself*, so the attribute loop below
        # copies a dict instead of overwriting it with the raw JSON string.
        self.data["metadata"] = self._load_metadata(self.data.get("metadata"))

        for k, v in self.data.items():
            # Some type checking
            try:
                if k == "timestamp" or k == "timestamp_created":
                    v = int(v)
                elif k == "by_processor":
                    v = bool(v)
            except (ValueError, TypeError) as e:
                raise AnnotationException("Annotation fields are not of the right type (%s)" % e)
            setattr(self, k, v)

        # Write to db if anything changed
        if new_or_updated:
            self.timestamp = int(time.time())
            self.write_to_db()

    @staticmethod
    def _load_metadata(raw):
        """
        Normalise a metadata value to a dict.

        :param raw:  A dict, a JSON string, or anything else (e.g. `None`).
        :return dict:  `raw` itself if it is a dict, the decoded JSON object
        if it is a valid JSON string containing an object, else an empty dict.
        """
        if isinstance(raw, dict):
            return raw

        if isinstance(raw, str):
            try:
                decoded = json.loads(raw)
            except json.JSONDecodeError:
                return {}
            if isinstance(decoded, dict):
                return decoded

        return {}

    @staticmethod
    def _parse_record(record):
        """
        Convert a raw annotations table record to its in-memory form.

        Checkbox values are split into a list and metadata is decoded to a
        dict. The record is changed in place.

        :param record:  Record as returned by the database, or a falsy value.
        :return dict:  The converted record, or an empty dict if there was none.
        """
        if not record:
            return {}

        if record["type"] == "checkbox" and isinstance(record["value"], str):
            record["value"] = record["value"].split(",")
        record["metadata"] = Annotation._load_metadata(record["metadata"])

        return record

    def get_by_id(self, annotation_id: int):
        """
        Get annotation by ID

        :param int annotation_id:  ID of annotation.
        :return:  A dict with the annotation's data, or an empty dict if the ID doesn't exist.

        :raises AnnotationException:  If the ID cannot be interpreted as an integer.
        """

        try:
            annotation_id = int(annotation_id)
        except (ValueError, TypeError):
            raise AnnotationException("Id '%s' is not valid" % annotation_id)

        # Pass the ID as a query parameter rather than formatting it into the SQL
        data = self.db.fetchone("SELECT * FROM annotations WHERE id = %s", (annotation_id,))

        return self._parse_record(data)

    def get_by_field(self, dataset_key: str, item_id: str, field_id: str) -> dict:
        """
        Get the annotation information via its dataset key, item ID, and field_id.
        This is always a unique combination.

        :param dataset_key:  The key of the dataset this annotation was made for.
        :param item_id:  The ID of the item this annotation was made for.
        :param field_id:  The field ID of the annotation.

        :return data:  A dict with data of the retrieved annotation, or an empty dict if it doesn't exist.
        """

        data = self.db.fetchone("SELECT * FROM annotations WHERE dataset = %s AND item_id = %s AND field_id = %s",
                                (dataset_key, str(item_id), field_id))

        return self._parse_record(data)

    def write_to_db(self):
        """
        Write an annotation to the database.

        Updates the annotation's `timestamp` and upserts the record, using
        field ID, dataset, and item ID as the uniqueness constraint.

        :return:  The result of the database upsert.
        """
        self.data["timestamp"] = int(time.time())

        # Serialise a copy: `self.data` must keep its in-memory types (dict
        # metadata, list checkbox values), otherwise a second write would
        # JSON-encode the metadata twice.
        db_data = dict(self.data)
        db_data["metadata"] = json.dumps(self._load_metadata(db_data.get("metadata")))
        if db_data["type"] == "checkbox" and isinstance(db_data["value"], (list, tuple)):
            db_data["value"] = ",".join(db_data["value"])

        return self.db.upsert("annotations", data=db_data, constraints=["field_id", "dataset", "item_id"])

    def delete(self):
        """
        Deletes this annotation

        :return:  The result of the database delete.
        """
        return self.db.delete("annotations", {"id": self.id})

    @staticmethod
    def get_annotations_for_dataset(db: Database, dataset_key: str, item_id=None, before=0) -> list:
        """
        Returns all annotations for a dataset.

        :param db:  Database object.
        :param str dataset_key:  A dataset key.
        :param item_id:  An optional item ID, or a list/tuple/set of item IDs,
        to only get annotations from specific items
        :param int before:  The upper timestamp for annotations to retrieve;
        defaults to the current time.

        :return list:  List with `Annotation` objects.
        """
        if not dataset_key:
            return []

        if not before:
            before = int(time.time())

        if item_id:
            # Normalise to strings so the ANY operator receives a proper text[]
            if isinstance(item_id, (list, tuple, set)):
                iterable_ids = item_id
            else:
                iterable_ids = [item_id]

            item_ids = [str(i) for i in iterable_ids if i is not None]
            if not item_ids:
                return []

            data = db.fetchall(
                "SELECT * FROM annotations WHERE dataset = %s AND item_id = ANY(%s::text[]) AND timestamp <= %s",
                (dataset_key, item_ids, before,)
            )
        else:
            data = db.fetchall("SELECT * FROM annotations WHERE dataset = %s AND timestamp <= %s",
                               (dataset_key, before,))

        if not data:
            return []

        return [Annotation(data=Annotation._parse_record(d), db=db) for d in data]

    @staticmethod
    def delete_many(db: Database, dataset_key=None, annotation_id=None, field_id=None):
        """
        Deletes annotations for an entire dataset or by a list of (field) IDs.

        The given criteria are combined into a single `where` for the delete.

        :param db:  Database object.
        :param str dataset_key:  A dataset key.
        :param annotation_id:  A list or string of unique annotation IDs.
        :param field_id:  A list or string of IDs for annotation fields.

        :return int:  The number of removed records; 0 if no criteria were given.
        """
        if not dataset_key and not annotation_id and not field_id:
            return 0

        where = {}
        if dataset_key:
            where["dataset"] = dataset_key
        if annotation_id:
            where["id"] = annotation_id
        if field_id:
            where["field_id"] = field_id

        return db.delete("annotations", where)

    @staticmethod
    def update_annotations_via_fields(dataset_key: str, old_fields: dict, new_fields: dict, db: Database) -> int:
        """
        Updates annotations in the annotations table if the input fields
        themselves have been changed, for instance if a dropdown label is renamed
        or a field is deleted.

        :param str dataset_key:  The dataset key for which fields changed.
        :param dict old_fields:  Old annotation fields.
        :param dict new_fields:  New annotation fields; this should contain not just
        the additions, but all fields, changed or otherwise.
        :param db:  Database object so we can write.

        :returns int:  How many records were affected by column updates.
        """

        text_fields = ["textarea", "text"]

        # If old and new fields are identical, do nothing.
        if old_fields == new_fields:
            return 0

        fields_to_delete = set()  # Delete all annotations with this field ID
        fields_to_update = {}  # Update values of annotations with this field ID
        old_options = {}  # Previous options (option id -> label) per field ID

        # Loop through the old annotation fields
        for field_id, old_field in old_fields.items():

            # Delete all annotations of this type if the field is deleted.
            if field_id not in new_fields:
                fields_to_delete.add(field_id)
                continue

            new_field = new_fields[field_id]

            # If the annotation type has changed, delete existing annotations
            # when neither the old nor the new type is a text type.
            # NOTE(review): a change between a text and a non-text type is not
            # deleted by this condition - confirm that this is intended.
            if old_field["type"] != new_field["type"]:
                if old_field["type"] not in text_fields and new_field["type"] not in text_fields:
                    fields_to_delete.add(field_id)
                    continue

            # Loop through all the key/values in the new field settings
            # and update in case it's different from the old values.
            update_data = {}
            for field_key, field_value in new_field.items():

                # Nothing to do if values match
                if field_value == old_field.get(field_key):
                    continue

                # For all non-option changes, just overwrite with new data.
                if field_key != "options":
                    update_data[field_key] = field_value
                    continue

                # Special case: option values that are removed/renamed.
                # Here we may have to change/delete values within the
                # values column.
                new_options = field_value

                # Edge case: delete annotations of this type if all option fields are deleted
                if not new_options:
                    fields_to_delete.add(field_id)
                    continue

                # Changed options values (e.g. renamed or one field deleted).
                # Compare against this field's own previous options.
                old_options[field_id] = old_field.get("options", {})
                if old_options[field_id] and old_options[field_id] != new_options:
                    update_data[field_key] = {"options": new_options}

            if update_data:
                fields_to_update[field_id] = update_data

        # Delete annotations
        # NOTE(review): this delete is filtered by field ID only, not by
        # `dataset_key` - verify field IDs are unique across datasets.
        if fields_to_delete:
            Annotation.delete_many(db, field_id=list(fields_to_delete))

        # Write changes to fields to database
        count = 0
        for field_id, updates in fields_to_update.items():

            for column, update_value in updates.items():

                update_value_insert = update_value
                if column == "options":
                    update_value_insert = ",".join(update_value["options"].values())

                # Change values of columns
                count += db.update("annotations", {column: update_value_insert},
                                   where={"dataset": dataset_key, "field_id": field_id})

                if column != "options":
                    continue

                # Special case: Changed option labels.
                # Here we have to also rename/remove inserted options from the `value` column.
                new_options = update_value["options"]  # Dict with option id->label as items

                # Reverse lookup of old labels; the first option ID wins if a
                # label occurs more than once.
                label_to_id = {}
                for option_id, option_label in old_options[field_id].items():
                    label_to_id.setdefault(option_label, option_id)

                annotations = db.fetchall("SELECT id, options, value FROM annotations "
                                          "WHERE dataset = %s AND field_id = %s AND value != '';",
                                          (dataset_key, field_id)) or []

                for annotation in annotations:
                    new_values = []

                    for ann_value in annotation["value"].split(","):
                        # Values that were never an option cannot be mapped to
                        # an option ID; keep them rather than aborting halfway
                        # through the update.
                        if ann_value not in label_to_id:
                            new_values.append(ann_value)
                            continue

                        # Deleted options are dropped; all others get their
                        # new, possibly renamed, label.
                        option_id = label_to_id[ann_value]
                        if option_id in new_options:
                            new_values.append(new_options[option_id])

                    db.update("annotations", {"value": ",".join(new_values)}, where={"id": annotation["id"]})

        return count

    def __getattr__(self, attr):
        """
        Getter so we don't have to use .data all the time

        Python only calls this after regular attribute lookup has failed, so
        explicitly defined attributes always take precedence over this
        passthrough.

        :param attr:  Data key to get
        :return:  Value

        :raises AttributeError:  If the key is not in the annotation's data.
        """

        data = self.data
        # `data` is still None on an instance that has not been initialised
        if data is not None and attr in data:
            return data[attr]

        raise AttributeError("Annotation instance has no attribute %s" % attr)

    def __setattr__(self, attr, value):
        """
        Setter so we can flexibly update the database

        Attributes that are defined on the class are set as regular
        attributes. Any other attribute is written to the database for this
        annotation's ID and to `.data`; if it is not a key of `.data`, it is
        stored within the 'metadata' attribute.

        :param str attr:  Attribute to update
        :param value:  New value
        """

        # don't override behaviour for *actual* class attributes
        if attr in dir(self):
            super().__setattr__(attr, value)
            return

        # Without a loaded record there is nothing to update in the database
        if self.data is None:
            super().__setattr__(attr, value)
            return

        if attr not in self.data:
            metadata = self.metadata if isinstance(self.metadata, dict) else {}
            metadata[attr] = value
            attr = "metadata"
            value = metadata

        db_value = value
        if attr == "metadata":
            # Only the database gets the JSON string; in memory, metadata
            # stays a dict (round-tripped so it matches what was stored).
            db_value = json.dumps(value)
            value = json.loads(db_value)

        self.db.update("annotations", where={"id": self.id}, data={attr: db_value})

        self.data[attr] = value
        if attr == "metadata":
            self.metadata = value
14class Annotation: 15 """ 16 Annotation class 17 18 Annotations are always tied to a dataset, a dataset item (e.g. a csv row), 19 an annotation label, and a type ('text', 'multichoice', etc.). 20 21 """ 22 23 # Attributes must be created here to ensure getattr and setattr work properly 24 25 data = None 26 db = None 27 28 id = None # Unique ID for this annotation 29 item_id = None # ID of the item that this annotation was made for, e.g. a post ID. 30 field_id = None # ID for the annotation field 31 dataset = None # Dataset key this annotation is generated from 32 timestamp = None # When this annotation was edited 33 timestamp_created = None # When this annotation was created 34 label = None # Label of annotation 35 type = None # Type of annotation (e.g. `text`) 36 options = None # Possible options 37 value = None # The actual annotation value 38 author = None # Who last edited the annotation 39 author_original = None # Who originally made the annotation 40 by_processor = None # Whether the annotation was made by a processor 41 from_dataset = None # Processor-made dataset key this annotation was generated as part of 42 metadata = None # Misc metadata 43 44 def __init__(self, data=None, annotation_id=None, db=None): 45 """ 46 Instantiate annotation object. 47 48 :param data: Annotation data; should correspond to the annotations table record. 49 :param annotation_id: The ID of an annotation. If given, it retrieves the annotation 50 from the database. 
51 :param db: Database connection object 52 """ 53 54 required_fields = ["field_id", "item_id", "dataset"] 55 56 # Must have an ID or data 57 if (annotation_id is None and data is None) or (data is not None and not isinstance(data, dict)): 58 raise AnnotationException("Annotation() requires either a valid `data` dictionary or ID.") 59 60 if not db: 61 raise AnnotationException("Annotation() needs a `db` database object") 62 63 self.db = db 64 65 new_or_updated = False 66 67 # Get the annotation data if the ID is given; if an annotation has 68 # an ID, it is guaranteed to be in the database. 69 # IDs can both be explicitly given or present in the data dict. 70 if annotation_id is not None or "id" in data: 71 if data and "id" in data: 72 annotation_id = data["id"] 73 self.id = annotation_id # IDs correspond to unique serial numbers in the database. 74 current = self.get_by_id(annotation_id) 75 if not current: 76 raise AnnotationException( 77 "Annotation() requires a valid ID for an existing annotation, %s given" % annotation_id) 78 79 # If an ID is not given, get or create an Annotation object from its data. 80 # First check if required fields are present in `data`. 81 else: 82 for required_field in required_fields: 83 if required_field not in data or not data[required_field]: 84 raise AnnotationException("Annotation() requires a %s field" % required_field) 85 86 # Check if this annotation already exists, based on dataset key, item id, and label. 
87 current = self.get_by_field(data["dataset"], data["item_id"], data["field_id"]) 88 89 # If we were able to retrieve an annotation from the db, it already exists 90 if current: 91 # Check if we have to overwrite old data with new data 92 if data: 93 for key, value in data.items(): 94 # Save unknown fields in metadata 95 if key not in current: 96 current["metadata"][key] = value 97 new_or_updated = True 98 # If values differ, update the value 99 elif current[key] != value: 100 current[key] = value 101 new_or_updated = True 102 103 self.data = current 104 105 # If this is a new annotation, set all the properties. 106 else: 107 # Keep track of when the annotation was made 108 created_timestamp = int(time.time()) 109 110 new_data = { 111 "dataset": data["dataset"], 112 "item_id": data["item_id"], 113 "field_id": data["field_id"], 114 "timestamp": created_timestamp, 115 "timestamp_created": created_timestamp, 116 "label": data["label"], 117 "type": data.get("type", "text"), 118 "options": data.get("options", ""), 119 "value": data.get("value", ""), 120 "author": data.get("author", ""), 121 "author_original": data.get("author", ""), 122 "by_processor": data.get("by_processor", False), 123 "from_dataset": data.get("from_dataset", ""), 124 "metadata": data.get("metadata", {}), 125 } 126 127 self.data = new_data 128 new_or_updated = True 129 130 if isinstance(self.data["metadata"], str): 131 try: 132 self.metadata = json.loads(self.data["metadata"]) 133 except (TypeError, json.JSONDecodeError): 134 self.metadata = {} 135 136 for k, v in self.data.items(): 137 # Some type checking 138 try: 139 if k == "timestamp" or k == "timestamp_created": 140 v = int(v) 141 elif k == "by_processor": 142 v = bool(v) 143 except ValueError as e: 144 raise AnnotationException("Annotation fields are not of the right type (%s)" % e) 145 self.__setattr__(k, v) 146 147 # Write to db if anything changed 148 if new_or_updated: 149 self.timestamp = int(time.time()) 150 self.write_to_db() 151 152 
def get_by_id(self, annotation_id: int): 153 """ 154 Get annotation by ID 155 156 :param str annotation_id: ID of annotation. 157 :return: Annotation object, or an empty dict if the ID doesn't exist. 158 """ 159 160 try: 161 int(annotation_id) 162 except ValueError: 163 raise AnnotationException("Id '%s' is not valid" % annotation_id) 164 165 data = self.db.fetchone("SELECT * FROM annotations WHERE id = %s" % annotation_id) 166 167 if not data: 168 return {} 169 170 if data["type"] == "checkbox": 171 data["value"] = data["value"].split(",") 172 data["metadata"] = json.loads(data["metadata"]) 173 174 return data 175 176 def get_by_field(self, dataset_key: str, item_id: str, field_id: str) -> dict: 177 """ 178 Get the annotation information via its dataset key, item ID, and field_id. 179 This is always a unique combination. 180 181 :param dataset_key: The key of the dataset this annotation was made for. 182 :param item_id: The ID of the item this annotation was made for. 183 :param field_id: The field ID of the annotation. 184 185 :return data: A dict with data of the retrieved annotation, or an empty dict if it doesn't exist. 186 """ 187 188 data = self.db.fetchone("SELECT * FROM annotations WHERE dataset = %s AND item_id = %s AND field_id = %s", 189 (dataset_key, str(item_id), field_id)) 190 if not data: 191 return {} 192 193 if data["type"] == "checkbox": 194 data["value"] = data["value"].split(",") 195 data["metadata"] = json.loads(data["metadata"]) 196 197 return data 198 199 def write_to_db(self): 200 """ 201 Write an annotation to the database. 
202 """ 203 db_data = self.data 204 205 db_data["timestamp"] = int(time.time()) 206 m = db_data["metadata"] # To avoid circular reference error 207 db_data["metadata"] = json.dumps(m) 208 if db_data["type"] == "checkbox": 209 db_data["value"] = ",".join(db_data["value"]) 210 211 return self.db.upsert("annotations", data=db_data, constraints=["field_id", "dataset", "item_id"]) 212 213 def delete(self): 214 """ 215 Deletes this annotation 216 """ 217 return self.db.delete("annotations", {"id": self.id}) 218 219 220 @staticmethod 221 def get_annotations_for_dataset(db: Database, dataset_key: str, item_id=None, before=0) -> list: 222 """ 223 Returns all annotations for a dataset. 224 :param db: Database object. 225 :param str dataset_key: A dataset key. 226 :param str item_id: An optional item ID or multiple item IDs to only get annotations from specific 227 items 228 :param int before: The upper timestamp for annotations to retrieve. 229 230 :return list: List with annotations. 231 """ 232 if not dataset_key: 233 return [] 234 235 if not before: 236 before = int(time.time()) 237 238 if item_id: 239 # Normalise to strings so the ANY operator receives a proper text[] 240 if isinstance(item_id, (list, tuple, set)): 241 iterable_ids = item_id 242 else: 243 iterable_ids = [item_id] 244 245 item_ids = [str(i) for i in iterable_ids if i is not None] 246 if not item_ids: 247 return [] 248 249 data = db.fetchall( 250 "SELECT * FROM annotations WHERE dataset = %s AND item_id = ANY(%s::text[]) AND timestamp <= %s", 251 (dataset_key, item_ids, before,) 252 ) 253 else: 254 data = db.fetchall("SELECT * FROM annotations WHERE dataset = %s AND timestamp <= %s", (dataset_key, before,)) 255 if not data: 256 return [] 257 258 for i in range(len(data)): 259 if data[i]["type"] == "checkbox": 260 data[i]["value"] = data[i]["value"].split(",") 261 data[i]["metadata"] = json.loads(data[i]["metadata"]) 262 263 return [Annotation(data=d, db=db) for d in data] 264 265 @staticmethod 266 def 
delete_many(db: Database, dataset_key=None, annotation_id=None, field_id=None): 267 """ 268 Deletes annotations for an entire dataset or by a list of (field) IDs. 269 270 :param db: Database object. 271 :param str dataset_key: A dataset key. 272 :param li annotation_id: A list or string of unique annotation IDs. 273 :param li field_id: A list or string of IDs for annotation fields. 274 275 :return int: The number of removed records. 276 """ 277 if not dataset_key and not annotation_id and not field_id: 278 return 0 279 280 where = {} 281 if dataset_key: 282 where["dataset"] = dataset_key 283 if annotation_id: 284 where["id"] = annotation_id 285 if field_id: 286 where["field_id"] = field_id 287 288 return db.delete("annotations", where) 289 290 @staticmethod 291 def update_annotations_via_fields(dataset_key: str, old_fields: dict, new_fields: dict, db: Database) -> int: 292 """ 293 Updates annotations in the annotations table if the input fields 294 themselves have been changed, for instance if a dropdown label is renamed 295 or a field is deleted. 296 297 :param str dataset_key: The dataset key for which fields changed. 298 :param dict old_fields: Old annotation fields. 299 :param dict new_fields: New annotation fields; this should contain not just 300 the additions, but all fields, changed or otherwise. 301 :param db: Database object so we can write. 302 303 :returns int: How many records were affected. 304 """ 305 306 text_fields = ["textarea", "text"] 307 308 # If old and new fields are identical, do nothing. 309 if old_fields == new_fields: 310 return 0 311 312 fields_to_delete = set() # Delete all annotations with this field ID 313 fields_to_update = {} # Update values of annotations with this field ID 314 old_options = {} 315 316 # Loop through the old annotation fields 317 for old_field_id, old_field in old_fields.items(): 318 319 # Delete all annotations of this type if the field is deleted. 
320 if old_field_id not in new_fields: 321 fields_to_delete.add(old_field_id) 322 continue 323 324 field_id = old_field_id 325 new_field = new_fields[field_id] 326 327 # If the annotation type has changed, also delete existing annotations, 328 # except between text and textarea, where we can just change the type and keep the text. 329 if old_field["type"] != new_field["type"]: 330 if old_field["type"] not in text_fields and new_field["type"] not in text_fields: 331 fields_to_delete.add(field_id) 332 continue 333 334 # Loop through all the key/values in the new field settings 335 # and update in case it's different from the old values. 336 update_data = {} 337 for field_key, field_value in new_field.items(): 338 339 # Update if values don't match 340 if field_value != old_field.get(field_key): 341 342 # Special case: option values that are removed/renamed. 343 # Here we may have to change/delete values within the 344 # values column. 345 if field_key == "options": 346 347 new_options = field_value 348 349 # Edge case: delete annotations of this type if all option fields are deleted 350 if not new_options: 351 fields_to_delete.add(field_id) 352 continue 353 354 # Changed options values (e.g. renamed or one field deleted) 355 old_options[old_field_id] = old_field.get("options", {}) 356 options_to_update = {} 357 if old_options[old_field_id] and old_options != new_options: 358 options_to_update = new_options 359 360 if options_to_update: 361 update_data[field_key] = {"options": options_to_update} 362 363 # For all other changes, just overwrite with new data. 
364 else: 365 update_data[field_key] = field_value 366 367 if update_data: 368 fields_to_update[field_id] = update_data 369 370 # Delete annotations 371 if fields_to_delete: 372 Annotation.delete_many(db, field_id=list(fields_to_delete)) 373 374 # Write changes to fields to database 375 count = 0 376 if fields_to_update: 377 for field_id, updates in fields_to_update.items(): 378 379 # Write to db 380 for column, update_value in updates.items(): 381 382 update_value_insert = update_value 383 if column == "options": 384 update_value_insert = ",".join(list(update_value["options"].values())) 385 386 # Change values of columns 387 updates = db.update("annotations", {column: update_value_insert}, 388 where={"dataset": dataset_key, "field_id": field_id}) 389 count += updates 390 391 # Special case: Changed option labels. 392 # Here we have to also rename/remove inserted options from the `value` column. 393 if column == "options": 394 395 annotations = db.fetchall("SELECT id, options, value FROM annotations " 396 "WHERE dataset = '%s' and field_id = '%s' AND value != '';" 397 % (dataset_key, field_id)) 398 399 for annotation in annotations: 400 annotation_id = annotation["id"] 401 annotation_values = annotation["value"].split(",") 402 403 # Remove or rename options 404 new_values = [] 405 new_options = update_value["options"] # Dict with option id->label as items 406 407 for ann_value in annotation_values: 408 # Get the option ID, so we can see if it's new, deleted, or renamed. 409 # Should always be present in old options dict 410 option_id = [k for k, v in old_options[field_id].items() if v == ann_value][0] 411 # Deleted... 
412 if option_id not in new_options: 413 continue 414 # Or replaced with a new, possibly renamed value 415 else: 416 new_values.append(new_options[option_id]) 417 418 new_values = ",".join(new_values) 419 db.update("annotations", {"value": new_values}, where={"id": annotation_id}) 420 421 return count 422 423 def __getattr__(self, attr): 424 """ 425 Getter so we don't have to use .data all the time 426 427 :param attr: Data key to get 428 :return: Value 429 """ 430 431 if attr in dir(self): 432 # an explicitly defined attribute should always be called in favour 433 # of this passthrough 434 attribute = getattr(self, attr) 435 return attribute 436 elif attr in self.data: 437 return self.data[attr] 438 else: 439 raise AttributeError("Annotation instance has no attribute %s" % attr) 440 441 def __setattr__(self, attr, value): 442 """ 443 Setter so we can flexibly update the database 444 445 Also updates internal data stores (.data etc.). If the attribute is 446 unknown, it is stored within the 'metadata' attribute. 447 448 :param str attr: Attribute to update 449 :param value: New value 450 """ 451 452 # don't override behaviour for *actual* class attributes 453 if attr in dir(self): 454 super().__setattr__(attr, value) 455 return 456 457 if attr not in self.data: 458 self.metadata[attr] = value 459 attr = "metadata" 460 value = self.metadata 461 462 if attr == "metadata": 463 value = json.dumps(value) 464 465 self.db.update("annotations", where={"id": self.id}, data={attr: value}) 466 467 self.data[attr] = value 468 if attr == "metadata": 469 self.metadata = json.loads(value)
Annotation class
Annotations are always tied to a dataset, a dataset item (e.g. a csv row), an annotation label, and a type ('text', 'multichoice', etc.).
def __init__(self, data=None, annotation_id=None, db=None):
    """
    Instantiate annotation object.

    An existing annotation is looked up by ID (given explicitly or as
    `data["id"]`) or, failing that, by the combination of dataset, item ID
    and field ID. If a record is found, differing values in `data` are
    merged into it; keys the record does not have are stored in its
    metadata. If no record is found, a new annotation is built from `data`.
    Anything new or changed is written to the database.

    :param data:  Annotation data; should correspond to the annotations table record.
    :param annotation_id:  The ID of an annotation. If given, it retrieves the annotation
    from the database.
    :param db:  Database connection object
    :raises AnnotationException:  If neither a data dict nor an ID is given,
    if `db` is missing, if the ID does not exist, if a required field is
    missing, or if a timestamp cannot be converted to an integer.
    """

    required_fields = ["field_id", "item_id", "dataset"]

    # Must have an ID or data
    if (annotation_id is None and data is None) or (data is not None and not isinstance(data, dict)):
        raise AnnotationException("Annotation() requires either a valid `data` dictionary or ID.")

    if not db:
        raise AnnotationException("Annotation() needs a `db` database object")

    self.db = db

    new_or_updated = False

    # Get the annotation data if the ID is given; if an annotation has
    # an ID, it is guaranteed to be in the database.
    # IDs can both be explicitly given or present in the data dict.
    if annotation_id is not None or "id" in data:
        if data and "id" in data:
            annotation_id = data["id"]
        self.id = annotation_id  # IDs correspond to unique serial numbers in the database.
        current = self.get_by_id(annotation_id)
        if not current:
            raise AnnotationException(
                "Annotation() requires a valid ID for an existing annotation, %s given" % annotation_id)

    # If an ID is not given, get or create an Annotation object from its data.
    # First check if required fields are present in `data`.
    else:
        for required_field in required_fields:
            if required_field not in data or not data[required_field]:
                raise AnnotationException("Annotation() requires a %s field" % required_field)

        # Check if this annotation already exists, based on dataset key, item id, and field id.
        current = self.get_by_field(data["dataset"], data["item_id"], data["field_id"])

    # If we were able to retrieve an annotation from the db, it already exists
    if current:
        # Check if we have to overwrite old data with new data
        if data:
            for key, value in data.items():
                # Save unknown fields in metadata
                if key not in current:
                    current["metadata"][key] = value
                    new_or_updated = True
                # If values differ, update the value
                elif current[key] != value:
                    current[key] = value
                    new_or_updated = True

        self.data = current

    # If this is a new annotation, set all the properties.
    else:
        # A label is only needed when creating; report its absence the same
        # way as the other required fields instead of with a bare KeyError.
        if "label" not in data:
            raise AnnotationException("Annotation() requires a label field")

        # Keep track of when the annotation was made
        created_timestamp = int(time.time())

        self.data = {
            "dataset": data["dataset"],
            "item_id": data["item_id"],
            "field_id": data["field_id"],
            "timestamp": created_timestamp,
            "timestamp_created": created_timestamp,
            "label": data["label"],
            "type": data.get("type", "text"),
            "options": data.get("options", ""),
            "value": data.get("value", ""),
            "author": data.get("author", ""),
            "author_original": data.get("author", ""),
            "by_processor": data.get("by_processor", False),
            "from_dataset": data.get("from_dataset", ""),
            "metadata": data.get("metadata", {}),
        }
        new_or_updated = True

    # Normalise metadata to a dict *inside* self.data. Previously a JSON
    # string was parsed into self.metadata only, after which the loop below
    # put the raw string back and write_to_db() JSON-encoded it a second time.
    metadata = self.data.get("metadata")
    if isinstance(metadata, str):
        try:
            metadata = json.loads(metadata)
        except (TypeError, json.JSONDecodeError):
            metadata = {}
    if not isinstance(metadata, dict):
        metadata = {}
    self.data["metadata"] = metadata
    self.metadata = metadata

    for k, v in self.data.items():
        # Some type checking
        try:
            if k == "timestamp" or k == "timestamp_created":
                v = int(v)
            elif k == "by_processor":
                v = bool(v)
        except (TypeError, ValueError) as e:
            # int(None) raises TypeError rather than ValueError
            raise AnnotationException("Annotation fields are not of the right type (%s)" % e)
        setattr(self, k, v)

    # Write to db if anything changed
    if new_or_updated:
        self.timestamp = int(time.time())
        self.write_to_db()
Instantiate annotation object.
Parameters
- data: Annotation data; should correspond to the annotations table record.
- annotation_id: The ID of an annotation. If given, it retrieves the annotation from the database.
- db: Database connection object
def get_by_id(self, annotation_id: int):
    """
    Get annotation by ID

    The ID is converted to an integer and handed to the database as a query
    parameter instead of being interpolated into the SQL string.

    :param annotation_id:  ID of annotation; an integer or anything `int()` accepts.
    :return dict:  The annotation record, with `metadata` JSON-decoded and,
    for `checkbox` annotations, `value` split on commas into a list. An
    empty dict if the ID doesn't exist.
    :raises AnnotationException:  If the ID cannot be converted to an integer.
    """

    try:
        annotation_id = int(annotation_id)
    except (TypeError, ValueError):
        # TypeError covers e.g. None, which int() rejects without a ValueError
        raise AnnotationException("Id '%s' is not valid" % annotation_id)

    data = self.db.fetchone("SELECT * FROM annotations WHERE id = %s", (annotation_id,))

    if not data:
        return {}

    # Checkbox values are stored as one comma-separated string
    if data["type"] == "checkbox":
        data["value"] = data["value"].split(",")
    data["metadata"] = json.loads(data["metadata"])

    return data
Get annotation by ID
Parameters
- annotation_id: ID of the annotation to retrieve; must be an integer or convertible to one with `int()`.
Returns
A dict with the data of the retrieved annotation, or an empty dict if the ID doesn't exist.
def get_by_field(self, dataset_key: str, item_id: str, field_id: str) -> dict:
    """
    Look up one annotation by its dataset key, item ID and field ID.
    Together these three values identify a single annotation.

    :param dataset_key:  The key of the dataset this annotation was made for.
    :param item_id:  The ID of the item this annotation was made for; it is
    converted to a string before querying.
    :param field_id:  The field ID of the annotation.

    :return data:  A dict with data of the retrieved annotation (`metadata`
    JSON-decoded, `checkbox` values split into a list), or an empty dict if
    it doesn't exist.
    """

    query = "SELECT * FROM annotations WHERE dataset = %s AND item_id = %s AND field_id = %s"
    params = (dataset_key, str(item_id), field_id)
    record = self.db.fetchone(query, params)

    if record:
        # Checkbox values are stored as one comma-separated string
        if record["type"] == "checkbox":
            record["value"] = record["value"].split(",")
        record["metadata"] = json.loads(record["metadata"])
        return record

    return {}
Get the annotation information via its dataset key, item ID, and field_id. This is always a unique combination.
Parameters
- dataset_key: The key of the dataset this annotation was made for.
- item_id: The ID of the item this annotation was made for.
- field_id: The field ID of the annotation.
Returns
A dict with data of the retrieved annotation, or an empty dict if it doesn't exist.
def write_to_db(self):
    """
    Write an annotation to the database.

    Refreshes the `timestamp` in `self.data`, then upserts a serialised
    *copy* of the data on the (field_id, dataset, item_id) constraint.
    Working on a copy keeps `self.data` in its in-memory form (metadata as
    a dict, checkbox values as a list), so calling this more than once no
    longer double-encodes the metadata or mangles checkbox values.

    :return:  Whatever the database handler's `upsert()` returns.
    """
    self.data["timestamp"] = int(time.time())

    # Shallow copy: only top-level values are replaced below
    db_data = dict(self.data)

    metadata = db_data["metadata"]
    if isinstance(metadata, str):
        # Metadata may already be a JSON string; only re-encode it when it
        # is not valid JSON.
        try:
            json.loads(metadata)
        except json.JSONDecodeError:
            metadata = json.dumps(metadata)
    else:
        metadata = json.dumps(metadata)
    db_data["metadata"] = metadata

    # Checkbox values are stored as one comma-separated string; a value that
    # is already a string must not be joined character by character.
    if db_data["type"] == "checkbox" and not isinstance(db_data["value"], str):
        db_data["value"] = ",".join(db_data["value"])

    return self.db.upsert("annotations", data=db_data, constraints=["field_id", "dataset", "item_id"])
Write an annotation to the database.
def delete(self):
    """
    Remove this annotation's record from the `annotations` table, matching
    on its ID.

    :return:  Whatever the database handler's `delete()` returns.
    """
    where = {"id": self.id}
    return self.db.delete("annotations", where)
Deletes this annotation
220 @staticmethod 221 def get_annotations_for_dataset(db: Database, dataset_key: str, item_id=None, before=0) -> list: 222 """ 223 Returns all annotations for a dataset. 224 :param db: Database object. 225 :param str dataset_key: A dataset key. 226 :param str item_id: An optional item ID or multiple item IDs to only get annotations from specific 227 items 228 :param int before: The upper timestamp for annotations to retrieve. 229 230 :return list: List with annotations. 231 """ 232 if not dataset_key: 233 return [] 234 235 if not before: 236 before = int(time.time()) 237 238 if item_id: 239 # Normalise to strings so the ANY operator receives a proper text[] 240 if isinstance(item_id, (list, tuple, set)): 241 iterable_ids = item_id 242 else: 243 iterable_ids = [item_id] 244 245 item_ids = [str(i) for i in iterable_ids if i is not None] 246 if not item_ids: 247 return [] 248 249 data = db.fetchall( 250 "SELECT * FROM annotations WHERE dataset = %s AND item_id = ANY(%s::text[]) AND timestamp <= %s", 251 (dataset_key, item_ids, before,) 252 ) 253 else: 254 data = db.fetchall("SELECT * FROM annotations WHERE dataset = %s AND timestamp <= %s", (dataset_key, before,)) 255 if not data: 256 return [] 257 258 for i in range(len(data)): 259 if data[i]["type"] == "checkbox": 260 data[i]["value"] = data[i]["value"].split(",") 261 data[i]["metadata"] = json.loads(data[i]["metadata"]) 262 263 return [Annotation(data=d, db=db) for d in data]
Returns all annotations for a dataset.
Parameters
- db: Database object.
- str dataset_key: A dataset key.
- str item_id: An optional item ID or multiple item IDs to only get annotations from specific items
- int before: The upper timestamp for annotations to retrieve.
Returns
List with annotations.
265 @staticmethod 266 def delete_many(db: Database, dataset_key=None, annotation_id=None, field_id=None): 267 """ 268 Deletes annotations for an entire dataset or by a list of (field) IDs. 269 270 :param db: Database object. 271 :param str dataset_key: A dataset key. 272 :param li annotation_id: A list or string of unique annotation IDs. 273 :param li field_id: A list or string of IDs for annotation fields. 274 275 :return int: The number of removed records. 276 """ 277 if not dataset_key and not annotation_id and not field_id: 278 return 0 279 280 where = {} 281 if dataset_key: 282 where["dataset"] = dataset_key 283 if annotation_id: 284 where["id"] = annotation_id 285 if field_id: 286 where["field_id"] = field_id 287 288 return db.delete("annotations", where)
Deletes annotations for an entire dataset or by a list of (field) IDs.
Parameters
- db: Database object.
- str dataset_key: A dataset key.
- list annotation_id: A list or string of unique annotation IDs.
- list field_id: A list or string of IDs for annotation fields.
Returns
The number of removed records.
290 @staticmethod 291 def update_annotations_via_fields(dataset_key: str, old_fields: dict, new_fields: dict, db: Database) -> int: 292 """ 293 Updates annotations in the annotations table if the input fields 294 themselves have been changed, for instance if a dropdown label is renamed 295 or a field is deleted. 296 297 :param str dataset_key: The dataset key for which fields changed. 298 :param dict old_fields: Old annotation fields. 299 :param dict new_fields: New annotation fields; this should contain not just 300 the additions, but all fields, changed or otherwise. 301 :param db: Database object so we can write. 302 303 :returns int: How many records were affected. 304 """ 305 306 text_fields = ["textarea", "text"] 307 308 # If old and new fields are identical, do nothing. 309 if old_fields == new_fields: 310 return 0 311 312 fields_to_delete = set() # Delete all annotations with this field ID 313 fields_to_update = {} # Update values of annotations with this field ID 314 old_options = {} 315 316 # Loop through the old annotation fields 317 for old_field_id, old_field in old_fields.items(): 318 319 # Delete all annotations of this type if the field is deleted. 320 if old_field_id not in new_fields: 321 fields_to_delete.add(old_field_id) 322 continue 323 324 field_id = old_field_id 325 new_field = new_fields[field_id] 326 327 # If the annotation type has changed, also delete existing annotations, 328 # except between text and textarea, where we can just change the type and keep the text. 329 if old_field["type"] != new_field["type"]: 330 if old_field["type"] not in text_fields and new_field["type"] not in text_fields: 331 fields_to_delete.add(field_id) 332 continue 333 334 # Loop through all the key/values in the new field settings 335 # and update in case it's different from the old values. 
336 update_data = {} 337 for field_key, field_value in new_field.items(): 338 339 # Update if values don't match 340 if field_value != old_field.get(field_key): 341 342 # Special case: option values that are removed/renamed. 343 # Here we may have to change/delete values within the 344 # values column. 345 if field_key == "options": 346 347 new_options = field_value 348 349 # Edge case: delete annotations of this type if all option fields are deleted 350 if not new_options: 351 fields_to_delete.add(field_id) 352 continue 353 354 # Changed options values (e.g. renamed or one field deleted) 355 old_options[old_field_id] = old_field.get("options", {}) 356 options_to_update = {} 357 if old_options[old_field_id] and old_options != new_options: 358 options_to_update = new_options 359 360 if options_to_update: 361 update_data[field_key] = {"options": options_to_update} 362 363 # For all other changes, just overwrite with new data. 364 else: 365 update_data[field_key] = field_value 366 367 if update_data: 368 fields_to_update[field_id] = update_data 369 370 # Delete annotations 371 if fields_to_delete: 372 Annotation.delete_many(db, field_id=list(fields_to_delete)) 373 374 # Write changes to fields to database 375 count = 0 376 if fields_to_update: 377 for field_id, updates in fields_to_update.items(): 378 379 # Write to db 380 for column, update_value in updates.items(): 381 382 update_value_insert = update_value 383 if column == "options": 384 update_value_insert = ",".join(list(update_value["options"].values())) 385 386 # Change values of columns 387 updates = db.update("annotations", {column: update_value_insert}, 388 where={"dataset": dataset_key, "field_id": field_id}) 389 count += updates 390 391 # Special case: Changed option labels. 392 # Here we have to also rename/remove inserted options from the `value` column. 
393 if column == "options": 394 395 annotations = db.fetchall("SELECT id, options, value FROM annotations " 396 "WHERE dataset = '%s' and field_id = '%s' AND value != '';" 397 % (dataset_key, field_id)) 398 399 for annotation in annotations: 400 annotation_id = annotation["id"] 401 annotation_values = annotation["value"].split(",") 402 403 # Remove or rename options 404 new_values = [] 405 new_options = update_value["options"] # Dict with option id->label as items 406 407 for ann_value in annotation_values: 408 # Get the option ID, so we can see if it's new, deleted, or renamed. 409 # Should always be present in old options dict 410 option_id = [k for k, v in old_options[field_id].items() if v == ann_value][0] 411 # Deleted... 412 if option_id not in new_options: 413 continue 414 # Or replaced with a new, possibly renamed value 415 else: 416 new_values.append(new_options[option_id]) 417 418 new_values = ",".join(new_values) 419 db.update("annotations", {"value": new_values}, where={"id": annotation_id}) 420 421 return count
Updates annotations in the annotations table if the input fields themselves have been changed, for instance if a dropdown label is renamed or a field is deleted.
Parameters
- str dataset_key: The dataset key for which fields changed.
- dict old_fields: Old annotation fields.
- dict new_fields: New annotation fields; this should contain not just the additions, but all fields, changed or otherwise.
- db: Database object so we can write.
Returns
How many records were affected.