Edit on GitHub

common.lib.annotation

Annotation class

  1"""
  2Annotation class
  3"""
  4
  5
  6import time
  7import json
  8
  9from common.lib.database import Database
 10from common.lib.exceptions import AnnotationException
 11
 12
 13class Annotation:
 14    """
 15    Annotation class
 16
 17    Annotations are always tied to a dataset, a dataset item (e.g. a csv row),
 18    an annotation label, and a type ('text', 'multichoice', etc.).
 19
 20    """
 21
 22    # Attributes must be created here to ensure getattr and setattr work properly
 23
 24    data = None
 25    db = None
 26
 27    id = None                 # Unique ID for this annotation
 28    item_id = None            # ID of the item that this annotation was made for, e.g. a post ID.
 29    field_id = None           # ID for the annotation field
 30    dataset = None            # Dataset key this annotation is generated from
 31    timestamp = None          # When this annotation was edited
 32    timestamp_created = None  # When this annotation was created
 33    label = None              # Label of annotation
 34    type = None               # Type of annotation (e.g. `text`)
 35    options = None            # Possible options
 36    value = None              # The actual annotation value
 37    author = None             # Who last edited the annotation
 38    author_original = None    # Who originally made the annotation
 39    by_processor = None       # Whether the annotation was made by a processor
 40    from_dataset = None       # Processor-made dataset key this annotation was generated as part of
 41    metadata = None           # Misc metadata
 42
 43    def __init__(self, data=None, annotation_id=None, db=None):
 44        """
 45        Instantiate annotation object.
 46
 47        :param data:            Annotation data; should correspond to the annotations table record.
 48        :param annotation_id:   The ID of an annotation. If given, it retrieves the annotation
 49                                from the database.
 50        :param db:              Database connection object
 51        """
 52
 53        required_fields = ["field_id", "item_id", "dataset"]
 54
 55        # Must have an ID or data
 56        if (annotation_id is None and data is None) or (data is not None and not isinstance(data, dict)):
 57            raise AnnotationException("Annotation() requires either a valid `data` dictionary or ID.")
 58
 59        if not db:
 60            raise AnnotationException("Annotation() needs a `db` database object")
 61
 62        self.db = db
 63
 64        new_or_updated = False
 65
 66        # Get the annotation data if the ID is given; if an annotation has
 67        # an ID, it is guaranteed to be in the database.
 68        # IDs can both be explicitly given or present in the data dict.
 69        if annotation_id is not None or "id" in data:
 70            if data and "id" in data:
 71                annotation_id = data["id"]
 72            self.id = annotation_id  # IDs correspond to unique serial numbers in the database.
 73            current = self.get_by_id(annotation_id)
 74            if not current:
 75                raise AnnotationException(
 76                    "Annotation() requires a valid ID for an existing annotation, %s given" % annotation_id)
 77
 78        # If an ID is not given, get or create an Annotation object from its data.
 79        # First check if required fields are present in `data`.
 80        else:
 81            for required_field in required_fields:
 82                if required_field not in data or not data[required_field]:
 83                    raise AnnotationException("Annotation() requires a %s field" % required_field)
 84
 85            # Check if this annotation already exists, based on dataset key, item id, and label.
 86            current = self.get_by_field(data["dataset"], data["item_id"], data["field_id"])
 87
 88        # If we were able to retrieve an annotation from the db, it already exists
 89        if current:
 90            # Check if we have to overwrite old data with new data
 91            if data:
 92                for key, value in data.items():
 93                    # Save unknown fields in metadata
 94                    if key not in current:
 95                        current["metadata"][key] = value
 96                        new_or_updated = True
 97                    # If values differ, update the value
 98                    elif current[key] != value:
 99                        current[key] = value
100                        new_or_updated = True
101
102            self.data = current
103
104        # If this is a new annotation, set all the properties.
105        else:
106            # Keep track of when the annotation was made
107            created_timestamp = int(time.time())
108
109            new_data = {
110                "dataset": data["dataset"],
111                "item_id": data["item_id"],
112                "field_id": data["field_id"],
113                "timestamp": created_timestamp,
114                "timestamp_created": created_timestamp,
115                "label": data["label"],
116                "type": data.get("type", "text"),
117                "options": data.get("options", ""),
118                "value": data.get("value", ""),
119                "author": data.get("author", ""),
120                "author_original": data.get("author", ""),
121                "by_processor": data.get("by_processor", False),
122                "from_dataset": data.get("from_dataset", ""),
123                "metadata": data.get("metadata", {}),
124            }
125
126            self.data = new_data
127            new_or_updated = True
128
129        if isinstance(self.data["metadata"], str):
130            try:
131                self.metadata = json.loads(self.data["metadata"])
132            except (TypeError, json.JSONDecodeError):
133                self.metadata = {}
134
135        for k, v in self.data.items():
136            # Some type checking
137            try:
138                if k == "timestamp" or k == "timestamp_created":
139                    v = int(v)
140                elif k == "by_processor":
141                    v = bool(v)
142            except ValueError as e:
143                raise AnnotationException("Annotation fields are not of the right type (%s)" % e)
144            self.__setattr__(k, v)
145
146        # Write to db if anything changed
147        if new_or_updated:
148            self.timestamp = int(time.time())
149            self.write_to_db()
150
151    def get_by_id(self, annotation_id: int):
152        """
153        Get annotation by ID
154
155        :param str annotation_id:  ID of annotation.
156        :return:  Annotation object, or an empty dict if the ID doesn't exist.
157        """
158
159        try:
160            int(annotation_id)
161        except ValueError:
162            raise AnnotationException("Id '%s' is not valid" % annotation_id)
163
164        data = self.db.fetchone("SELECT * FROM annotations WHERE id = %s" % annotation_id)
165
166        if not data:
167            return {}
168
169        if data["type"] == "checkbox":
170            data["value"] = data["value"].split(",")
171        data["metadata"] = json.loads(data["metadata"])
172
173        return data
174
175    def get_by_field(self, dataset_key: str, item_id: str, field_id: str) -> dict:
176        """
177        Get the annotation information via its dataset key, item ID, and field_id.
178        This is always a unique combination.
179
180        :param dataset_key:     The key of the dataset this annotation was made for.
181        :param item_id:         The ID of the item this annotation was made for.
182        :param field_id:        The field ID of the annotation.
183
184        :return data: A dict with data of the retrieved annotation, or an empty dict if it doesn't exist.
185        """
186
187        data = self.db.fetchone("SELECT * FROM annotations WHERE dataset = %s AND item_id = %s AND field_id = %s",
188                                (dataset_key, str(item_id), field_id))
189        if not data:
190            return {}
191
192        if data["type"] == "checkbox":
193            data["value"] = data["value"].split(",")
194        data["metadata"] = json.loads(data["metadata"])
195
196        return data
197
198    def write_to_db(self):
199        """
200        Write an annotation to the database.
201        """
202        db_data = self.data
203
204        db_data["timestamp"] = int(time.time())
205        m = db_data["metadata"]  # To avoid circular reference error
206        db_data["metadata"] = json.dumps(m)
207        if db_data["type"] == "checkbox":
208            db_data["value"] = ",".join(db_data["value"])
209
210        return self.db.upsert("annotations", data=db_data, constraints=["field_id", "dataset", "item_id"])
211
212    def delete(self):
213        """
214        Deletes this annotation
215        """
216        return self.db.delete("annotations", {"id": self.id})
217
218
219    @staticmethod
220    def get_annotations_for_dataset(db: Database, dataset_key: str, item_id=None, before=0) -> list:
221        """
222        Returns all annotations for a dataset.
223        :param db:                  Database object.
224        :param str dataset_key:     A dataset key.
225        :param str item_id:         An optional item ID or multiple item IDs to only get annotations from specific
226                                    items
227        :param int before:          The upper timestamp for annotations to retrieve.
228
229        :return list: List with annotations.
230        """
231        if not dataset_key:
232            return []
233
234        if not before:
235            before = int(time.time())
236
237        if item_id:
238            # Normalise to strings so the ANY operator receives a proper text[]
239            if isinstance(item_id, (list, tuple, set)):
240                iterable_ids = item_id
241            else:
242                iterable_ids = [item_id]
243
244            item_ids = [str(i) for i in iterable_ids if i is not None]
245            if not item_ids:
246                return []
247
248            data = db.fetchall(
249                "SELECT * FROM annotations WHERE dataset = %s AND item_id = ANY(%s::text[]) AND timestamp <= %s",
250                (dataset_key, item_ids, before,)
251            )
252        else:
253            data = db.fetchall("SELECT * FROM annotations WHERE dataset = %s AND timestamp <= %s", (dataset_key, before,))
254        if not data:
255            return []
256
257        for i in range(len(data)):
258            if data[i]["type"] == "checkbox":
259                data[i]["value"] = data[i]["value"].split(",")
260            data[i]["metadata"] = json.loads(data[i]["metadata"])
261
262        return [Annotation(data=d, db=db) for d in data]
263
264    @staticmethod
265    def delete_many(db: Database, dataset_key=None, annotation_id=None, field_id=None):
266        """
267        Deletes annotations for an entire dataset or by a list of (field) IDs.
268
269        :param db:                  Database object.
270        :param str dataset_key:     A dataset key.
271        :param li annotation_id:	A list or string of unique annotation IDs.
272        :param li field_id:		    A list or string of IDs for annotation fields.
273
274        :return int: The number of removed records.
275        """
276        if not dataset_key and not annotation_id and not field_id:
277            return 0
278
279        where = {}
280        if dataset_key:
281            where["dataset"] = dataset_key
282        if annotation_id:
283            where["id"] = annotation_id
284        if field_id:
285            where["field_id"] = field_id
286
287        return db.delete("annotations", where)
288
289    @staticmethod
290    def update_annotations_via_fields(dataset_key: str, old_fields: dict, new_fields: dict, db: Database) -> int:
291        """
292        Updates annotations in the annotations table if the input fields
293        themselves have been changed, for instance if a dropdown label is renamed
294        or a field is deleted.
295
296        :param str  dataset_key:    The dataset key for which fields changed.
297        :param dict old_fields:	    Old annotation fields.
298        :param dict new_fields:	    New annotation fields; this should contain not just
299                                    the additions, but all fields, changed or otherwise.
300        :param db:                  Database object so we can write.
301
302        :returns int:               How many records were affected.
303        """
304
305        text_fields = ["textarea", "text"]
306
307        # If old and new fields are identical, do nothing.
308        if old_fields == new_fields:
309            return 0
310
311        fields_to_delete = set()        # Delete all annotations with this field ID
312        fields_to_update = {}           # Update values of annotations with this field ID
313        old_options = {}
314
315        # Loop through the old annotation fields
316        for old_field_id, old_field in old_fields.items():
317
318            # Delete all annotations of this type if the field is deleted.
319            if old_field_id not in new_fields:
320                fields_to_delete.add(old_field_id)
321                continue
322
323            field_id = old_field_id
324            new_field = new_fields[field_id]
325
326            # If the annotation type has changed, also delete existing annotations,
327            # except between text and textarea, where we can just change the type and keep the text.
328            if old_field["type"] != new_field["type"]:
329                if old_field["type"] not in text_fields and new_field["type"] not in text_fields:
330                    fields_to_delete.add(field_id)
331                    continue
332
333            # Loop through all the key/values in the new field settings
334            # and update in case it's different from the old values.
335            update_data = {}
336            for field_key, field_value in new_field.items():
337
338                # Update if values don't match
339                if field_value != old_field.get(field_key):
340
341                    # Special case: option values that are removed/renamed.
342                    # Here we may have to change/delete values within the
343                    # values column.
344                    if field_key == "options":
345
346                        new_options = field_value
347
348                        # Edge case: delete annotations of this type if all option fields are deleted
349                        if not new_options:
350                            fields_to_delete.add(field_id)
351                            continue
352
353                        # Changed options values (e.g. renamed or one field deleted)
354                        old_options[old_field_id] = old_field.get("options", {})
355                        options_to_update = {}
356                        if old_options[old_field_id] and old_options != new_options:
357                            options_to_update = new_options
358
359                        if options_to_update:
360                            update_data[field_key] = {"options": options_to_update}
361
362                    # For all other changes, just overwrite with new data.
363                    else:
364                        update_data[field_key] = field_value
365
366            if update_data:
367                fields_to_update[field_id] = update_data
368
369        # Delete annotations
370        if fields_to_delete:
371            Annotation.delete_many(db, field_id=list(fields_to_delete))
372
373        # Write changes to fields to database
374        count = 0
375        if fields_to_update:
376            for field_id, updates in fields_to_update.items():
377
378                # Write to db
379                for column, update_value in updates.items():
380
381                    update_value_insert = update_value
382                    if column == "options":
383                        update_value_insert = ",".join(list(update_value["options"].values()))
384
385                    # Change values of columns
386                    updates = db.update("annotations", {column: update_value_insert},
387                                        where={"dataset": dataset_key, "field_id": field_id})
388                    count += updates
389
390                    # Special case: Changed option labels.
391                    # Here we have to also rename/remove inserted options from the `value` column.
392                    if column == "options":
393
394                        annotations = db.fetchall("SELECT id, options, value FROM annotations "
395                                                  "WHERE dataset = '%s' and field_id = '%s' AND value != '';"
396                                                  % (dataset_key, field_id))
397
398                        for annotation in annotations:
399                            annotation_id = annotation["id"]
400                            annotation_values = annotation["value"].split(",")
401
402                            # Remove or rename options
403                            new_values = []
404                            new_options = update_value["options"]  # Dict with option id->label as items
405
406                            for ann_value in annotation_values:
407                                # Get the option ID, so we can see if it's new, deleted, or renamed.
408                                # Should always be present in old options dict
409                                option_id = [k for k, v in old_options[field_id].items() if v == ann_value][0]
410                                # Deleted...
411                                if option_id not in new_options:
412                                    continue
413                                # Or replaced with a new, possibly renamed value
414                                else:
415                                    new_values.append(new_options[option_id])
416
417                            new_values = ",".join(new_values)
418                            db.update("annotations", {"value": new_values}, where={"id": annotation_id})
419
420        return count
421
422    def __getattr__(self, attr):
423        """
424        Getter so we don't have to use .data all the time
425
426        :param attr:  Data key to get
427        :return:  Value
428        """
429
430        if attr in dir(self):
431            # an explicitly defined attribute should always be called in favour
432            # of this passthrough
433            attribute = getattr(self, attr)
434            return attribute
435        elif attr in self.data:
436            return self.data[attr]
437        else:
438            raise AttributeError("Annotation instance has no attribute %s" % attr)
439
440    def __setattr__(self, attr, value):
441        """
442        Setter so we can flexibly update the database
443
444        Also updates internal data stores (.data etc.). If the attribute is
445        unknown, it is stored within the 'metadata' attribute.
446
447        :param str attr:  Attribute to update
448        :param value:  New value
449        """
450
451        # don't override behaviour for *actual* class attributes
452        if attr in dir(self):
453            super().__setattr__(attr, value)
454            return
455
456        if attr not in self.data:
457            self.metadata[attr] = value
458            attr = "metadata"
459            value = self.metadata
460
461        if attr == "metadata":
462            value = json.dumps(value)
463
464        self.db.update("annotations", where={"id": self.id}, data={attr: value})
465
466        self.data[attr] = value
467        if attr == "metadata":
468            self.metadata = json.loads(value)
class Annotation:
 14class Annotation:
 15    """
 16    Annotation class
 17
 18    Annotations are always tied to a dataset, a dataset item (e.g. a csv row),
 19    an annotation label, and a type ('text', 'multichoice', etc.).
 20
 21    """
 22
 23    # Attributes must be created here to ensure getattr and setattr work properly
 24
 25    data = None
 26    db = None
 27
 28    id = None                 # Unique ID for this annotation
 29    item_id = None            # ID of the item that this annotation was made for, e.g. a post ID.
 30    field_id = None           # ID for the annotation field
 31    dataset = None            # Dataset key this annotation is generated from
 32    timestamp = None          # When this annotation was edited
 33    timestamp_created = None  # When this annotation was created
 34    label = None              # Label of annotation
 35    type = None               # Type of annotation (e.g. `text`)
 36    options = None            # Possible options
 37    value = None              # The actual annotation value
 38    author = None             # Who last edited the annotation
 39    author_original = None    # Who originally made the annotation
 40    by_processor = None       # Whether the annotation was made by a processor
 41    from_dataset = None       # Processor-made dataset key this annotation was generated as part of
 42    metadata = None           # Misc metadata
 43
 44    def __init__(self, data=None, annotation_id=None, db=None):
 45        """
 46        Instantiate annotation object.
 47
 48        :param data:            Annotation data; should correspond to the annotations table record.
 49        :param annotation_id:   The ID of an annotation. If given, it retrieves the annotation
 50                                from the database.
 51        :param db:              Database connection object
 52        """
 53
 54        required_fields = ["field_id", "item_id", "dataset"]
 55
 56        # Must have an ID or data
 57        if (annotation_id is None and data is None) or (data is not None and not isinstance(data, dict)):
 58            raise AnnotationException("Annotation() requires either a valid `data` dictionary or ID.")
 59
 60        if not db:
 61            raise AnnotationException("Annotation() needs a `db` database object")
 62
 63        self.db = db
 64
 65        new_or_updated = False
 66
 67        # Get the annotation data if the ID is given; if an annotation has
 68        # an ID, it is guaranteed to be in the database.
 69        # IDs can both be explicitly given or present in the data dict.
 70        if annotation_id is not None or "id" in data:
 71            if data and "id" in data:
 72                annotation_id = data["id"]
 73            self.id = annotation_id  # IDs correspond to unique serial numbers in the database.
 74            current = self.get_by_id(annotation_id)
 75            if not current:
 76                raise AnnotationException(
 77                    "Annotation() requires a valid ID for an existing annotation, %s given" % annotation_id)
 78
 79        # If an ID is not given, get or create an Annotation object from its data.
 80        # First check if required fields are present in `data`.
 81        else:
 82            for required_field in required_fields:
 83                if required_field not in data or not data[required_field]:
 84                    raise AnnotationException("Annotation() requires a %s field" % required_field)
 85
 86            # Check if this annotation already exists, based on dataset key, item id, and label.
 87            current = self.get_by_field(data["dataset"], data["item_id"], data["field_id"])
 88
 89        # If we were able to retrieve an annotation from the db, it already exists
 90        if current:
 91            # Check if we have to overwrite old data with new data
 92            if data:
 93                for key, value in data.items():
 94                    # Save unknown fields in metadata
 95                    if key not in current:
 96                        current["metadata"][key] = value
 97                        new_or_updated = True
 98                    # If values differ, update the value
 99                    elif current[key] != value:
100                        current[key] = value
101                        new_or_updated = True
102
103            self.data = current
104
105        # If this is a new annotation, set all the properties.
106        else:
107            # Keep track of when the annotation was made
108            created_timestamp = int(time.time())
109
110            new_data = {
111                "dataset": data["dataset"],
112                "item_id": data["item_id"],
113                "field_id": data["field_id"],
114                "timestamp": created_timestamp,
115                "timestamp_created": created_timestamp,
116                "label": data["label"],
117                "type": data.get("type", "text"),
118                "options": data.get("options", ""),
119                "value": data.get("value", ""),
120                "author": data.get("author", ""),
121                "author_original": data.get("author", ""),
122                "by_processor": data.get("by_processor", False),
123                "from_dataset": data.get("from_dataset", ""),
124                "metadata": data.get("metadata", {}),
125            }
126
127            self.data = new_data
128            new_or_updated = True
129
130        if isinstance(self.data["metadata"], str):
131            try:
132                self.metadata = json.loads(self.data["metadata"])
133            except (TypeError, json.JSONDecodeError):
134                self.metadata = {}
135
136        for k, v in self.data.items():
137            # Some type checking
138            try:
139                if k == "timestamp" or k == "timestamp_created":
140                    v = int(v)
141                elif k == "by_processor":
142                    v = bool(v)
143            except ValueError as e:
144                raise AnnotationException("Annotation fields are not of the right type (%s)" % e)
145            self.__setattr__(k, v)
146
147        # Write to db if anything changed
148        if new_or_updated:
149            self.timestamp = int(time.time())
150            self.write_to_db()
151
152    def get_by_id(self, annotation_id: int):
153        """
154        Get annotation by ID
155
156        :param str annotation_id:  ID of annotation.
157        :return:  Annotation object, or an empty dict if the ID doesn't exist.
158        """
159
160        try:
161            int(annotation_id)
162        except ValueError:
163            raise AnnotationException("Id '%s' is not valid" % annotation_id)
164
165        data = self.db.fetchone("SELECT * FROM annotations WHERE id = %s" % annotation_id)
166
167        if not data:
168            return {}
169
170        if data["type"] == "checkbox":
171            data["value"] = data["value"].split(",")
172        data["metadata"] = json.loads(data["metadata"])
173
174        return data
175
176    def get_by_field(self, dataset_key: str, item_id: str, field_id: str) -> dict:
177        """
178        Get the annotation information via its dataset key, item ID, and field_id.
179        This is always a unique combination.
180
181        :param dataset_key:     The key of the dataset this annotation was made for.
182        :param item_id:         The ID of the item this annotation was made for.
183        :param field_id:        The field ID of the annotation.
184
185        :return data: A dict with data of the retrieved annotation, or an empty dict if it doesn't exist.
186        """
187
188        data = self.db.fetchone("SELECT * FROM annotations WHERE dataset = %s AND item_id = %s AND field_id = %s",
189                                (dataset_key, str(item_id), field_id))
190        if not data:
191            return {}
192
193        if data["type"] == "checkbox":
194            data["value"] = data["value"].split(",")
195        data["metadata"] = json.loads(data["metadata"])
196
197        return data
198
199    def write_to_db(self):
200        """
201        Write an annotation to the database.
202        """
203        db_data = self.data
204
205        db_data["timestamp"] = int(time.time())
206        m = db_data["metadata"]  # To avoid circular reference error
207        db_data["metadata"] = json.dumps(m)
208        if db_data["type"] == "checkbox":
209            db_data["value"] = ",".join(db_data["value"])
210
211        return self.db.upsert("annotations", data=db_data, constraints=["field_id", "dataset", "item_id"])
212
213    def delete(self):
214        """
215        Deletes this annotation
216        """
217        return self.db.delete("annotations", {"id": self.id})
218
219
220    @staticmethod
221    def get_annotations_for_dataset(db: Database, dataset_key: str, item_id=None, before=0) -> list:
222        """
223        Returns all annotations for a dataset.
224        :param db:                  Database object.
225        :param str dataset_key:     A dataset key.
226        :param str item_id:         An optional item ID or multiple item IDs to only get annotations from specific
227                                    items
228        :param int before:          The upper timestamp for annotations to retrieve.
229
230        :return list: List with annotations.
231        """
232        if not dataset_key:
233            return []
234
235        if not before:
236            before = int(time.time())
237
238        if item_id:
239            # Normalise to strings so the ANY operator receives a proper text[]
240            if isinstance(item_id, (list, tuple, set)):
241                iterable_ids = item_id
242            else:
243                iterable_ids = [item_id]
244
245            item_ids = [str(i) for i in iterable_ids if i is not None]
246            if not item_ids:
247                return []
248
249            data = db.fetchall(
250                "SELECT * FROM annotations WHERE dataset = %s AND item_id = ANY(%s::text[]) AND timestamp <= %s",
251                (dataset_key, item_ids, before,)
252            )
253        else:
254            data = db.fetchall("SELECT * FROM annotations WHERE dataset = %s AND timestamp <= %s", (dataset_key, before,))
255        if not data:
256            return []
257
258        for i in range(len(data)):
259            if data[i]["type"] == "checkbox":
260                data[i]["value"] = data[i]["value"].split(",")
261            data[i]["metadata"] = json.loads(data[i]["metadata"])
262
263        return [Annotation(data=d, db=db) for d in data]
264
265    @staticmethod
266    def delete_many(db: Database, dataset_key=None, annotation_id=None, field_id=None):
267        """
268        Deletes annotations for an entire dataset or by a list of (field) IDs.
269
270        :param db:                  Database object.
271        :param str dataset_key:     A dataset key.
272        :param li annotation_id:	A list or string of unique annotation IDs.
273        :param li field_id:		    A list or string of IDs for annotation fields.
274
275        :return int: The number of removed records.
276        """
277        if not dataset_key and not annotation_id and not field_id:
278            return 0
279
280        where = {}
281        if dataset_key:
282            where["dataset"] = dataset_key
283        if annotation_id:
284            where["id"] = annotation_id
285        if field_id:
286            where["field_id"] = field_id
287
288        return db.delete("annotations", where)
289
290    @staticmethod
291    def update_annotations_via_fields(dataset_key: str, old_fields: dict, new_fields: dict, db: Database) -> int:
292        """
293        Updates annotations in the annotations table if the input fields
294        themselves have been changed, for instance if a dropdown label is renamed
295        or a field is deleted.
296
297        :param str  dataset_key:    The dataset key for which fields changed.
298        :param dict old_fields:	    Old annotation fields.
299        :param dict new_fields:	    New annotation fields; this should contain not just
300                                    the additions, but all fields, changed or otherwise.
301        :param db:                  Database object so we can write.
302
303        :returns int:               How many records were affected.
304        """
305
306        text_fields = ["textarea", "text"]
307
308        # If old and new fields are identical, do nothing.
309        if old_fields == new_fields:
310            return 0
311
312        fields_to_delete = set()        # Delete all annotations with this field ID
313        fields_to_update = {}           # Update values of annotations with this field ID
314        old_options = {}
315
316        # Loop through the old annotation fields
317        for old_field_id, old_field in old_fields.items():
318
319            # Delete all annotations of this type if the field is deleted.
320            if old_field_id not in new_fields:
321                fields_to_delete.add(old_field_id)
322                continue
323
324            field_id = old_field_id
325            new_field = new_fields[field_id]
326
327            # If the annotation type has changed, also delete existing annotations,
328            # except between text and textarea, where we can just change the type and keep the text.
329            if old_field["type"] != new_field["type"]:
330                if old_field["type"] not in text_fields and new_field["type"] not in text_fields:
331                    fields_to_delete.add(field_id)
332                    continue
333
334            # Loop through all the key/values in the new field settings
335            # and update in case it's different from the old values.
336            update_data = {}
337            for field_key, field_value in new_field.items():
338
339                # Update if values don't match
340                if field_value != old_field.get(field_key):
341
342                    # Special case: option values that are removed/renamed.
343                    # Here we may have to change/delete values within the
344                    # values column.
345                    if field_key == "options":
346
347                        new_options = field_value
348
349                        # Edge case: delete annotations of this type if all option fields are deleted
350                        if not new_options:
351                            fields_to_delete.add(field_id)
352                            continue
353
354                        # Changed options values (e.g. renamed or one field deleted)
355                        old_options[old_field_id] = old_field.get("options", {})
356                        options_to_update = {}
357                        if old_options[old_field_id] and old_options != new_options:
358                            options_to_update = new_options
359
360                        if options_to_update:
361                            update_data[field_key] = {"options": options_to_update}
362
363                    # For all other changes, just overwrite with new data.
364                    else:
365                        update_data[field_key] = field_value
366
367            if update_data:
368                fields_to_update[field_id] = update_data
369
370        # Delete annotations
371        if fields_to_delete:
372            Annotation.delete_many(db, field_id=list(fields_to_delete))
373
374        # Write changes to fields to database
375        count = 0
376        if fields_to_update:
377            for field_id, updates in fields_to_update.items():
378
379                # Write to db
380                for column, update_value in updates.items():
381
382                    update_value_insert = update_value
383                    if column == "options":
384                        update_value_insert = ",".join(list(update_value["options"].values()))
385
386                    # Change values of columns
387                    updates = db.update("annotations", {column: update_value_insert},
388                                        where={"dataset": dataset_key, "field_id": field_id})
389                    count += updates
390
391                    # Special case: Changed option labels.
392                    # Here we have to also rename/remove inserted options from the `value` column.
393                    if column == "options":
394
395                        annotations = db.fetchall("SELECT id, options, value FROM annotations "
396                                                  "WHERE dataset = '%s' and field_id = '%s' AND value != '';"
397                                                  % (dataset_key, field_id))
398
399                        for annotation in annotations:
400                            annotation_id = annotation["id"]
401                            annotation_values = annotation["value"].split(",")
402
403                            # Remove or rename options
404                            new_values = []
405                            new_options = update_value["options"]  # Dict with option id->label as items
406
407                            for ann_value in annotation_values:
408                                # Get the option ID, so we can see if it's new, deleted, or renamed.
409                                # Should always be present in old options dict
410                                option_id = [k for k, v in old_options[field_id].items() if v == ann_value][0]
411                                # Deleted...
412                                if option_id not in new_options:
413                                    continue
414                                # Or replaced with a new, possibly renamed value
415                                else:
416                                    new_values.append(new_options[option_id])
417
418                            new_values = ",".join(new_values)
419                            db.update("annotations", {"value": new_values}, where={"id": annotation_id})
420
421        return count
422
423    def __getattr__(self, attr):
424        """
425        Getter so we don't have to use .data all the time
426
427        :param attr:  Data key to get
428        :return:  Value
429        """
430
431        if attr in dir(self):
432            # an explicitly defined attribute should always be called in favour
433            # of this passthrough
434            attribute = getattr(self, attr)
435            return attribute
436        elif attr in self.data:
437            return self.data[attr]
438        else:
439            raise AttributeError("Annotation instance has no attribute %s" % attr)
440
441    def __setattr__(self, attr, value):
442        """
443        Setter so we can flexibly update the database
444
445        Also updates internal data stores (.data etc.). If the attribute is
446        unknown, it is stored within the 'metadata' attribute.
447
448        :param str attr:  Attribute to update
449        :param value:  New value
450        """
451
452        # don't override behaviour for *actual* class attributes
453        if attr in dir(self):
454            super().__setattr__(attr, value)
455            return
456
457        if attr not in self.data:
458            self.metadata[attr] = value
459            attr = "metadata"
460            value = self.metadata
461
462        if attr == "metadata":
463            value = json.dumps(value)
464
465        self.db.update("annotations", where={"id": self.id}, data={attr: value})
466
467        self.data[attr] = value
468        if attr == "metadata":
469            self.metadata = json.loads(value)

Annotation class

Annotations are always tied to a dataset, a dataset item (e.g. a csv row), an annotation label, and a type ('text', 'multichoice', etc.).

Annotation(data=None, annotation_id=None, db=None)
 44    def __init__(self, data=None, annotation_id=None, db=None):
 45        """
 46        Instantiate annotation object.
 47
 48        :param data:            Annotation data; should correspond to the annotations table record.
 49        :param annotation_id:   The ID of an annotation. If given, it retrieves the annotation
 50                                from the database.
 51        :param db:              Database connection object
 52        """
 53
 54        required_fields = ["field_id", "item_id", "dataset"]
 55
 56        # Must have an ID or data
 57        if (annotation_id is None and data is None) or (data is not None and not isinstance(data, dict)):
 58            raise AnnotationException("Annotation() requires either a valid `data` dictionary or ID.")
 59
 60        if not db:
 61            raise AnnotationException("Annotation() needs a `db` database object")
 62
 63        self.db = db
 64
 65        new_or_updated = False
 66
 67        # Get the annotation data if the ID is given; if an annotation has
 68        # an ID, it is guaranteed to be in the database.
 69        # IDs can both be explicitly given or present in the data dict.
 70        if annotation_id is not None or "id" in data:
 71            if data and "id" in data:
 72                annotation_id = data["id"]
 73            self.id = annotation_id  # IDs correspond to unique serial numbers in the database.
 74            current = self.get_by_id(annotation_id)
 75            if not current:
 76                raise AnnotationException(
 77                    "Annotation() requires a valid ID for an existing annotation, %s given" % annotation_id)
 78
 79        # If an ID is not given, get or create an Annotation object from its data.
 80        # First check if required fields are present in `data`.
 81        else:
 82            for required_field in required_fields:
 83                if required_field not in data or not data[required_field]:
 84                    raise AnnotationException("Annotation() requires a %s field" % required_field)
 85
 86            # Check if this annotation already exists, based on dataset key, item id, and label.
 87            current = self.get_by_field(data["dataset"], data["item_id"], data["field_id"])
 88
 89        # If we were able to retrieve an annotation from the db, it already exists
 90        if current:
 91            # Check if we have to overwrite old data with new data
 92            if data:
 93                for key, value in data.items():
 94                    # Save unknown fields in metadata
 95                    if key not in current:
 96                        current["metadata"][key] = value
 97                        new_or_updated = True
 98                    # If values differ, update the value
 99                    elif current[key] != value:
100                        current[key] = value
101                        new_or_updated = True
102
103            self.data = current
104
105        # If this is a new annotation, set all the properties.
106        else:
107            # Keep track of when the annotation was made
108            created_timestamp = int(time.time())
109
110            new_data = {
111                "dataset": data["dataset"],
112                "item_id": data["item_id"],
113                "field_id": data["field_id"],
114                "timestamp": created_timestamp,
115                "timestamp_created": created_timestamp,
116                "label": data["label"],
117                "type": data.get("type", "text"),
118                "options": data.get("options", ""),
119                "value": data.get("value", ""),
120                "author": data.get("author", ""),
121                "author_original": data.get("author", ""),
122                "by_processor": data.get("by_processor", False),
123                "from_dataset": data.get("from_dataset", ""),
124                "metadata": data.get("metadata", {}),
125            }
126
127            self.data = new_data
128            new_or_updated = True
129
130        if isinstance(self.data["metadata"], str):
131            try:
132                self.metadata = json.loads(self.data["metadata"])
133            except (TypeError, json.JSONDecodeError):
134                self.metadata = {}
135
136        for k, v in self.data.items():
137            # Some type checking
138            try:
139                if k == "timestamp" or k == "timestamp_created":
140                    v = int(v)
141                elif k == "by_processor":
142                    v = bool(v)
143            except ValueError as e:
144                raise AnnotationException("Annotation fields are not of the right type (%s)" % e)
145            self.__setattr__(k, v)
146
147        # Write to db if anything changed
148        if new_or_updated:
149            self.timestamp = int(time.time())
150            self.write_to_db()

Instantiate annotation object.

Parameters
  • data: Annotation data; should correspond to the annotations table record.
  • annotation_id: The ID of an annotation. If given, it retrieves the annotation from the database.
  • db: Database connection object
data = None
db = None
id = None
item_id = None
field_id = None
dataset = None
timestamp = None
timestamp_created = None
label = None
type = None
options = None
value = None
author = None
author_original = None
by_processor = None
from_dataset = None
metadata = None
def get_by_id(self, annotation_id: int):
152    def get_by_id(self, annotation_id: int):
153        """
154        Get annotation by ID
155
156        :param str annotation_id:  ID of annotation.
157        :return:  Annotation object, or an empty dict if the ID doesn't exist.
158        """
159
160        try:
161            int(annotation_id)
162        except ValueError:
163            raise AnnotationException("Id '%s' is not valid" % annotation_id)
164
165        data = self.db.fetchone("SELECT * FROM annotations WHERE id = %s" % annotation_id)
166
167        if not data:
168            return {}
169
170        if data["type"] == "checkbox":
171            data["value"] = data["value"].split(",")
172        data["metadata"] = json.loads(data["metadata"])
173
174        return data

Get annotation by ID

Parameters
  • annotation_id: ID of the annotation; an integer, or a value that can be converted to one.
Returns

A dict with the annotation's database record, or an empty dict if the ID doesn't exist.

def get_by_field(self, dataset_key: str, item_id: str, field_id: str) -> dict:
176    def get_by_field(self, dataset_key: str, item_id: str, field_id: str) -> dict:
177        """
178        Get the annotation information via its dataset key, item ID, and field_id.
179        This is always a unique combination.
180
181        :param dataset_key:     The key of the dataset this annotation was made for.
182        :param item_id:         The ID of the item this annotation was made for.
183        :param field_id:        The field ID of the annotation.
184
185        :return data: A dict with data of the retrieved annotation, or an empty dict if it doesn't exist.
186        """
187
188        data = self.db.fetchone("SELECT * FROM annotations WHERE dataset = %s AND item_id = %s AND field_id = %s",
189                                (dataset_key, str(item_id), field_id))
190        if not data:
191            return {}
192
193        if data["type"] == "checkbox":
194            data["value"] = data["value"].split(",")
195        data["metadata"] = json.loads(data["metadata"])
196
197        return data

Get the annotation information via its dataset key, item ID, and field_id. This is always a unique combination.

Parameters
  • dataset_key: The key of the dataset this annotation was made for.
  • item_id: The ID of the item this annotation was made for.
  • field_id: The field ID of the annotation.
Returns

A dict with data of the retrieved annotation, or an empty dict if it doesn't exist.

def write_to_db(self):
199    def write_to_db(self):
200        """
201        Write an annotation to the database.
202        """
203        db_data = self.data
204
205        db_data["timestamp"] = int(time.time())
206        m = db_data["metadata"]  # To avoid circular reference error
207        db_data["metadata"] = json.dumps(m)
208        if db_data["type"] == "checkbox":
209            db_data["value"] = ",".join(db_data["value"])
210
211        return self.db.upsert("annotations", data=db_data, constraints=["field_id", "dataset", "item_id"])

Write an annotation to the database.

def delete(self):
213    def delete(self):
214        """
215        Deletes this annotation
216        """
217        return self.db.delete("annotations", {"id": self.id})

Deletes this annotation

@staticmethod
def get_annotations_for_dataset( db: common.lib.database.Database, dataset_key: str, item_id=None, before=0) -> list:
220    @staticmethod
221    def get_annotations_for_dataset(db: Database, dataset_key: str, item_id=None, before=0) -> list:
222        """
223        Returns all annotations for a dataset.
224        :param db:                  Database object.
225        :param str dataset_key:     A dataset key.
226        :param str item_id:         An optional item ID or multiple item IDs to only get annotations from specific
227                                    items
228        :param int before:          The upper timestamp for annotations to retrieve.
229
230        :return list: List with annotations.
231        """
232        if not dataset_key:
233            return []
234
235        if not before:
236            before = int(time.time())
237
238        if item_id:
239            # Normalise to strings so the ANY operator receives a proper text[]
240            if isinstance(item_id, (list, tuple, set)):
241                iterable_ids = item_id
242            else:
243                iterable_ids = [item_id]
244
245            item_ids = [str(i) for i in iterable_ids if i is not None]
246            if not item_ids:
247                return []
248
249            data = db.fetchall(
250                "SELECT * FROM annotations WHERE dataset = %s AND item_id = ANY(%s::text[]) AND timestamp <= %s",
251                (dataset_key, item_ids, before,)
252            )
253        else:
254            data = db.fetchall("SELECT * FROM annotations WHERE dataset = %s AND timestamp <= %s", (dataset_key, before,))
255        if not data:
256            return []
257
258        for i in range(len(data)):
259            if data[i]["type"] == "checkbox":
260                data[i]["value"] = data[i]["value"].split(",")
261            data[i]["metadata"] = json.loads(data[i]["metadata"])
262
263        return [Annotation(data=d, db=db) for d in data]

Returns all annotations for a dataset.

Parameters
  • db: Database object.
  • str dataset_key: A dataset key.
  • str item_id: An optional item ID or multiple item IDs to only get annotations from specific items
  • int before: The upper timestamp for annotations to retrieve.
Returns

List with annotations.

@staticmethod
def delete_many( db: common.lib.database.Database, dataset_key=None, annotation_id=None, field_id=None):
265    @staticmethod
266    def delete_many(db: Database, dataset_key=None, annotation_id=None, field_id=None):
267        """
268        Deletes annotations for an entire dataset or by a list of (field) IDs.
269
270        :param db:                  Database object.
271        :param str dataset_key:     A dataset key.
272        :param li annotation_id:	A list or string of unique annotation IDs.
273        :param li field_id:		    A list or string of IDs for annotation fields.
274
275        :return int: The number of removed records.
276        """
277        if not dataset_key and not annotation_id and not field_id:
278            return 0
279
280        where = {}
281        if dataset_key:
282            where["dataset"] = dataset_key
283        if annotation_id:
284            where["id"] = annotation_id
285        if field_id:
286            where["field_id"] = field_id
287
288        return db.delete("annotations", where)

Deletes annotations for an entire dataset or by a list of (field) IDs.

Parameters
  • db: Database object.
  • str dataset_key: A dataset key.
  • list annotation_id: A list or string of unique annotation IDs.
  • list field_id: A list or string of IDs for annotation fields.
Returns

The number of removed records.

@staticmethod
def update_annotations_via_fields( dataset_key: str, old_fields: dict, new_fields: dict, db: common.lib.database.Database) -> int:
290    @staticmethod
291    def update_annotations_via_fields(dataset_key: str, old_fields: dict, new_fields: dict, db: Database) -> int:
292        """
293        Updates annotations in the annotations table if the input fields
294        themselves have been changed, for instance if a dropdown label is renamed
295        or a field is deleted.
296
297        :param str  dataset_key:    The dataset key for which fields changed.
298        :param dict old_fields:	    Old annotation fields.
299        :param dict new_fields:	    New annotation fields; this should contain not just
300                                    the additions, but all fields, changed or otherwise.
301        :param db:                  Database object so we can write.
302
303        :returns int:               How many records were affected.
304        """
305
306        text_fields = ["textarea", "text"]
307
308        # If old and new fields are identical, do nothing.
309        if old_fields == new_fields:
310            return 0
311
312        fields_to_delete = set()        # Delete all annotations with this field ID
313        fields_to_update = {}           # Update values of annotations with this field ID
314        old_options = {}
315
316        # Loop through the old annotation fields
317        for old_field_id, old_field in old_fields.items():
318
319            # Delete all annotations of this type if the field is deleted.
320            if old_field_id not in new_fields:
321                fields_to_delete.add(old_field_id)
322                continue
323
324            field_id = old_field_id
325            new_field = new_fields[field_id]
326
327            # If the annotation type has changed, also delete existing annotations,
328            # except between text and textarea, where we can just change the type and keep the text.
329            if old_field["type"] != new_field["type"]:
330                if old_field["type"] not in text_fields and new_field["type"] not in text_fields:
331                    fields_to_delete.add(field_id)
332                    continue
333
334            # Loop through all the key/values in the new field settings
335            # and update in case it's different from the old values.
336            update_data = {}
337            for field_key, field_value in new_field.items():
338
339                # Update if values don't match
340                if field_value != old_field.get(field_key):
341
342                    # Special case: option values that are removed/renamed.
343                    # Here we may have to change/delete values within the
344                    # values column.
345                    if field_key == "options":
346
347                        new_options = field_value
348
349                        # Edge case: delete annotations of this type if all option fields are deleted
350                        if not new_options:
351                            fields_to_delete.add(field_id)
352                            continue
353
354                        # Changed options values (e.g. renamed or one field deleted)
355                        old_options[old_field_id] = old_field.get("options", {})
356                        options_to_update = {}
357                        if old_options[old_field_id] and old_options != new_options:
358                            options_to_update = new_options
359
360                        if options_to_update:
361                            update_data[field_key] = {"options": options_to_update}
362
363                    # For all other changes, just overwrite with new data.
364                    else:
365                        update_data[field_key] = field_value
366
367            if update_data:
368                fields_to_update[field_id] = update_data
369
370        # Delete annotations
371        if fields_to_delete:
372            Annotation.delete_many(db, field_id=list(fields_to_delete))
373
374        # Write changes to fields to database
375        count = 0
376        if fields_to_update:
377            for field_id, updates in fields_to_update.items():
378
379                # Write to db
380                for column, update_value in updates.items():
381
382                    update_value_insert = update_value
383                    if column == "options":
384                        update_value_insert = ",".join(list(update_value["options"].values()))
385
386                    # Change values of columns
387                    updates = db.update("annotations", {column: update_value_insert},
388                                        where={"dataset": dataset_key, "field_id": field_id})
389                    count += updates
390
391                    # Special case: Changed option labels.
392                    # Here we have to also rename/remove inserted options from the `value` column.
393                    if column == "options":
394
395                        annotations = db.fetchall("SELECT id, options, value FROM annotations "
396                                                  "WHERE dataset = '%s' and field_id = '%s' AND value != '';"
397                                                  % (dataset_key, field_id))
398
399                        for annotation in annotations:
400                            annotation_id = annotation["id"]
401                            annotation_values = annotation["value"].split(",")
402
403                            # Remove or rename options
404                            new_values = []
405                            new_options = update_value["options"]  # Dict with option id->label as items
406
407                            for ann_value in annotation_values:
408                                # Get the option ID, so we can see if it's new, deleted, or renamed.
409                                # Should always be present in old options dict
410                                option_id = [k for k, v in old_options[field_id].items() if v == ann_value][0]
411                                # Deleted...
412                                if option_id not in new_options:
413                                    continue
414                                # Or replaced with a new, possibly renamed value
415                                else:
416                                    new_values.append(new_options[option_id])
417
418                            new_values = ",".join(new_values)
419                            db.update("annotations", {"value": new_values}, where={"id": annotation_id})
420
421        return count

Updates annotations in the annotations table if the input fields themselves have been changed, for instance if a dropdown label is renamed or a field is deleted.

Parameters
  • str dataset_key: The dataset key for which fields changed.
  • dict old_fields: Old annotation fields.
  • dict new_fields: New annotation fields; this should contain not just the additions, but all fields, changed or otherwise.
  • db: Database object so we can write.

Returns

How many records were affected.