From b0726210c9e3a648f2d8bd9d22c014b557110929 Mon Sep 17 00:00:00 2001 From: Jorrit Schaap <schaap@astron.nl> Date: Fri, 5 Mar 2021 12:46:50 +0100 Subject: [PATCH] TMSS-261: use caching when validating json documents. Major speedup! --- LCS/PyCommon/json_utils.py | 24 +++++-- .../backend/src/tmss/tmssapp/models/common.py | 72 ++++++++++--------- .../src/tmss/tmssapp/models/scheduling.py | 20 +++--- .../src/tmss/tmssapp/models/specification.py | 36 +++++----- 4 files changed, 84 insertions(+), 68 deletions(-) diff --git a/LCS/PyCommon/json_utils.py b/LCS/PyCommon/json_utils.py index f2701985630..963e397174e 100644 --- a/LCS/PyCommon/json_utils.py +++ b/LCS/PyCommon/json_utils.py @@ -19,6 +19,9 @@ import json import jsonschema from copy import deepcopy import requests +from datetime import datetime, timedelta + +DEFAULT_MAX_SCHEMA_CACHE_AGE = timedelta(minutes=1) def _extend_with_default(validator_class): """ @@ -109,7 +112,7 @@ def get_default_json_object_for_schema(schema: str) -> dict: '''return a valid json object for the given schema with all properties with their default values''' return add_defaults_to_json_object_for_schema({}, schema) -def add_defaults_to_json_object_for_schema(json_object: dict, schema: str) -> dict: +def add_defaults_to_json_object_for_schema(json_object: dict, schema: str, cache: dict=None, max_cache_age: timedelta=DEFAULT_MAX_SCHEMA_CACHE_AGE) -> dict: '''return a copy of the json object with defaults filled in according to the schema for all the missing properties''' copy_of_json_object = deepcopy(json_object) @@ -118,7 +121,7 @@ def add_defaults_to_json_object_for_schema(json_object: dict, schema: str) -> di copy_of_json_object['$schema'] = schema['$id'] # resolve $refs to fill in defaults for those, too - schema = resolved_refs(schema) + schema = resolved_refs(schema, cache=cache, max_cache_age=max_cache_age) # run validator, which populates the properties with defaults. 
get_validator_for_schema(schema, add_defaults=True).validate(copy_of_json_object) @@ -152,16 +155,23 @@ def replace_host_in_urls(schema, new_base_url: str, keys=['$id', '$ref', '$schem return schema -def get_referenced_subschema(ref_url, cache: dict=None): +def get_referenced_subschema(ref_url, cache: dict=None, max_cache_age: timedelta=DEFAULT_MAX_SCHEMA_CACHE_AGE): '''fetch the schema given by the ref_url, and get the sub-schema given by the #/ path in the ref_url''' # deduct referred schema name and version from ref-value head, anchor, tail = ref_url.partition('#') if isinstance(cache, dict) and head in cache: - referenced_schema = cache[head] + # use cached value + referenced_schema, last_update_timestamp = cache[head] + + # refresh cache if outdated + if datetime.utcnow() - last_update_timestamp > max_cache_age: + referenced_schema = json.loads(requests.get(ref_url).text) + cache[head] = referenced_schema, datetime.utcnow() else: + # fetch url, and store in cache referenced_schema = json.loads(requests.get(ref_url).text) if isinstance(cache, dict): - cache[head] = referenced_schema + cache[head] = referenced_schema, datetime.utcnow() # extract sub-schema tail = tail.strip('/') @@ -173,7 +183,7 @@ def get_referenced_subschema(ref_url, cache: dict=None): return referenced_schema -def resolved_refs(schema, cache: dict=None): +def resolved_refs(schema, cache: dict=None, max_cache_age: timedelta=DEFAULT_MAX_SCHEMA_CACHE_AGE): '''return the given schema with all $ref fields replaced by the referred json (sub)schema that they point to.''' if cache is None: cache = {} @@ -183,7 +193,7 @@ def resolved_refs(schema, cache: dict=None): keys = list(schema.keys()) if "$ref" in keys and isinstance(schema['$ref'], str) and schema['$ref'].startswith('http'): keys.remove("$ref") - referenced_subschema = get_referenced_subschema(schema['$ref'], cache) + referenced_subschema = get_referenced_subschema(schema['$ref'], cache=cache, max_cache_age=max_cache_age) updated_schema = 
resolved_refs(referenced_subschema, cache) for key in keys: diff --git a/SAS/TMSS/backend/src/tmss/tmssapp/models/common.py b/SAS/TMSS/backend/src/tmss/tmssapp/models/common.py index 9631cfc2fc3..c8553fbb288 100644 --- a/SAS/TMSS/backend/src/tmss/tmssapp/models/common.py +++ b/SAS/TMSS/backend/src/tmss/tmssapp/models/common.py @@ -13,6 +13,7 @@ from lofar.sas.tmss.tmss.exceptions import SchemaValidationException from django.urls import reverse as reverse_url import json import jsonschema +from datetime import timedelta # abstract models @@ -162,36 +163,41 @@ class Tags(Model): description = CharField(max_length=255) -# methods - -def annotate_validate_add_defaults_to_doc_using_template(model: Model, document_attr:str, template_attr:str) -> None: - ''' - annotate, validate and add defaults to the JSON document in the model instance using the schema of the given template. - ''' - try: - # fetch the actual JSON document and template-model-instance - document = getattr(model, document_attr) - template = getattr(model, template_attr) - - if document is not None and template is not None: - try: - if isinstance(document, str): - document = json.loads(document) - - # always annotate the json data document with a $schema URI to the schema that it is based on. 
- # this enables all users using this document (inside or outside of TMSS) to do their own validation and usage of editors which use the schema as UI template - document['$schema'] = template.schema['$id'] - except (KeyError, TypeError, AttributeError) as e: - raise SchemaValidationException("Cannot set $schema in json_doc to the schema's $id.\nError: %s \njson_doc: %s\nschema: %s" % (str(e), document, template.schema)) - - # add defaults for missing properies, and validate on the fly - document = add_defaults_to_json_object_for_schema(document, template.schema) - - # update the model instance with the updated and validated document - setattr(model, document_attr, document) - except AttributeError: - pass - except json.JSONDecodeError as e: - raise SchemaValidationException("Invalid JSON.\nError: %s \ndata: %s" % (str(e), document)) - except jsonschema.ValidationError as e: - raise SchemaValidationException(str(e)) \ No newline at end of file +class TemplateSchemaMixin(): + '''The TemplateSchemaMixin class can be mixed in to models which validate and add defaults to json documents given a json-schema. + It uses a class-level cache (_schema_cache) with a max age (_MAX_SCHEMA_CACHE_AGE) to minimize the number of requests to schemas, subschemas or referenced (sub)schemas.''' + _schema_cache = {} + _MAX_SCHEMA_CACHE_AGE = timedelta(minutes=1) + + def annotate_validate_add_defaults_to_doc_using_template(self, document_attr:str, template_attr:str) -> None: + ''' + annotate, validate and add defaults to the JSON document in the model instance using the schema of the given template. + ''' + try: + # fetch the actual JSON document and template-model-instance + document = getattr(self, document_attr) + template = getattr(self, template_attr) + + if document is not None and template is not None: + try: + if isinstance(document, str): + document = json.loads(document) + + # always annotate the json data document with a $schema URI to the schema that it is based on.
+ # this enables all users using this document (inside or outside of TMSS) to do their own validation and usage of editors which use the schema as UI template + document['$schema'] = template.schema['$id'] + except (KeyError, TypeError, AttributeError) as e: + raise SchemaValidationException("Cannot set $schema in json_doc to the schema's $id.\nError: %s \njson_doc: %s\nschema: %s" % (str(e), document, template.schema)) + + # add defaults for missing properties, and validate on the fly + # pass the class's _schema_cache and _MAX_SCHEMA_CACHE_AGE explicitly, so the max age configured on the class is the one that is applied + document = add_defaults_to_json_object_for_schema(document, template.schema, cache=self._schema_cache, max_cache_age=self._MAX_SCHEMA_CACHE_AGE) + + # update the model instance with the updated and validated document + setattr(self, document_attr, document) + except AttributeError: + pass + except json.JSONDecodeError as e: + raise SchemaValidationException("Invalid JSON.\nError: %s \ndata: %s" % (str(e), document)) + except jsonschema.ValidationError as e: + raise SchemaValidationException(str(e)) \ No newline at end of file diff --git a/SAS/TMSS/backend/src/tmss/tmssapp/models/scheduling.py b/SAS/TMSS/backend/src/tmss/tmssapp/models/scheduling.py index d5a0964b579..b1bb4fa91ff 100644 --- a/SAS/TMSS/backend/src/tmss/tmssapp/models/scheduling.py +++ b/SAS/TMSS/backend/src/tmss/tmssapp/models/scheduling.py @@ -12,7 +12,7 @@ from django.db.models import Model, ForeignKey, OneToOneField, CharField, DateTi ManyToManyField, CASCADE, SET_NULL, PROTECT, QuerySet, BigAutoField, UniqueConstraint from django.contrib.postgres.fields import ArrayField, JSONField from django.contrib.auth.models import User -from .common import AbstractChoice, BasicCommon, Template, NamedCommon, annotate_validate_add_defaults_to_doc_using_template +from .common import AbstractChoice, BasicCommon, Template, NamedCommon, TemplateSchemaMixin from enum import Enum from django.db.models.expressions import RawSQL from django.core.exceptions import ValidationError @@ -137,7 +137,7 @@ class SIPidentifier(Model): # # Instance Objects #
-class Subtask(BasicCommon): +class Subtask(BasicCommon, TemplateSchemaMixin): """ Represents a low-level task, which is an atomic unit of execution, such as running an observation, running inspection plots on the observed data, etc. Each task has a specific configuration, will have resources allocated @@ -274,7 +274,7 @@ class Subtask(BasicCommon): def save(self, force_insert=False, force_update=False, using=None, update_fields=None): creating = self._state.adding # True on create, False on update - annotate_validate_add_defaults_to_doc_using_template(self, 'specifications_doc', 'specifications_template') + self.annotate_validate_add_defaults_to_doc_using_template('specifications_doc', 'specifications_template') SIPidentifier.assign_new_id_to_model(self) # check for uniqueness of SAP names: @@ -325,7 +325,7 @@ class SubtaskStateLog(BasicCommon): new_state = ForeignKey('SubtaskState', null=False, editable=False, on_delete=PROTECT, related_name='is_new_state_of', help_text='Subtask state after update (see Subtask State Machine).') -class SubtaskInput(BasicCommon): +class SubtaskInput(BasicCommon, TemplateSchemaMixin): subtask = ForeignKey('Subtask', null=False, on_delete=CASCADE, related_name='inputs', help_text='Subtask to which this input specification refers.') task_relation_blueprint = ForeignKey('TaskRelationBlueprint', null=True, on_delete=SET_NULL, help_text='Task Relation Blueprint which this Subtask Input implements (NULLable).') producer = ForeignKey('SubtaskOutput', on_delete=PROTECT, related_name='consumers', help_text='The SubtaskOutput producing the input dataproducts for this SubtaskInput.') @@ -334,7 +334,7 @@ class SubtaskInput(BasicCommon): selection_template = ForeignKey('TaskRelationSelectionTemplate', on_delete=PROTECT, help_text='Schema used for selection_doc.') def save(self, force_insert=False, force_update=False, using=None, update_fields=None): - annotate_validate_add_defaults_to_doc_using_template(self, 'selection_doc', 
'selection_template') + self.annotate_validate_add_defaults_to_doc_using_template('selection_doc', 'selection_template') super().save(force_insert, force_update, using, update_fields) @@ -342,19 +342,19 @@ class SubtaskOutput(BasicCommon): subtask = ForeignKey('Subtask', null=False, on_delete=CASCADE, related_name='outputs', help_text='Subtask to which this output specification refers.') -class SAP(BasicCommon): +class SAP(BasicCommon, TemplateSchemaMixin): specifications_doc = JSONField(help_text='SAP properties.') specifications_template = ForeignKey('SAPTemplate', null=False, on_delete=CASCADE, help_text='Schema used for specifications_doc.') global_identifier = OneToOneField('SIPidentifier', null=False, editable=False, on_delete=PROTECT, help_text='The global unique identifier for LTA SIP.') def save(self, force_insert=False, force_update=False, using=None, update_fields=None): - annotate_validate_add_defaults_to_doc_using_template(self, 'specifications_doc', 'specifications_template') + self.annotate_validate_add_defaults_to_doc_using_template('specifications_doc', 'specifications_template') SIPidentifier.assign_new_id_to_model(self) super().save(force_insert, force_update, using, update_fields) -class Dataproduct(BasicCommon): +class Dataproduct(BasicCommon, TemplateSchemaMixin): """ A data product represents an atomic dataset, produced and consumed by subtasks. The consumed dataproducts are those resulting from interpreting the Subtask Connector filters of the inputs. 
These links are explicitly saved, should @@ -381,8 +381,8 @@ class Dataproduct(BasicCommon): constraints = [UniqueConstraint(fields=['directory', 'filename'], name='%(class)s_unique_path')] def save(self, force_insert=False, force_update=False, using=None, update_fields=None): - annotate_validate_add_defaults_to_doc_using_template(self, 'specifications_doc', 'specifications_template') - annotate_validate_add_defaults_to_doc_using_template(self, 'feedback_doc', 'feedback_template') + self.annotate_validate_add_defaults_to_doc_using_template('specifications_doc', 'specifications_template') + self.annotate_validate_add_defaults_to_doc_using_template('feedback_doc', 'feedback_template') SIPidentifier.assign_new_id_to_model(self) super().save(force_insert, force_update, using, update_fields) diff --git a/SAS/TMSS/backend/src/tmss/tmssapp/models/specification.py b/SAS/TMSS/backend/src/tmss/tmssapp/models/specification.py index 38eaadcb154..2e02333b423 100644 --- a/SAS/TMSS/backend/src/tmss/tmssapp/models/specification.py +++ b/SAS/TMSS/backend/src/tmss/tmssapp/models/specification.py @@ -10,7 +10,7 @@ from django.contrib.postgres.fields import JSONField from enum import Enum from django.db.models.expressions import RawSQL from django.db.models.deletion import ProtectedError -from .common import AbstractChoice, BasicCommon, Template, NamedCommon, annotate_validate_add_defaults_to_doc_using_template, NamedCommonPK +from .common import AbstractChoice, BasicCommon, Template, NamedCommon, TemplateSchemaMixin, NamedCommonPK from lofar.common.json_utils import validate_json_against_schema, validate_json_against_its_schema, add_defaults_to_json_object_for_schema from lofar.sas.tmss.tmss.exceptions import * from django.core.exceptions import ValidationError @@ -311,18 +311,18 @@ class ResourceType(NamedCommonPK): quantity = ForeignKey('Quantity', null=False, on_delete=PROTECT, help_text='The quantity of this resource type.') -class SchedulingSet(NamedCommon): +class 
SchedulingSet(NamedCommon, TemplateSchemaMixin): generator_doc = JSONField(null=True, help_text='Parameters for the generator (NULLable).') generator_template = ForeignKey('GeneratorTemplate', on_delete=SET_NULL, null=True, help_text='Generator for the scheduling units in this set (NULLable).') generator_source = ForeignKey('SchedulingUnitDraft', on_delete=SET_NULL, null=True, help_text='Reference for the generator to an existing collection of specifications (NULLable).') project = ForeignKey('Project', related_name="scheduling_sets", on_delete=PROTECT, help_text='Project to which this scheduling set belongs.') # protected to avoid accidents def save(self, force_insert=False, force_update=False, using=None, update_fields=None): - annotate_validate_add_defaults_to_doc_using_template(self, 'generator_doc', 'generator_template') + self.annotate_validate_add_defaults_to_doc_using_template('generator_doc', 'generator_template') super().save(force_insert, force_update, using, update_fields) -class SchedulingUnitDraft(NamedCommon): +class SchedulingUnitDraft(NamedCommon, TemplateSchemaMixin): requirements_doc = JSONField(help_text='Scheduling and/or quality requirements for this run.') copies = ForeignKey('SchedulingUnitDraft', related_name="copied_from", on_delete=SET_NULL, null=True, help_text='Source reference, if we are a copy (NULLable).') copy_reason = ForeignKey('CopyReason', null=True, on_delete=PROTECT, help_text='Reason why source was copied (NULLable).') @@ -349,8 +349,8 @@ class SchedulingUnitDraft(NamedCommon): #When auto_ingest=False, the scheduling units will be created with ingest_permission_required = True self.ingest_permission_required=True - annotate_validate_add_defaults_to_doc_using_template(self, 'requirements_doc', 'requirements_template') - annotate_validate_add_defaults_to_doc_using_template(self, 'scheduling_constraints_doc', 'scheduling_constraints_template') + self.annotate_validate_add_defaults_to_doc_using_template('requirements_doc', 
'requirements_template') + self.annotate_validate_add_defaults_to_doc_using_template('scheduling_constraints_doc', 'scheduling_constraints_template') super().save(force_insert, force_update, using, update_fields) @cached_property @@ -386,7 +386,7 @@ class SchedulingUnitDraft(NamedCommon): return self.scheduling_set.project -class SchedulingUnitBlueprint(NamedCommon): +class SchedulingUnitBlueprint(NamedCommon, TemplateSchemaMixin): class Status(Enum): DEFINED = "defined" FINISHED = "finished" @@ -411,7 +411,7 @@ class SchedulingUnitBlueprint(NamedCommon): results_accepted = BooleanField(default=False, help_text='boolean (default NULL), which records whether the results were accepted, allowing the higher-level accounting to be adjusted.') def save(self, force_insert=False, force_update=False, using=None, update_fields=None): - annotate_validate_add_defaults_to_doc_using_template(self, 'requirements_doc', 'requirements_template') + self.annotate_validate_add_defaults_to_doc_using_template('requirements_doc', 'requirements_template') # This code only happens if the objects is not in the database yet. 
self._state.adding is True creating if self._state.adding and hasattr(self, 'draft') and self.draft.scheduling_set.project.auto_ingest is False: @@ -606,7 +606,7 @@ class ProjectPropertyMixin(): return obj -class TaskDraft(NamedCommon, ProjectPropertyMixin): +class TaskDraft(NamedCommon, ProjectPropertyMixin, TemplateSchemaMixin): specifications_doc = JSONField(help_text='Specifications for this task.') copies = ForeignKey('TaskDraft', related_name="copied_from", on_delete=SET_NULL, null=True, help_text='Source reference, if we are a copy (NULLable).') copy_reason = ForeignKey('CopyReason', on_delete=PROTECT, null=True, help_text='Reason why source was copied (NULLable).') @@ -620,7 +620,7 @@ class TaskDraft(NamedCommon, ProjectPropertyMixin): constraints = [UniqueConstraint(fields=['name', 'scheduling_unit_draft'], name='TaskDraft_unique_name_in_scheduling_unit')] def save(self, force_insert=False, force_update=False, using=None, update_fields=None): - annotate_validate_add_defaults_to_doc_using_template(self, 'specifications_doc', 'specifications_template') + self.annotate_validate_add_defaults_to_doc_using_template('specifications_doc', 'specifications_template') if self._state.adding: # True on create, False on update, needs to be checked before super().save() super().save(force_insert, force_update, using, update_fields) if self.scheduling_unit_draft.scheduling_set.project.auto_pin: @@ -737,7 +737,7 @@ class TaskDraft(NamedCommon, ProjectPropertyMixin): # return None -class TaskBlueprint(NamedCommon): +class TaskBlueprint(NamedCommon, TemplateSchemaMixin): specifications_doc = JSONField(help_text='Schedulings for this task (IMMUTABLE).') do_cancel = BooleanField(help_text='Cancel this task.') @@ -751,7 +751,7 @@ class TaskBlueprint(NamedCommon): constraints = [UniqueConstraint(fields=['name', 'scheduling_unit_blueprint'], name='TaskBlueprint_unique_name_in_scheduling_unit')] def save(self, force_insert=False, force_update=False, using=None, 
update_fields=None): - annotate_validate_add_defaults_to_doc_using_template(self, 'specifications_doc', 'specifications_template') + self.annotate_validate_add_defaults_to_doc_using_template('specifications_doc', 'specifications_template') super().save(force_insert, force_update, using, update_fields) @cached_property @@ -885,7 +885,7 @@ class TaskBlueprint(NamedCommon): return "schedulable" -class TaskRelationDraft(BasicCommon): +class TaskRelationDraft(BasicCommon, TemplateSchemaMixin): selection_doc = JSONField(help_text='Filter for selecting dataproducts from the output role.') selection_template = ForeignKey('TaskRelationSelectionTemplate', on_delete=CASCADE, help_text='Schema used for selection_doc.') dataformat = ForeignKey('Dataformat', null=False, on_delete=PROTECT, help_text='Selected data format to use. One of (MS, HDF5).') @@ -903,11 +903,11 @@ class TaskRelationDraft(BasicCommon): constraints = [UniqueConstraint(fields=['producer', 'consumer', 'input_role', 'output_role'], name='TaskRelationDraft_unique_relation')] def save(self, force_insert=False, force_update=False, using=None, update_fields=None): - annotate_validate_add_defaults_to_doc_using_template(self, 'selection_doc', 'selection_template') + self.annotate_validate_add_defaults_to_doc_using_template('selection_doc', 'selection_template') super().save(force_insert, force_update, using, update_fields) -class TaskRelationBlueprint(BasicCommon): +class TaskRelationBlueprint(BasicCommon, TemplateSchemaMixin): selection_doc = JSONField(help_text='Filter for selecting dataproducts from the output role.') dataformat = ForeignKey('Dataformat', null=False, on_delete=PROTECT, help_text='Selected data format to use.') @@ -926,7 +926,7 @@ class TaskRelationBlueprint(BasicCommon): constraints = [UniqueConstraint(fields=['producer', 'consumer', 'input_role', 'output_role'], name='TaskRelationBlueprint_unique_relation')] def save(self, force_insert=False, force_update=False, using=None, update_fields=None): - 
annotate_validate_add_defaults_to_doc_using_template(self, 'selection_doc', 'selection_template') + self.annotate_validate_add_defaults_to_doc_using_template('selection_doc', 'selection_template') super().save(force_insert, force_update, using, update_fields) @@ -966,7 +966,7 @@ class TaskSchedulingRelationDraft(BasicCommon): super().save(force_insert, force_update, using, update_fields) -class Reservation(NamedCommon): +class Reservation(NamedCommon, TemplateSchemaMixin): project = ForeignKey('Project', null=True, related_name='reservations', on_delete=CASCADE, help_text='Reservation will be accounted for this project.') description = CharField(max_length=255, help_text='Short description for this reservation, used in overviews') start_time = DateTimeField(help_text='Start of this reservation.') @@ -982,5 +982,5 @@ class Reservation(NamedCommon): return None def save(self, force_insert=False, force_update=False, using=None, update_fields=None): - annotate_validate_add_defaults_to_doc_using_template(self, 'specifications_doc', 'specifications_template') + self.annotate_validate_add_defaults_to_doc_using_template('specifications_doc', 'specifications_template') super().save(force_insert, force_update, using, update_fields) -- GitLab