From 2b9a8ddb7cc78bbde322e369afbf73e08d38bbc3 Mon Sep 17 00:00:00 2001 From: Vermaas <vermaas@astron.nl> Date: Mon, 3 Feb 2025 16:04:02 +0100 Subject: [PATCH 1/6] SDC-1663 add input validation --- atdb/taskdatabase/models.py | 5 ++ atdb/taskdatabase/services/common.py | 1 + .../specification/input_validation.py | 69 +++++++++++++++++++ .../templates/taskdatabase/index.html | 2 +- .../tests/test_input_validation.py | 69 +++++++++++++++++++ 5 files changed, 145 insertions(+), 1 deletion(-) create mode 100644 atdb/taskdatabase/services/specification/input_validation.py create mode 100644 atdb/taskdatabase/tests/test_input_validation.py diff --git a/atdb/taskdatabase/models.py b/atdb/taskdatabase/models.py index e3422b21..4e661494 100644 --- a/atdb/taskdatabase/models.py +++ b/atdb/taskdatabase/models.py @@ -9,6 +9,7 @@ import logging from .services import calculated_qualities as qualities from .services.common import State, AggregationStrategy, ACTIVITY_RESET_STATUSSEN +from .services.specification.input_validation import validate_inputs logger = logging.getLogger(__name__) @@ -271,6 +272,10 @@ class Task(models.Model): # make sure that every task has an activity (also for backward compatibility) associate_task_with_activity(self) + # when a task comes in as DEFINING some validation needs to be done. + if (self.status != State.DEFINING.value) & (self.new_status == State.DEFINING.value): + validate_inputs(self) + # when a task goes to PROCESSED... handle the (potential) aggregation functionality if (self.status != State.PROCESSED.value) & (self.new_status == State.PROCESSED.value): self.handle_aggregation() diff --git a/atdb/taskdatabase/services/common.py b/atdb/taskdatabase/services/common.py index a9b60ce7..835dfc17 100644 --- a/atdb/taskdatabase/services/common.py +++ b/atdb/taskdatabase/services/common.py @@ -9,6 +9,7 @@ from enum import Enum logger = logging.getLogger(__name__) class State(Enum): UNKNOWN = "unknown" + DEFINING = "defining" DEFINED = "defined" STAGED = "staged" FETCHED = "fetched" diff --git a/atdb/taskdatabase/services/specification/input_validation.py b/atdb/taskdatabase/services/specification/input_validation.py new file mode 100644 index 00000000..1cc13fce --- /dev/null +++ b/atdb/taskdatabase/services/specification/input_validation.py @@ -0,0 +1,69 @@ +import logging; +logger = logging.getLogger(__name__) + +def remove_duplicates(data): + """ + task.inputs can contain double entries (caused by faulty ingests during the 'mom era'). Remove them. + The following is an example of an inputs section with a duplicate 'surl', the duplicate will be removed + [ + { + "size": 33638400, + "surl": "srm://srm.grid.sara.nl:8443/pnfs/grid.sara.nl/data/lofar/ops/projects/lc5_020/432340/L432340_SB000_uv.dppp.MS_caf35c3d.tar", + "type": "File", + "location": "srm.grid.sara.nl" + }, + { + "size": 33638400, + "surl": "srm://srm.grid.sara.nl:8443/pnfs/grid.sara.nl/data/lofar/ops/projects/lc5_020/432340/L432340_SB001_uv.dppp.MS_74948c4c.tar", + "type": "File", + "location": "srm.grid.sara.nl" + }, + { + "size": 33638400, + "surl": "srm://srm.grid.sara.nl:8443/pnfs/grid.sara.nl/data/lofar/ops/projects/lc5_020/432340/L432340_SB000_uv.dppp.MS_caf35c3d.tar", + "type": "File", + "location": "srm.grid.sara.nl" + }] + + """ + logger.info(f'check_for_duplicates') + + recalculate = False + seen_surls = set() + unique_inputs = [] + + for item in data: + if item["surl"] not in seen_surls: + unique_inputs.append(item) + seen_surls.add(item["surl"]) + else: + recalculate = True + logger.info(f'duplicate found: {item["surl"]}') + + return recalculate,unique_inputs + + +def recalculate_size(data): + """ + Operators or ATDB could have removed files from the inputs json blob. + If that has happened, the task.size_to_process needs to be recalculated + """ + logger.info(f'recalculate_sums') + new_size = sum([e["size"] for e in data]) + return new_size + +def validate_inputs(task): + """ + check if the task.inputs is valid + - currently only duplicates are checked, but this functionality can be expanded + """ + inputs = task.inputs + + # remove duplicates + recalculate,unique_inputs = remove_duplicates(inputs) + + # if a duplicate was found, recalculate the size_to_process and update the inputs list of the task + if recalculate: + task.inputs = unique_inputs + recalculate_size(unique_inputs) + task.size_to_process = recalculate_size(unique_inputs) \ No newline at end of file diff --git a/atdb/taskdatabase/templates/taskdatabase/index.html b/atdb/taskdatabase/templates/taskdatabase/index.html index 370d1dbb..080cda67 100644 --- a/atdb/taskdatabase/templates/taskdatabase/index.html +++ b/atdb/taskdatabase/templates/taskdatabase/index.html @@ -31,7 +31,7 @@ {% include 'taskdatabase/pagination.html' %} </div> </div> - <p class="footer"> Version 30 Jan 2025</p> + <p class="footer"> Version 3 Feb 2025</p> </div> {% include 'taskdatabase/refresh.html' %} diff --git a/atdb/taskdatabase/tests/test_input_validation.py b/atdb/taskdatabase/tests/test_input_validation.py new file mode 100644 index 00000000..f847fc72 --- /dev/null +++ b/atdb/taskdatabase/tests/test_input_validation.py @@ -0,0 +1,69 @@ +from django.test import TestCase +from django.utils import timezone +from datetime import datetime +from taskdatabase.models import LogEntry, Task, Workflow, Job, Configuration +from taskdatabase.services.specification import input_validation +from unittest.mock import Mock, MagicMock, patch + +class TestInputValidation(TestCase): + + def setUp(self): + + # used to test the get_size calculation, this uses a database + self.workflow = Workflow(id=22, workflow_uri="psrfits_requantisation") + self.workflow.save() + + self.inputs_in = [ + { + "size": 100, + "surl": "srm://srm.grid.sara.nl:8443/pnfs/grid.sara.nl/data/lofar/ops/projects/lc5_020/432340/L432340_SB000_uv.dppp.MS_caf35c3d.tar", + "type": "File", + "location": "srm.grid.sara.nl" + }, + { + "size": 200, + "surl": "srm://srm.grid.sara.nl:8443/pnfs/grid.sara.nl/data/lofar/ops/projects/lc5_020/432340/L432340_SB001_uv.dppp.MS_74948c4c.tar", + "type": "File", + "location": "srm.grid.sara.nl" + }, + { + "size": 300, + "surl": "srm://srm.grid.sara.nl:8443/pnfs/grid.sara.nl/data/lofar/ops/projects/lc5_020/432340/L432340_SB000_uv.dppp.MS_caf35c3d.tar", + "type": "File", + "location": "srm.grid.sara.nl" + }] + + + self.task = Task.objects.create(sas_id='5432', status='defining', new_status='defining', + size_to_process = 100, workflow=self.workflow, inputs = self.inputs_in) + self.task.save() + + def test_validate_inputs(self): + + # arrange + expected_size = 300 + expected_inputs = [ + { + "size": 100, + "surl": "srm://srm.grid.sara.nl:8443/pnfs/grid.sara.nl/data/lofar/ops/projects/lc5_020/432340/L432340_SB000_uv.dppp.MS_caf35c3d.tar", + "type": "File", + "location": "srm.grid.sara.nl" + }, + { + "size": 200, + "surl": "srm://srm.grid.sara.nl:8443/pnfs/grid.sara.nl/data/lofar/ops/projects/lc5_020/432340/L432340_SB001_uv.dppp.MS_74948c4c.tar", + "type": "File", + "location": "srm.grid.sara.nl" + }] + + # act + input_validation.validate_inputs(self.task) + inputs = self.task.inputs + size = self.task.size_to_process + + # assert + self.assertEqual(inputs, expected_inputs) + self.assertEqual(size, expected_size) + + + -- GitLab From 6cdafa1198c9d514deeafada9a67a3a31a4d17e0 Mon Sep 17 00:00:00 2001 From: Vermaas <vermaas@astron.nl> Date: Mon, 3 Feb 2025 16:32:34 +0100 Subject: [PATCH 2/6] SDC-1663 add input validation --- atdb/taskdatabase/models.py | 4 +- .../tests/test_input_validation.py | 69 +++++++++++++++---- 2 files changed, 56 insertions(+), 17 deletions(-) diff --git a/atdb/taskdatabase/models.py b/atdb/taskdatabase/models.py index 4e661494..0ed582db 100644 --- a/atdb/taskdatabase/models.py +++ b/atdb/taskdatabase/models.py @@ -272,8 +272,8 @@ class Task(models.Model): # make sure that every task has an activity (also for backward compatibility) associate_task_with_activity(self) - # when a task comes in as DEFINING some validation needs to be done. - if (self.status != State.DEFINING.value) & (self.new_status == State.DEFINING.value): + # when a task becomes DEFINING or DEFINED some validation needs to be done. + if self.new_status in [State.DEFINING.value, State.DEFINED.value]: validate_inputs(self) # when a task goes to PROCESSED... handle the (potential) aggregation functionality diff --git a/atdb/taskdatabase/tests/test_input_validation.py b/atdb/taskdatabase/tests/test_input_validation.py index f847fc72..21e7f080 100644 --- a/atdb/taskdatabase/tests/test_input_validation.py +++ b/atdb/taskdatabase/tests/test_input_validation.py @@ -13,7 +13,7 @@ class TestInputValidation(TestCase): self.workflow = Workflow(id=22, workflow_uri="psrfits_requantisation") self.workflow.save() - self.inputs_in = [ + self.inputs_with_duplicate = [ { "size": 100, "surl": "srm://srm.grid.sara.nl:8443/pnfs/grid.sara.nl/data/lofar/ops/projects/lc5_020/432340/L432340_SB000_uv.dppp.MS_caf35c3d.tar", @@ -33,16 +33,7 @@ class TestInputValidation(TestCase): "location": "srm.grid.sara.nl" }] - - self.task = Task.objects.create(sas_id='5432', status='defining', new_status='defining', - size_to_process = 100, workflow=self.workflow, inputs = self.inputs_in) - self.task.save() - - def test_validate_inputs(self): - - # arrange - expected_size = 300 - expected_inputs = [ + self.inputs_validated = [ { "size": 100, "surl": "srm://srm.grid.sara.nl:8443/pnfs/grid.sara.nl/data/lofar/ops/projects/lc5_020/432340/L432340_SB000_uv.dppp.MS_caf35c3d.tar", @@ -56,14 +47,62 @@ class TestInputValidation(TestCase): "location": "srm.grid.sara.nl" }] + self.task = Task.objects.create(sas_id='5432', status='defining', new_status='defining', + size_to_process = 100, workflow=self.workflow, inputs = self.inputs_with_duplicate) + self.task.save() + + def test_validate_inputs(self): + """ + run the validates_inputs function with a task.inputs that contains a duplicate. + The duplicate should be removed, and the size recalculated + """ + + # arrange + expected_size = 300 + expected_inputs = self.inputs_validated + # act input_validation.validate_inputs(self.task) - inputs = self.task.inputs - size = self.task.size_to_process # assert - self.assertEqual(inputs, expected_inputs) - self.assertEqual(size, expected_size) + self.assertEqual(self.task.inputs, expected_inputs) + self.assertEqual(self.task.size_to_process, expected_size) + def test_trigger_on_defined(self): + """ + test that the functionality (also) triggers on 'defined' + """ + # arrange + expected_size = 300 + expected_inputs = self.inputs_validated + + # set to 'wrong' inputs containing a duplicate + self.task.inputs = self.inputs_with_duplicate + + # act + self.task.new_status="defined" + self.task.save() + + # assert + self.assertEqual(self.task.inputs, expected_inputs) + self.assertEqual(self.task.size_to_process, expected_size) + + def test_no_trigger_on_staged(self): + """ + test that the functionality does not trigger on other statusses, like 'staged' + """ + + # arrange + expected_inputs = self.inputs_with_duplicate + + # set to 'wrong' inputs containing a duplicate + self.task.inputs = self.inputs_with_duplicate + + # act + self.task.new_status="staged" + self.task.save() + + # assert + self.assertEqual(self.task.inputs, expected_inputs) -- GitLab From 96bff0fd5d7811d0f91211c5fedb86a68dda5c5b Mon Sep 17 00:00:00 2001 From: Vermaas <vermaas@astron.nl> Date: Mon, 3 Feb 2025 16:48:40 +0100 Subject: [PATCH 3/6] SDC-1663 add input validation --- atdb/taskdatabase/services/specification/input_validation.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/atdb/taskdatabase/services/specification/input_validation.py b/atdb/taskdatabase/services/specification/input_validation.py index 1cc13fce..9ed544bc 100644 --- a/atdb/taskdatabase/services/specification/input_validation.py +++ b/atdb/taskdatabase/services/specification/input_validation.py @@ -28,6 +28,10 @@ def remove_duplicates(data): """ logger.info(f'check_for_duplicates') + if not data: + # tasks without inputs should just return + return False,None + recalculate = False seen_surls = set() unique_inputs = [] -- GitLab From 4647b2e66a80c1dccdcdde7c7c9cf383242e9d62 Mon Sep 17 00:00:00 2001 From: Vermaas <vermaas@astron.nl> Date: Tue, 4 Feb 2025 16:23:44 +0100 Subject: [PATCH 4/6] adapted the functionality based on review --- atdb/atdb/static/taskdatabase/style.css | 2 +- atdb/taskdatabase/models.py | 2 +- atdb/taskdatabase/services/signals.py | 27 +++++++ .../specification/input_validation.py | 78 +++++++++++++++---- .../static/taskdatabase/style.css | 2 +- .../templates/taskdatabase/index.html | 2 +- .../tests/test_input_validation.py | 28 ++++--- 7 files changed, 106 insertions(+), 35 deletions(-) diff --git a/atdb/atdb/static/taskdatabase/style.css b/atdb/atdb/static/taskdatabase/style.css index 258fea6f..b7bc0ad4 100644 --- a/atdb/atdb/static/taskdatabase/style.css +++ b/atdb/atdb/static/taskdatabase/style.css @@ -27,7 +27,7 @@ TD { background-color: lightgreen; } -.aggregate_failed,.aggregating_failed { +.aggregate_failed,.aggregating_failed,.defining_failed,.defined_failed { color: red; font-weight: bold; } diff --git a/atdb/taskdatabase/models.py b/atdb/taskdatabase/models.py index 0ed582db..293a41e1 100644 --- a/atdb/taskdatabase/models.py +++ b/atdb/taskdatabase/models.py @@ -9,7 +9,7 @@ import logging from .services import calculated_qualities as qualities from .services.common import State, AggregationStrategy, ACTIVITY_RESET_STATUSSEN -from .services.specification.input_validation import validate_inputs +from .services.specification.input_validation import validate_inputs,recalculate_size logger = logging.getLogger(__name__) diff --git a/atdb/taskdatabase/services/signals.py b/atdb/taskdatabase/services/signals.py index 2442e3d7..9a81796f 100644 --- a/atdb/taskdatabase/services/signals.py +++ b/atdb/taskdatabase/services/signals.py @@ -5,15 +5,32 @@ from django.core.signals import request_started, request_finished from django.contrib.auth.models import User from django.dispatch import receiver from django.contrib.contenttypes.models import ContentType +from django.utils import timezone from taskdatabase.models import Task, Workflow, LogEntry, Status from .activities_handler import update_activity + """ Signals sent from different parts of the backend are centrally defined and handled here. """ logger = logging.getLogger(__name__) +def add_logentry(task, step, status, description, service='ATDB'): + """ + add log entry + usually this functionality is called by external services directly using the REST API to report to ATDB + but there are also 'internal services', like input_validation, that can report to the user this way. + """ + logentry = LogEntry( + task=task, + service=service, + step_name=step, + status=status, + description=description, + timestamp=timezone.now()) + logentry.save() + #--- Task signals------------- @@ -67,6 +84,16 @@ def handle_post_save(sender, **kwargs): update_activity(task) + if task.environment: + step = task.environment.split('::')[0] + description = task.environment.split('::')[1] + add_logentry(task,step,task.status,description) + + # clear the 'cache' + task.environment='' + disconnect_signals() + task.save() + connect_signals() def connect_signals(): #logger.info("connect_signals") diff --git a/atdb/taskdatabase/services/specification/input_validation.py b/atdb/taskdatabase/services/specification/input_validation.py index 9ed544bc..8000ad8b 100644 --- a/atdb/taskdatabase/services/specification/input_validation.py +++ b/atdb/taskdatabase/services/specification/input_validation.py @@ -1,10 +1,41 @@ import logging; +import os.path + logger = logging.getLogger(__name__) -def remove_duplicates(data): +# copied from ldv_ingest module +# https://git.astron.nl/astron-sdc/ldv-services/-/blob/master/atdb_services_pip/atdb_services/ldv_ingest.py?ref_type=heads#L1721 +def get_filename_without_hash(file_name): + """ + Remove magic number (hash) between latest '_' and '.' + The hash should be 8 characters long otherwise the original file_name will be returned + e.g. LOFAR_PULSAR_ARCHIVE_locus004_L231550_red_c122bb36.tar will result in + LOFAR_PULSAR_ARCHIVE_locus004_L231550_red.tar + :param file_name: + :return file_name_without_hash: + """ + file_name_without_hash = file_name + if "_" and "." in file_name: + file_start_of_hash = file_name.rsplit("_", 1)[0] + file_end_of_hash = file_name.rsplit(".", 1)[1] + file_name_without_hash = f"{file_start_of_hash}.{file_end_of_hash}" + + # Check difference in length should be 8 char + '_' + if (len(file_name) - len(file_name_without_hash)) != 9: + logger.warning(f"No correct hash found in '{file_name_without_hash}'. Use the original filename") + file_name_without_hash = file_name + + # only return the filename without the path + filename = os.path.basename(file_name_without_hash) + return filename + +def check_duplicates(data): """ task.inputs can contain double entries (caused by faulty ingests during the 'mom era'). Remove them. The following is an example of an inputs section with a duplicate 'surl', the duplicate will be removed + + :return: found_duplicates is True when a duplicate was found + :return: [ { "size": 33638400, @@ -26,25 +57,30 @@ def remove_duplicates(data): }] """ - logger.info(f'check_for_duplicates') + logger.info(f'check_duplicates') if not data: # tasks without inputs should just return return False,None - recalculate = False + found_duplicates = False seen_surls = set() unique_inputs = [] + duplicates = [] for item in data: - if item["surl"] not in seen_surls: + + filename_without_hash = get_filename_without_hash(item["surl"]) + + if filename_without_hash not in seen_surls: unique_inputs.append(item) - seen_surls.add(item["surl"]) + seen_surls.add(filename_without_hash) else: - recalculate = True - logger.info(f'duplicate found: {item["surl"]}') + duplicates.append(filename_without_hash) + found_duplicates = True + logger.info(f'duplicate found: {filename_without_hash}') - return recalculate,unique_inputs + return found_duplicates,duplicates, unique_inputs def recalculate_size(data): @@ -52,22 +88,32 @@ def recalculate_size(data): Operators or ATDB could have removed files from the inputs json blob. If that has happened, the task.size_to_process needs to be recalculated """ - logger.info(f'recalculate_sums') + logger.info(f'recalculate_size') new_size = sum([e["size"] for e in data]) return new_size + def validate_inputs(task): """ check if the task.inputs is valid - currently only duplicates are checked, but this functionality can be expanded """ - inputs = task.inputs - # remove duplicates - recalculate,unique_inputs = remove_duplicates(inputs) + # --- check for duplicate files in task.inputs --- + found_duplicates,duplicates,unique_inputs = check_duplicates(task.inputs) # if a duplicate was found, recalculate the size_to_process and update the inputs list of the task - if recalculate: - task.inputs = unique_inputs - recalculate_size(unique_inputs) - task.size_to_process = recalculate_size(unique_inputs) \ No newline at end of file + if found_duplicates: + + # set a failed state. + new_status = task.new_status + "_failed" + task.new_status = new_status + + # add some information for the user + # (abuse the unused 'task.environment' field as a 'cache' to transport this to a 'signal' (to avoid circular imports) + task.environment = f"inputs_duplicates::{duplicates}" + + + # --- recalculate sizes --- + size_to_process = recalculate_size(task.inputs) + task.size_to_process = size_to_process \ No newline at end of file diff --git a/atdb/taskdatabase/static/taskdatabase/style.css b/atdb/taskdatabase/static/taskdatabase/style.css index 258fea6f..b7bc0ad4 100644 --- a/atdb/taskdatabase/static/taskdatabase/style.css +++ b/atdb/taskdatabase/static/taskdatabase/style.css @@ -27,7 +27,7 @@ TD { background-color: lightgreen; } -.aggregate_failed,.aggregating_failed { +.aggregate_failed,.aggregating_failed,.defining_failed,.defined_failed { color: red; font-weight: bold; } diff --git a/atdb/taskdatabase/templates/taskdatabase/index.html b/atdb/taskdatabase/templates/taskdatabase/index.html index 080cda67..5bc1d629 100644 --- a/atdb/taskdatabase/templates/taskdatabase/index.html +++ b/atdb/taskdatabase/templates/taskdatabase/index.html @@ -31,7 +31,7 @@ {% include 'taskdatabase/pagination.html' %} </div> </div> - <p class="footer"> Version 3 Feb 2025</p> + <p class="footer"> Version 4 Feb 2025</p> </div> {% include 'taskdatabase/refresh.html' %} diff --git a/atdb/taskdatabase/tests/test_input_validation.py b/atdb/taskdatabase/tests/test_input_validation.py index 21e7f080..a3e768f7 100644 --- a/atdb/taskdatabase/tests/test_input_validation.py +++ b/atdb/taskdatabase/tests/test_input_validation.py @@ -1,9 +1,8 @@ from django.test import TestCase from django.utils import timezone from datetime import datetime -from taskdatabase.models import LogEntry, Task, Workflow, Job, Configuration +from taskdatabase.models import LogEntry, Task, Workflow from taskdatabase.services.specification import input_validation -from unittest.mock import Mock, MagicMock, patch class TestInputValidation(TestCase): @@ -51,21 +50,21 @@ class TestInputValidation(TestCase): size_to_process = 100, workflow=self.workflow, inputs = self.inputs_with_duplicate) self.task.save() - def test_validate_inputs(self): + + def test_check_duplicates(self): """ - run the validates_inputs function with a task.inputs that contains a duplicate. - The duplicate should be removed, and the size recalculated + test if duplicates are detected """ - # arrange - expected_size = 300 - expected_inputs = self.inputs_validated + expected_size = 600 + expected_duplicates = ['L432340_SB000_uv.dppp.MS.tar'] # act input_validation.validate_inputs(self.task) + found_duplicates, duplicates, unique_inputs = input_validation.check_duplicates(self.inputs_with_duplicate) # assert - self.assertEqual(self.task.inputs, expected_inputs) + self.assertEqual(duplicates, expected_duplicates) self.assertEqual(self.task.size_to_process, expected_size) @@ -76,33 +75,32 @@ class TestInputValidation(TestCase): # arrange expected_size = 300 - expected_inputs = self.inputs_validated # set to 'wrong' inputs containing a duplicate - self.task.inputs = self.inputs_with_duplicate + self.task.inputs = self.inputs_validated # act self.task.new_status="defined" self.task.save() # assert - self.assertEqual(self.task.inputs, expected_inputs) self.assertEqual(self.task.size_to_process, expected_size) def test_no_trigger_on_staged(self): """ test that the functionality does not trigger on other statusses, like 'staged' + the inputs.validated has a total sum of 300, but 'staged' should not recalculate the new size. """ # arrange - expected_inputs = self.inputs_with_duplicate + expected_size = 600 # set to 'wrong' inputs containing a duplicate - self.task.inputs = self.inputs_with_duplicate + self.task.inputs = self.inputs_validated # act self.task.new_status="staged" self.task.save() # assert - self.assertEqual(self.task.inputs, expected_inputs) + self.assertEqual(self.task.size_to_process, expected_size) -- GitLab From 4320e3c7e3893bc181dc12a5d80a093bfa17a3be Mon Sep 17 00:00:00 2001 From: Vermaas <vermaas@astron.nl> Date: Tue, 4 Feb 2025 16:48:15 +0100 Subject: [PATCH 5/6] adapted the functionality based on review --- atdb/taskdatabase/services/specification/input_validation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/atdb/taskdatabase/services/specification/input_validation.py b/atdb/taskdatabase/services/specification/input_validation.py index 8000ad8b..070c5392 100644 --- a/atdb/taskdatabase/services/specification/input_validation.py +++ b/atdb/taskdatabase/services/specification/input_validation.py @@ -61,7 +61,7 @@ def check_duplicates(data): if not data: # tasks without inputs should just return - return False,None + return False,None,None found_duplicates = False seen_surls = set() @@ -80,7 +80,7 @@ def check_duplicates(data): found_duplicates = True logger.info(f'duplicate found: {filename_without_hash}') - return found_duplicates,duplicates, unique_inputs + return found_duplicates,duplicates,unique_inputs def recalculate_size(data): -- GitLab From 868b9f103edf79be2fef14a3e948c0dbea4d6ff3 Mon Sep 17 00:00:00 2001 From: Vermaas <vermaas@astron.nl> Date: Tue, 4 Feb 2025 17:05:13 +0100 Subject: [PATCH 6/6] adapted the functionality based on review --- .../services/specification/input_validation.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/atdb/taskdatabase/services/specification/input_validation.py b/atdb/taskdatabase/services/specification/input_validation.py index 070c5392..4acbff89 100644 --- a/atdb/taskdatabase/services/specification/input_validation.py +++ b/atdb/taskdatabase/services/specification/input_validation.py @@ -88,6 +88,10 @@ def recalculate_size(data): Operators or ATDB could have removed files from the inputs json blob. If that has happened, the task.size_to_process needs to be recalculated """ + if not data: + # tasks without inputs should just return + return None + logger.info(f'recalculate_size') new_size = sum([e["size"] for e in data]) return new_size @@ -116,4 +120,5 @@ def validate_inputs(task): # --- recalculate sizes --- size_to_process = recalculate_size(task.inputs) - task.size_to_process = size_to_process \ No newline at end of file + if size_to_process: + task.size_to_process = size_to_process \ No newline at end of file -- GitLab