diff --git a/graph_creation/cst_processing.py b/graph_creation/cst_processing.py index 2e99434a76ecbc890abae22329e12d1b3f4b4d9e..2989560d15ed7f1c2c127c73126e95e7c7c66151 100644 --- a/graph_creation/cst_processing.py +++ b/graph_creation/cst_processing.py @@ -4,12 +4,12 @@ from neo4j import Driver from neo4j_queries.edge_queries import create_has_child_relationship from neo4j_queries.node_queries import create_ast_node -def traverse_and_create(driver: Driver, tree, parent_node_id=None): +def traverse_and_create(driver: Driver, component_id: str, tree, parent_node_id=None): # Create a Neo4j node for the current tree node rule_name = type(tree).__name__ text = tree.getText() if tree.getText() else None - current_node_id = create_ast_node(driver, rule_name, text) + current_node_id = create_ast_node(driver, component_id, rule_name, text) # If there's a parent, create a relationship if parent_node_id is not None: @@ -18,7 +18,7 @@ def traverse_and_create(driver: Driver, tree, parent_node_id=None): # Recursively process all children for i in range(tree.getChildCount()): child = tree.getChild(i) - traverse_and_create(driver, child, current_node_id) + traverse_and_create(driver, component_id, child, current_node_id) def traverse_when_statement_extract_dependencies(tree: ParserRuleContext) -> list[tuple[str,str]]: @@ -41,7 +41,7 @@ def traverse_when_statement_extract_dependencies(tree: ParserRuleContext) -> lis ref_list = [] # The "when" field of a step can reference: - # - inputs (parameters) of that step in the form input.[param ID] + # - inputs (parameters) of that step in the form inputs.[param ID] # - outputs of different steps in the form steps.[step ID].outputs.[output ID] if rule_name == "MemberDotExpressionContext": split_text = text.split(".") diff --git a/graph_creation/cwl_processing.py b/graph_creation/cwl_processing.py index 8fea8e1524f87183b6038ed29ffe67ce1f8756bc..c0b1fc4ab1596c7bb981b3827bc92bff54c323b0 100644 --- a/graph_creation/cwl_processing.py +++ b/graph_creation/cwl_processing.py @@ -1,11 +1,11 @@ from neo4j import Driver -from graph_creation.cst_processing import traverse_when_statement_extract_dependencies +from graph_creation.cst_processing import traverse_and_create, traverse_when_statement_extract_dependencies from graph_creation.utils import create_input_nodes_and_relationships, process_source_relationship, resolve_relative_path -from neo4j_queries.node_queries import ensure_component_node, ensure_data_node, ensure_parameter_node, get_wf_data_nodes_from_step_in_param +from neo4j_queries.node_queries import ensure_component_node, ensure_data_node, ensure_in_parameter_node, ensure_out_parameter_node, get_wf_data_nodes_from_step_in_param from neo4j_queries.edge_queries import create_control_relationship, create_data_relationship, create_out_param_relationship from pathlib import Path -from parsers.javascript_parsing import parse_javascript_expression_string +from parsers.javascript_parsing import parse_javascript_expression_string, parse_javascript_string # TODO: deal with inputBindings def process_cwl_inputs(driver: Driver, cwl_entity: dict) -> None: @@ -54,7 +54,7 @@ def process_cwl_outputs(driver: Driver, cwl_entity: dict) -> None: if type(output) == dict: # Create out-parameter node with the parameter ID as defined in the component # and component ID equal to the path of the componet - param_node = ensure_parameter_node(driver, output['id'], component_id, 'out') + param_node = ensure_out_parameter_node(driver, output['id'], component_id) param_node_internal_id = param_node[0] # Create out-parameter node with the parameter ID as defined in the component # and component ID equal to the path of the componet @@ -114,7 +114,7 @@ def process_cwl_steps(driver: Driver, cwl_entity: dict) -> None: # Process the list of inputs of the step for input in step['in']: # Create in-parameter node with ID as defined in the component and component ID equal to the path of the step - param_node = ensure_parameter_node(driver, input['id'], step_path, 'in') + param_node = ensure_in_parameter_node(driver, input['id'], step_path) param_node_internal_id = param_node[0] # Create a data edge from the step component node to the in-parameter node create_data_relationship(driver, s_node_internal_id, param_node_internal_id) @@ -135,18 +135,18 @@ def process_cwl_steps(driver: Driver, cwl_entity: dict) -> None: expr_tree = parse_javascript_expression_string(when_expr) when_refs = traverse_when_statement_extract_dependencies(expr_tree) - data_nodes = [] + nodes = [] for ref in when_refs: ref_id = ref[1] if ref[0] == "parameter": - input_data = get_wf_data_nodes_from_step_in_param(driver, ref_id, step_path, cwl_entity['path']) - data_nodes.extend(input_data) + input_data = ensure_in_parameter_node(driver, ref_id, step_path)[0] + nodes.append(input_data) elif ref[0] == "step_output": step_output = ensure_data_node(driver, ref_id, cwl_entity['path'])[0] - data_nodes.append(step_output) + nodes.append(step_output) - for data_node in data_nodes: - create_control_relationship(driver, s_node_internal_id, data_node) + for node in nodes: + create_control_relationship(driver, s_node_internal_id, node, cwl_entity['path']) # Process the list of outputs of the step for output in step['out']: @@ -156,7 +156,7 @@ def process_cwl_steps(driver: Driver, cwl_entity: dict) -> None: else: output_id = output # Create out-parameter node with ID as defined in the component and component ID equal to the path of the step - param_node = ensure_parameter_node(driver, output_id, step_path, 'out') + param_node = ensure_out_parameter_node(driver, output_id, step_path) param_node_internal_id = param_node[0] # Create a data edge from out-parameter node to the step component node create_data_relationship(driver, param_node_internal_id, s_node_internal_id) @@ -164,5 +164,10 @@ def process_cwl_steps(driver: Driver, cwl_entity: dict) -> None: outer_output_id = f"{step['id']}/{output_id}" data_node = ensure_data_node(driver, outer_output_id, cwl_entity['path']) data_node_internal_id = data_node[0] - # Create a data edge from the out-parameter node to the data node - create_data_relationship(driver, param_node_internal_id, data_node_internal_id) \ No newline at end of file + # Create a data edge from the data node to the out-parameter node + create_data_relationship(driver, data_node_internal_id, param_node_internal_id) + +def process_cwl_expression(driver: Driver, entity: dict) -> None: + expression = entity['expression'] + expr_tree = parse_javascript_string(expression) + # traverse_and_create(driver, entity['path'], expr_tree) \ No newline at end of file diff --git a/graph_creation/repo_processing.py b/graph_creation/repo_processing.py index c69bc75daf3735b636ca791e762d3962af7531e6..0fb3afed0f054021b3f704aaf9bbc95055ea561b 100644 --- a/graph_creation/repo_processing.py +++ b/graph_creation/repo_processing.py @@ -1,6 +1,7 @@ from neo4j import Driver from graph_creation.cwl_parsing import get_cwl_from_repo -from graph_creation.cwl_processing import process_cwl_inputs, process_cwl_outputs, process_cwl_steps +from graph_creation.cwl_processing import process_cwl_expression, process_cwl_inputs, process_cwl_outputs, process_cwl_steps +from neo4j_queries.edge_queries import simplify_data_and_control_edges from neo4j_queries.node_queries import ensure_component_node def process_repos(repo_list: list[str], driver: Driver) -> None: @@ -23,3 +24,7 @@ def process_repos(repo_list: list[str], driver: Driver) -> None: process_cwl_outputs(driver, entity) if entity['class'] == 'Workflow': process_cwl_steps(driver, entity) + # elif entity['class'] == 'ExpressionTool': + # process_cwl_expression(driver, entity) + + simplify_data_and_control_edges(driver) diff --git a/graph_creation/utils.py b/graph_creation/utils.py index 0b62a0e43408db6c19e1c2ac3ab465bb9798539a..4f1c3ad7aff715348d62075f6655b59e1504eb9f 100644 --- a/graph_creation/utils.py +++ b/graph_creation/utils.py @@ -1,6 +1,6 @@ from pathlib import Path from neo4j import Driver -from neo4j_queries.node_queries import ensure_data_node, ensure_parameter_node +from neo4j_queries.node_queries import ensure_data_node, ensure_in_parameter_node from neo4j_queries.edge_queries import create_data_relationship, create_in_param_relationship def create_input_nodes_and_relationships(driver: Driver, input_id: str, component_id: str) -> None: @@ -18,7 +18,7 @@ def create_input_nodes_and_relationships(driver: Driver, input_id: str, componen component_id (str): the unique ID of the CWL component (its path) """ # Create in-parameter with the parameter ID as defined in the component and component ID equal to the path of the componet - param_node = ensure_parameter_node(driver, input_id, component_id, 'in') + param_node = ensure_in_parameter_node(driver, input_id, component_id) param_node_internal_id = param_node[0] # Create a data edge from the component node to the in-parameter node create_in_param_relationship(driver, component_id, param_node_internal_id) diff --git a/neo4j_queries/edge_queries.py b/neo4j_queries/edge_queries.py index 634673b89683bc53e7b8373e1d1f73447d6b342d..94fcdff11d749a681e3ef57069a0ff27af844826 100644 --- a/neo4j_queries/edge_queries.py +++ b/neo4j_queries/edge_queries.py @@ -19,7 +19,7 @@ def create_in_param_relationship(driver: Driver, prefixed_component_id: str, par """ component_id = clean_component_id(prefixed_component_id) query = """ - MATCH (c:Component {component_id: $component_id}), (p) + MATCH (c:Component {component_id: $component_id}), (p:InParameter) WHERE elementId(p) = $parameter_internal_id MERGE (c)-[:DATA]->(p) RETURN c.component_id AS component_id, p.parameter_id AS parameter_id @@ -48,7 +48,7 @@ def create_out_param_relationship(driver: Driver, prefixed_component_id: str, pa """ component_id = clean_component_id(prefixed_component_id) query = """ - MATCH (c:Component {component_id: $component_id}), (p) + MATCH (c:Component {component_id: $component_id}), (p: OutParameter) WHERE elementId(p) = $parameter_internal_id MERGE (c)<-[:DATA]-(p) RETURN c.component_id AS component_id, p.parameter_id AS parameter_id @@ -86,7 +86,7 @@ def create_data_relationship(driver: Driver, from_internal_node_id: int, to_inte return record["id_1"], record["id_2"] -def create_control_relationship(driver: Driver, from_internal_node_id: int, to_internal_node_id: int) -> tuple[int,int]: +def create_control_relationship(driver: Driver, from_internal_node_id: int, to_internal_node_id: int, component_id: str) -> tuple[int,int]: """ Creates a control dependency relationship in Neo4j between the two nodes with Neo4j internal IDs given as parameters. This relationship is an outgoing control edge from the node with internal ID from_internal_node_id @@ -103,12 +103,12 @@ def create_control_relationship(driver: Driver, from_internal_node_id: int, to_i query = """ MATCH (a), (b) WHERE elementId(a) = $from_internal_node_id AND elementId(b) = $to_internal_node_id - MERGE (a)-[:CONTROL]->(b) + MERGE (a)-[:CONTROL {component_id: $component_id}]->(b) RETURN elementId(a) AS id_1, elementId(b) AS id_2 """ with driver.session() as session: result = session.run(query, from_internal_node_id=from_internal_node_id, - to_internal_node_id=to_internal_node_id) + to_internal_node_id=to_internal_node_id, component_id=component_id) record = result.single() return record["id_1"], record["id_2"] @@ -128,11 +128,33 @@ def create_has_child_relationship(driver: Driver, parent_internal_node_id: int, query = """ MATCH (parent), (child) WHERE elementId(parent) = $parent_id AND elementId(child) = $child_id - CREATE (parent)-[:HAS_CHILD]->(child) + MERGE (parent)-[:HAS_CHILD]->(child) RETURN elementId(parent) AS id_1, elementId(child) AS id_2 """ with driver.session() as session: result = session.run(query, parent_id=parent_internal_node_id, child_id=child_internal_node_id) record = result.single() - return record["id_1"], record["id_2"] \ No newline at end of file + return record["id_1"], record["id_2"] + +def simplify_data_and_control_edges(driver: Driver): + with driver.session() as session: + create_data_edges_query = """ + MATCH (n:Data)<-[inEdge:DATA]-(n1), (n)-[outEdge:DATA]->(n2) + WITH n, n1, n2, n.component_id AS component_id, n.data_id AS data_id + MERGE (n1)-[newEdge:DATA {component_id: component_id, data_id: data_id}]->(n2) + """ + session.run(create_data_edges_query) + + create_control_edges_query = """ + MATCH (n:Data)<-[inEdge:CONTROL]-(n1), (n)-[outEdge:DATA]->(n2) + WITH n, n1, n2, n.component_id AS component_id, n.data_id AS data_id + MERGE (n1)-[newEdge:CONTROL {component_id: component_id, data_id: data_id}]->(n2) + """ + session.run(create_control_edges_query) + + delete_data_query = """ + MATCH (n:Data) + DETACH DELETE n + """ + session.run(delete_data_query) \ No newline at end of file diff --git a/neo4j_queries/node_queries.py b/neo4j_queries/node_queries.py index 361ff445c642b5f2d3925293730ce0a704218d93..11051870171f32fb43b3b7c672fe6c5d8e99e392 100644 --- a/neo4j_queries/node_queries.py +++ b/neo4j_queries/node_queries.py @@ -25,10 +25,10 @@ def ensure_component_node(driver: Driver, prefixed_component_id: str) -> tuple[i record = result.single() return record["node_internal_id"], record["component_id"] -def ensure_parameter_node(driver: Driver, node_id: str, prefixed_component_id: str, param_type: str) \ +def ensure_in_parameter_node(driver: Driver, node_id: str, prefixed_component_id: str) \ -> tuple[int,str,str,str]: """ - Ensures that there exists a parameter node with ID node_id and type param_type + Ensures that there exists an in-parameter node with ID node_id associated with the component in the file with local path prefixed_component_id. The ID of the component can be given based on the local relative path, so it is cleaned before querying Neo4j. @@ -37,21 +37,45 @@ def ensure_parameter_node(driver: Driver, node_id: str, prefixed_component_id: s driver (Driver): the Neo4j driver node_id (str): the ID of the parameter prefixed_component_id (str): the local relative path of the component - param_type (str): the type of the parameter ('in' or 'out') Returns: - tuple[int,str,str, str]: the Neoj4 internal ID of the parameter node, the parameter ID, the component ID, the parameter type + tuple[int,str,str]: the Neoj4 internal ID of the parameter node, the parameter ID, the component ID """ component_id = clean_component_id(prefixed_component_id) query = """ - MERGE (n:Parameter {parameter_id: $node_id, component_id: $component_id, parameter_type: $param_type}) - RETURN elementId(n) AS node_internal_id, n.parameter_id AS id_property, n.component_id AS component_id_property, - n.parameter_type AS parameter_type_property + MERGE (n:InParameter {parameter_id: $node_id, component_id: $component_id}) + RETURN elementId(n) AS node_internal_id, n.parameter_id AS id_property, n.component_id AS component_id_property """ with driver.session() as session: - result = session.run(query, node_id=node_id, component_id=component_id, param_type=param_type) + result = session.run(query, node_id=node_id, component_id=component_id) record = result.single() - return record["node_internal_id"], record["id_property"], record["component_id_property"], record['parameter_type_property'] + return record["node_internal_id"], record["id_property"], record["component_id_property"] + +def ensure_out_parameter_node(driver: Driver, node_id: str, prefixed_component_id: str) \ + -> tuple[int,str,str,str]: + """ + Ensures that there exists an out-parameter node with ID node_id + associated with the component in the file with local path prefixed_component_id. + The ID of the component can be given based on the local relative path, so it is cleaned + before querying Neo4j. + + Parameters: + driver (Driver): the Neo4j driver + node_id (str): the ID of the parameter + prefixed_component_id (str): the local relative path of the component + + Returns: + tuple[int,str,str]: the Neoj4 internal ID of the parameter node, the parameter ID, the component ID + """ + component_id = clean_component_id(prefixed_component_id) + query = """ + MERGE (n:OutParameter {parameter_id: $node_id, component_id: $component_id}) + RETURN elementId(n) AS node_internal_id, n.parameter_id AS id_property, n.component_id AS component_id_property + """ + with driver.session() as session: + result = session.run(query, node_id=node_id, component_id=component_id) + record = result.single() + return record["node_internal_id"], record["id_property"], record["component_id_property"] def ensure_data_node(driver: Driver, node_id: str, prefixed_component_id: str) -> tuple[int,str,str]: """ @@ -79,39 +103,13 @@ def ensure_data_node(driver: Driver, node_id: str, prefixed_component_id: str) - return record["node_internal_id"], record["id_property"], record["component_id_property"] -def create_ast_node(driver, rule, text): +def create_ast_node(driver, component_id, rule, text): query = """ - CREATE (n:ASTNode {rule: $rule, text: $text}) + MERGE (n:ASTNode {component_id:$component_id, rule: $rule, text: $text}) RETURN elementId(n) AS node_id """ with driver.session() as session: - result = session.run(query, rule=rule, text=text) + result = session.run(query, component_id=component_id, rule=rule, text=text) record = result.single() return record["node_id"] - -def get_wf_data_nodes_from_step_in_param(driver: Driver, param_id: str, prefixed_step_id: str, prefixed_workflow_id: str) -> list[int]: - """ - Retrieves the internal IDs of data nodes (in a Neo4j database) belonging to the workflow with ID workflow_id - such that the in parameter with ID param_id of workflow step step_id has a data dependency on these data nodes. - This means that in said workflow these data nodes are injected into the parameter param_id of the step. - The ID of the component can be given based on the local relative path, so it is cleaned - before querying Neo4j. - - Parameters: - param_id: the parameter ID of the step parameter - prefixed_step_id: the unique ID of the step - prefixed_workflow_id: the unique ID of the workflow the step is part of - - Returns: - list[int]: the Neo4j internal IDs of the data nodes connected to the parameter node of the step in the mentioned workflow - """ - step_id = clean_component_id(prefixed_step_id) - workflow_id = clean_component_id(prefixed_workflow_id) - - query = """ - MATCH (n1:Data {component_id: $workflow_id})<-[:DATA]-(n2:Parameter {component_id: $step_id, parameter_type: "in", parameter_id: $param_id}) - RETURN elementId(n1) AS internal_id - """ - with driver.session() as session: - result = session.run(query, workflow_id=workflow_id, step_id=step_id, param_id=param_id) - return [record["internal_id"] for record in result] + \ No newline at end of file