Skip to content
Snippets Groups Projects
Commit be863471 authored by Chiara Liotta's avatar Chiara Liotta
Browse files

simplify graph

parent f3dbea4b
No related branches found
No related tags found
No related merge requests found
...@@ -4,12 +4,12 @@ from neo4j import Driver ...@@ -4,12 +4,12 @@ from neo4j import Driver
from neo4j_queries.edge_queries import create_has_child_relationship from neo4j_queries.edge_queries import create_has_child_relationship
from neo4j_queries.node_queries import create_ast_node from neo4j_queries.node_queries import create_ast_node
def traverse_and_create(driver: Driver, tree, parent_node_id=None): def traverse_and_create(driver: Driver, component_id: str, tree, parent_node_id=None):
# Create a Neo4j node for the current tree node # Create a Neo4j node for the current tree node
rule_name = type(tree).__name__ rule_name = type(tree).__name__
text = tree.getText() if tree.getText() else None text = tree.getText() if tree.getText() else None
current_node_id = create_ast_node(driver, rule_name, text) current_node_id = create_ast_node(driver, component_id, rule_name, text)
# If there's a parent, create a relationship # If there's a parent, create a relationship
if parent_node_id is not None: if parent_node_id is not None:
...@@ -18,7 +18,7 @@ def traverse_and_create(driver: Driver, tree, parent_node_id=None): ...@@ -18,7 +18,7 @@ def traverse_and_create(driver: Driver, tree, parent_node_id=None):
# Recursively process all children # Recursively process all children
for i in range(tree.getChildCount()): for i in range(tree.getChildCount()):
child = tree.getChild(i) child = tree.getChild(i)
traverse_and_create(driver, child, current_node_id) traverse_and_create(driver, component_id, child, current_node_id)
def traverse_when_statement_extract_dependencies(tree: ParserRuleContext) -> list[tuple[str,str]]: def traverse_when_statement_extract_dependencies(tree: ParserRuleContext) -> list[tuple[str,str]]:
...@@ -41,7 +41,7 @@ def traverse_when_statement_extract_dependencies(tree: ParserRuleContext) -> lis ...@@ -41,7 +41,7 @@ def traverse_when_statement_extract_dependencies(tree: ParserRuleContext) -> lis
ref_list = [] ref_list = []
# The "when" field of a step can reference: # The "when" field of a step can reference:
# - inputs (parameters) of that step in the form input.[param ID] # - inputs (parameters) of that step in the form inputs.[param ID]
# - outputs of different steps in the form steps.[step ID].outputs.[output ID] # - outputs of different steps in the form steps.[step ID].outputs.[output ID]
if rule_name == "MemberDotExpressionContext": if rule_name == "MemberDotExpressionContext":
split_text = text.split(".") split_text = text.split(".")
......
from neo4j import Driver from neo4j import Driver
from graph_creation.cst_processing import traverse_when_statement_extract_dependencies from graph_creation.cst_processing import traverse_and_create, traverse_when_statement_extract_dependencies
from graph_creation.utils import create_input_nodes_and_relationships, process_source_relationship, resolve_relative_path from graph_creation.utils import create_input_nodes_and_relationships, process_source_relationship, resolve_relative_path
from neo4j_queries.node_queries import ensure_component_node, ensure_data_node, ensure_parameter_node, get_wf_data_nodes_from_step_in_param from neo4j_queries.node_queries import ensure_component_node, ensure_data_node, ensure_in_parameter_node, ensure_out_parameter_node, get_wf_data_nodes_from_step_in_param
from neo4j_queries.edge_queries import create_control_relationship, create_data_relationship, create_out_param_relationship from neo4j_queries.edge_queries import create_control_relationship, create_data_relationship, create_out_param_relationship
from pathlib import Path from pathlib import Path
from parsers.javascript_parsing import parse_javascript_expression_string from parsers.javascript_parsing import parse_javascript_expression_string, parse_javascript_string
# TODO: deal with inputBindings # TODO: deal with inputBindings
def process_cwl_inputs(driver: Driver, cwl_entity: dict) -> None: def process_cwl_inputs(driver: Driver, cwl_entity: dict) -> None:
...@@ -54,7 +54,7 @@ def process_cwl_outputs(driver: Driver, cwl_entity: dict) -> None: ...@@ -54,7 +54,7 @@ def process_cwl_outputs(driver: Driver, cwl_entity: dict) -> None:
if type(output) == dict: if type(output) == dict:
# Create out-parameter node with the parameter ID as defined in the component # Create out-parameter node with the parameter ID as defined in the component
# and component ID equal to the path of the component # and component ID equal to the path of the component
param_node = ensure_parameter_node(driver, output['id'], component_id, 'out') param_node = ensure_out_parameter_node(driver, output['id'], component_id)
param_node_internal_id = param_node[0] param_node_internal_id = param_node[0]
# Create out-parameter node with the parameter ID as defined in the component # Create out-parameter node with the parameter ID as defined in the component
# and component ID equal to the path of the component # and component ID equal to the path of the component
...@@ -114,7 +114,7 @@ def process_cwl_steps(driver: Driver, cwl_entity: dict) -> None: ...@@ -114,7 +114,7 @@ def process_cwl_steps(driver: Driver, cwl_entity: dict) -> None:
# Process the list of inputs of the step # Process the list of inputs of the step
for input in step['in']: for input in step['in']:
# Create in-parameter node with ID as defined in the component and component ID equal to the path of the step # Create in-parameter node with ID as defined in the component and component ID equal to the path of the step
param_node = ensure_parameter_node(driver, input['id'], step_path, 'in') param_node = ensure_in_parameter_node(driver, input['id'], step_path)
param_node_internal_id = param_node[0] param_node_internal_id = param_node[0]
# Create a data edge from the step component node to the in-parameter node # Create a data edge from the step component node to the in-parameter node
create_data_relationship(driver, s_node_internal_id, param_node_internal_id) create_data_relationship(driver, s_node_internal_id, param_node_internal_id)
...@@ -135,18 +135,18 @@ def process_cwl_steps(driver: Driver, cwl_entity: dict) -> None: ...@@ -135,18 +135,18 @@ def process_cwl_steps(driver: Driver, cwl_entity: dict) -> None:
expr_tree = parse_javascript_expression_string(when_expr) expr_tree = parse_javascript_expression_string(when_expr)
when_refs = traverse_when_statement_extract_dependencies(expr_tree) when_refs = traverse_when_statement_extract_dependencies(expr_tree)
data_nodes = [] nodes = []
for ref in when_refs: for ref in when_refs:
ref_id = ref[1] ref_id = ref[1]
if ref[0] == "parameter": if ref[0] == "parameter":
input_data = get_wf_data_nodes_from_step_in_param(driver, ref_id, step_path, cwl_entity['path']) input_data = ensure_in_parameter_node(driver, ref_id, step_path)[0]
data_nodes.extend(input_data) nodes.append(input_data)
elif ref[0] == "step_output": elif ref[0] == "step_output":
step_output = ensure_data_node(driver, ref_id, cwl_entity['path'])[0] step_output = ensure_data_node(driver, ref_id, cwl_entity['path'])[0]
data_nodes.append(step_output) nodes.append(step_output)
for data_node in data_nodes: for node in nodes:
create_control_relationship(driver, s_node_internal_id, data_node) create_control_relationship(driver, s_node_internal_id, node, cwl_entity['path'])
# Process the list of outputs of the step # Process the list of outputs of the step
for output in step['out']: for output in step['out']:
...@@ -156,7 +156,7 @@ def process_cwl_steps(driver: Driver, cwl_entity: dict) -> None: ...@@ -156,7 +156,7 @@ def process_cwl_steps(driver: Driver, cwl_entity: dict) -> None:
else: else:
output_id = output output_id = output
# Create out-parameter node with ID as defined in the component and component ID equal to the path of the step # Create out-parameter node with ID as defined in the component and component ID equal to the path of the step
param_node = ensure_parameter_node(driver, output_id, step_path, 'out') param_node = ensure_out_parameter_node(driver, output_id, step_path)
param_node_internal_id = param_node[0] param_node_internal_id = param_node[0]
# Create a data edge from out-parameter node to the step component node # Create a data edge from out-parameter node to the step component node
create_data_relationship(driver, param_node_internal_id, s_node_internal_id) create_data_relationship(driver, param_node_internal_id, s_node_internal_id)
...@@ -164,5 +164,10 @@ def process_cwl_steps(driver: Driver, cwl_entity: dict) -> None: ...@@ -164,5 +164,10 @@ def process_cwl_steps(driver: Driver, cwl_entity: dict) -> None:
outer_output_id = f"{step['id']}/{output_id}" outer_output_id = f"{step['id']}/{output_id}"
data_node = ensure_data_node(driver, outer_output_id, cwl_entity['path']) data_node = ensure_data_node(driver, outer_output_id, cwl_entity['path'])
data_node_internal_id = data_node[0] data_node_internal_id = data_node[0]
# Create a data edge from the out-parameter node to the data node # Create a data edge from the data node to the out-parameter node
create_data_relationship(driver, param_node_internal_id, data_node_internal_id) create_data_relationship(driver, data_node_internal_id, param_node_internal_id)
\ No newline at end of file
def process_cwl_expression(driver: Driver, entity: dict) -> None:
    """
    Parses the JavaScript code stored under the 'expression' key of a CWL entity.

    The parse tree is built but not used yet: the call that would write it to Neo4j
    is disabled, so the driver is currently untouched by this function.

    Parameters:
        driver (Driver): the Neo4j driver (unused while graph creation is disabled)
        entity (dict): the CWL entity; it must contain the key 'expression'
    """
    parsed_expression = parse_javascript_string(entity['expression'])
    # Disabled for now: traverse_and_create(driver, entity['path'], parsed_expression)
\ No newline at end of file
from neo4j import Driver from neo4j import Driver
from graph_creation.cwl_parsing import get_cwl_from_repo from graph_creation.cwl_parsing import get_cwl_from_repo
from graph_creation.cwl_processing import process_cwl_inputs, process_cwl_outputs, process_cwl_steps from graph_creation.cwl_processing import process_cwl_expression, process_cwl_inputs, process_cwl_outputs, process_cwl_steps
from neo4j_queries.edge_queries import simplify_data_and_control_edges
from neo4j_queries.node_queries import ensure_component_node from neo4j_queries.node_queries import ensure_component_node
def process_repos(repo_list: list[str], driver: Driver) -> None: def process_repos(repo_list: list[str], driver: Driver) -> None:
...@@ -23,3 +24,7 @@ def process_repos(repo_list: list[str], driver: Driver) -> None: ...@@ -23,3 +24,7 @@ def process_repos(repo_list: list[str], driver: Driver) -> None:
process_cwl_outputs(driver, entity) process_cwl_outputs(driver, entity)
if entity['class'] == 'Workflow': if entity['class'] == 'Workflow':
process_cwl_steps(driver, entity) process_cwl_steps(driver, entity)
# elif entity['class'] == 'ExpressionTool':
# process_cwl_expression(driver, entity)
simplify_data_and_control_edges(driver)
from pathlib import Path from pathlib import Path
from neo4j import Driver from neo4j import Driver
from neo4j_queries.node_queries import ensure_data_node, ensure_parameter_node from neo4j_queries.node_queries import ensure_data_node, ensure_in_parameter_node
from neo4j_queries.edge_queries import create_data_relationship, create_in_param_relationship from neo4j_queries.edge_queries import create_data_relationship, create_in_param_relationship
def create_input_nodes_and_relationships(driver: Driver, input_id: str, component_id: str) -> None: def create_input_nodes_and_relationships(driver: Driver, input_id: str, component_id: str) -> None:
...@@ -18,7 +18,7 @@ def create_input_nodes_and_relationships(driver: Driver, input_id: str, componen ...@@ -18,7 +18,7 @@ def create_input_nodes_and_relationships(driver: Driver, input_id: str, componen
component_id (str): the unique ID of the CWL component (its path) component_id (str): the unique ID of the CWL component (its path)
""" """
# Create in-parameter with the parameter ID as defined in the component and component ID equal to the path of the component # Create in-parameter with the parameter ID as defined in the component and component ID equal to the path of the component
param_node = ensure_parameter_node(driver, input_id, component_id, 'in') param_node = ensure_in_parameter_node(driver, input_id, component_id)
param_node_internal_id = param_node[0] param_node_internal_id = param_node[0]
# Create a data edge from the component node to the in-parameter node # Create a data edge from the component node to the in-parameter node
create_in_param_relationship(driver, component_id, param_node_internal_id) create_in_param_relationship(driver, component_id, param_node_internal_id)
......
...@@ -19,7 +19,7 @@ def create_in_param_relationship(driver: Driver, prefixed_component_id: str, par ...@@ -19,7 +19,7 @@ def create_in_param_relationship(driver: Driver, prefixed_component_id: str, par
""" """
component_id = clean_component_id(prefixed_component_id) component_id = clean_component_id(prefixed_component_id)
query = """ query = """
MATCH (c:Component {component_id: $component_id}), (p) MATCH (c:Component {component_id: $component_id}), (p:InParameter)
WHERE elementId(p) = $parameter_internal_id WHERE elementId(p) = $parameter_internal_id
MERGE (c)-[:DATA]->(p) MERGE (c)-[:DATA]->(p)
RETURN c.component_id AS component_id, p.parameter_id AS parameter_id RETURN c.component_id AS component_id, p.parameter_id AS parameter_id
...@@ -48,7 +48,7 @@ def create_out_param_relationship(driver: Driver, prefixed_component_id: str, pa ...@@ -48,7 +48,7 @@ def create_out_param_relationship(driver: Driver, prefixed_component_id: str, pa
""" """
component_id = clean_component_id(prefixed_component_id) component_id = clean_component_id(prefixed_component_id)
query = """ query = """
MATCH (c:Component {component_id: $component_id}), (p) MATCH (c:Component {component_id: $component_id}), (p: OutParameter)
WHERE elementId(p) = $parameter_internal_id WHERE elementId(p) = $parameter_internal_id
MERGE (c)<-[:DATA]-(p) MERGE (c)<-[:DATA]-(p)
RETURN c.component_id AS component_id, p.parameter_id AS parameter_id RETURN c.component_id AS component_id, p.parameter_id AS parameter_id
...@@ -86,7 +86,7 @@ def create_data_relationship(driver: Driver, from_internal_node_id: int, to_inte ...@@ -86,7 +86,7 @@ def create_data_relationship(driver: Driver, from_internal_node_id: int, to_inte
return record["id_1"], record["id_2"] return record["id_1"], record["id_2"]
def create_control_relationship(driver: Driver, from_internal_node_id: int, to_internal_node_id: int) -> tuple[int,int]: def create_control_relationship(driver: Driver, from_internal_node_id: int, to_internal_node_id: int, component_id: str) -> tuple[int,int]:
""" """
Creates a control dependency relationship in Neo4j between the two nodes with Neo4j internal IDs given as parameters. Creates a control dependency relationship in Neo4j between the two nodes with Neo4j internal IDs given as parameters.
This relationship is an outgoing control edge from the node with internal ID from_internal_node_id This relationship is an outgoing control edge from the node with internal ID from_internal_node_id
...@@ -103,12 +103,12 @@ def create_control_relationship(driver: Driver, from_internal_node_id: int, to_i ...@@ -103,12 +103,12 @@ def create_control_relationship(driver: Driver, from_internal_node_id: int, to_i
query = """ query = """
MATCH (a), (b) MATCH (a), (b)
WHERE elementId(a) = $from_internal_node_id AND elementId(b) = $to_internal_node_id WHERE elementId(a) = $from_internal_node_id AND elementId(b) = $to_internal_node_id
MERGE (a)-[:CONTROL]->(b) MERGE (a)-[:CONTROL {component_id: $component_id}]->(b)
RETURN elementId(a) AS id_1, elementId(b) AS id_2 RETURN elementId(a) AS id_1, elementId(b) AS id_2
""" """
with driver.session() as session: with driver.session() as session:
result = session.run(query, from_internal_node_id=from_internal_node_id, result = session.run(query, from_internal_node_id=from_internal_node_id,
to_internal_node_id=to_internal_node_id) to_internal_node_id=to_internal_node_id, component_id=component_id)
record = result.single() record = result.single()
return record["id_1"], record["id_2"] return record["id_1"], record["id_2"]
...@@ -128,7 +128,7 @@ def create_has_child_relationship(driver: Driver, parent_internal_node_id: int, ...@@ -128,7 +128,7 @@ def create_has_child_relationship(driver: Driver, parent_internal_node_id: int,
query = """ query = """
MATCH (parent), (child) MATCH (parent), (child)
WHERE elementId(parent) = $parent_id AND elementId(child) = $child_id WHERE elementId(parent) = $parent_id AND elementId(child) = $child_id
CREATE (parent)-[:HAS_CHILD]->(child) MERGE (parent)-[:HAS_CHILD]->(child)
RETURN elementId(parent) AS id_1, elementId(child) AS id_2 RETURN elementId(parent) AS id_1, elementId(child) AS id_2
""" """
with driver.session() as session: with driver.session() as session:
...@@ -136,3 +136,25 @@ def create_has_child_relationship(driver: Driver, parent_internal_node_id: int, ...@@ -136,3 +136,25 @@ def create_has_child_relationship(driver: Driver, parent_internal_node_id: int,
child_id=child_internal_node_id) child_id=child_internal_node_id)
record = result.single() record = result.single()
return record["id_1"], record["id_2"] return record["id_1"], record["id_2"]
def simplify_data_and_control_edges(driver: Driver):
    """
    Removes every Data node from the graph after bridging over it with direct edges.

    Three Cypher statements are run in order within a single session:
    1. for each Data node with an incoming DATA edge from n1 and an outgoing DATA edge
       to n2, a DATA edge from n1 to n2 is merged;
    2. for each Data node with an incoming CONTROL edge from n1 and an outgoing DATA edge
       to n2, a CONTROL edge from n1 to n2 is merged;
    3. all Data nodes are detached and deleted.
    The merged edges carry the component_id and data_id of the Data node they bypass.

    Parameters:
        driver (Driver): the Neo4j driver
    """
    bridge_data_edges = """
    MATCH (n:Data)<-[inEdge:DATA]-(n1), (n)-[outEdge:DATA]->(n2)
    WITH n, n1, n2, n.component_id AS component_id, n.data_id AS data_id
    MERGE (n1)-[newEdge:DATA {component_id: component_id, data_id: data_id}]->(n2)
    """
    bridge_control_edges = """
    MATCH (n:Data)<-[inEdge:CONTROL]-(n1), (n)-[outEdge:DATA]->(n2)
    WITH n, n1, n2, n.component_id AS component_id, n.data_id AS data_id
    MERGE (n1)-[newEdge:CONTROL {component_id: component_id, data_id: data_id}]->(n2)
    """
    drop_data_nodes = """
    MATCH (n:Data)
    DETACH DELETE n
    """
    # The deletion has to come last: both bridging statements match on the Data nodes.
    with driver.session() as session:
        for statement in (bridge_data_edges, bridge_control_edges, drop_data_nodes):
            session.run(statement)
\ No newline at end of file
...@@ -25,10 +25,10 @@ def ensure_component_node(driver: Driver, prefixed_component_id: str) -> tuple[i ...@@ -25,10 +25,10 @@ def ensure_component_node(driver: Driver, prefixed_component_id: str) -> tuple[i
record = result.single() record = result.single()
return record["node_internal_id"], record["component_id"] return record["node_internal_id"], record["component_id"]
def ensure_parameter_node(driver: Driver, node_id: str, prefixed_component_id: str, param_type: str) \ def ensure_in_parameter_node(driver: Driver, node_id: str, prefixed_component_id: str) \
-> tuple[int,str,str,str]: -> tuple[int,str,str,str]:
""" """
Ensures that there exists a parameter node with ID node_id and type param_type Ensures that there exists an in-parameter node with ID node_id
associated with the component in the file with local path prefixed_component_id. associated with the component in the file with local path prefixed_component_id.
The ID of the component can be given based on the local relative path, so it is cleaned The ID of the component can be given based on the local relative path, so it is cleaned
before querying Neo4j. before querying Neo4j.
...@@ -37,21 +37,45 @@ def ensure_parameter_node(driver: Driver, node_id: str, prefixed_component_id: s ...@@ -37,21 +37,45 @@ def ensure_parameter_node(driver: Driver, node_id: str, prefixed_component_id: s
driver (Driver): the Neo4j driver driver (Driver): the Neo4j driver
node_id (str): the ID of the parameter node_id (str): the ID of the parameter
prefixed_component_id (str): the local relative path of the component prefixed_component_id (str): the local relative path of the component
param_type (str): the type of the parameter ('in' or 'out')
Returns: Returns:
tuple[int,str,str, str]: the Neoj4 internal ID of the parameter node, the parameter ID, the component ID, the parameter type tuple[int,str,str]: the Neoj4 internal ID of the parameter node, the parameter ID, the component ID
""" """
component_id = clean_component_id(prefixed_component_id) component_id = clean_component_id(prefixed_component_id)
query = """ query = """
MERGE (n:Parameter {parameter_id: $node_id, component_id: $component_id, parameter_type: $param_type}) MERGE (n:InParameter {parameter_id: $node_id, component_id: $component_id})
RETURN elementId(n) AS node_internal_id, n.parameter_id AS id_property, n.component_id AS component_id_property, RETURN elementId(n) AS node_internal_id, n.parameter_id AS id_property, n.component_id AS component_id_property
n.parameter_type AS parameter_type_property
""" """
with driver.session() as session: with driver.session() as session:
result = session.run(query, node_id=node_id, component_id=component_id, param_type=param_type) result = session.run(query, node_id=node_id, component_id=component_id)
record = result.single() record = result.single()
return record["node_internal_id"], record["id_property"], record["component_id_property"], record['parameter_type_property'] return record["node_internal_id"], record["id_property"], record["component_id_property"]
def ensure_out_parameter_node(driver: Driver, node_id: str, prefixed_component_id: str) \
        -> tuple[int,str,str]:
    """
    Ensures that there exists an out-parameter node with ID node_id
    associated with the component in the file with local path prefixed_component_id.
    The ID of the component can be given based on the local relative path, so it is cleaned
    before querying Neo4j.

    The node is created with MERGE, so calling this function repeatedly with the same
    arguments keeps returning the same node instead of creating duplicates.

    Parameters:
        driver (Driver): the Neo4j driver
        node_id (str): the ID of the parameter
        prefixed_component_id (str): the local relative path of the component

    Returns:
        tuple[int,str,str]: the Neo4j internal ID of the parameter node, the parameter ID, the component ID
    """
    # NOTE(review): the internal ID is the value of Cypher's elementId(n); the `int`
    # annotation follows the sibling helpers in this file - confirm against the Neo4j version in use.
    component_id = clean_component_id(prefixed_component_id)
    query = """
    MERGE (n:OutParameter {parameter_id: $node_id, component_id: $component_id})
    RETURN elementId(n) AS node_internal_id, n.parameter_id AS id_property, n.component_id AS component_id_property
    """
    with driver.session() as session:
        result = session.run(query, node_id=node_id, component_id=component_id)
        # Fetch the single record while the session is still open
        record = result.single()
    return record["node_internal_id"], record["id_property"], record["component_id_property"]
def ensure_data_node(driver: Driver, node_id: str, prefixed_component_id: str) -> tuple[int,str,str]: def ensure_data_node(driver: Driver, node_id: str, prefixed_component_id: str) -> tuple[int,str,str]:
""" """
...@@ -79,39 +103,13 @@ def ensure_data_node(driver: Driver, node_id: str, prefixed_component_id: str) - ...@@ -79,39 +103,13 @@ def ensure_data_node(driver: Driver, node_id: str, prefixed_component_id: str) -
return record["node_internal_id"], record["id_property"], record["component_id_property"] return record["node_internal_id"], record["id_property"], record["component_id_property"]
def create_ast_node(driver, rule, text): def create_ast_node(driver, component_id, rule, text):
query = """ query = """
CREATE (n:ASTNode {rule: $rule, text: $text}) MERGE (n:ASTNode {component_id:$component_id, rule: $rule, text: $text})
RETURN elementId(n) AS node_id RETURN elementId(n) AS node_id
""" """
with driver.session() as session: with driver.session() as session:
result = session.run(query, rule=rule, text=text) result = session.run(query, component_id=component_id, rule=rule, text=text)
record = result.single() record = result.single()
return record["node_id"] return record["node_id"]
\ No newline at end of file
def get_wf_data_nodes_from_step_in_param(driver: Driver, param_id: str, prefixed_step_id: str, prefixed_workflow_id: str) -> list[int]:
    """
    Looks up the data nodes of a workflow that are linked to one in-parameter of one of its steps.

    The query matches Parameter nodes of type "in" with ID param_id that belong to the step,
    and collects the Data nodes of the workflow reached by a DATA edge going from that
    parameter node to the data node.
    Both component IDs can be given based on the local relative path, so they are cleaned
    before querying Neo4j.

    Parameters:
        driver (Driver): the Neo4j driver
        param_id (str): the parameter ID of the step parameter
        prefixed_step_id (str): the unique ID of the step
        prefixed_workflow_id (str): the unique ID of the workflow the step is part of

    Returns:
        list[int]: the Neo4j internal IDs of the matching data nodes (empty if there are none)
    """
    query = """
    MATCH (n1:Data {component_id: $workflow_id})<-[:DATA]-(n2:Parameter {component_id: $step_id, parameter_type: "in", parameter_id: $param_id})
    RETURN elementId(n1) AS internal_id
    """
    query_parameters = {
        "step_id": clean_component_id(prefixed_step_id),
        "workflow_id": clean_component_id(prefixed_workflow_id),
        "param_id": param_id,
    }
    internal_ids = []
    with driver.session() as session:
        # Records are read while the session is open; one internal ID per matched data node
        for record in session.run(query, **query_parameters):
            internal_ids.append(record["internal_id"])
    return internal_ids
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment