diff --git a/graph_creation/cwl_parsing.py b/graph_creation/cwl_parsing.py index 53dc06183d68db7426976945cba5a804944dc2e7..36baae11f5d6d121a875df4df2761009b0126019 100644 --- a/graph_creation/cwl_parsing.py +++ b/graph_creation/cwl_parsing.py @@ -9,10 +9,10 @@ def get_cwl_from_repo(repo_path: str) -> list[dict]: The path is saved using the key 'path' with value equal to the relative path of the CWL file. Parameters: - repo_path (str): the path of the local repository + repo_path (str): the path of the local repository Returns: - list[dict]: a list of dictonaries, each dictionary is a parsed CWL file + list[dict]: a list of dictonaries, each dictionary is a parsed CWL file """ cwl_entities = [] pathlist = Path(repo_path).glob('**/*.cwl') diff --git a/graph_creation/cwl_processing.py b/graph_creation/cwl_processing.py index 24463960331c254c136c40c58501ead07a206e75..57ab18fe254d9cec4e822dfafe9015e8b719d143 100644 --- a/graph_creation/cwl_processing.py +++ b/graph_creation/cwl_processing.py @@ -1,5 +1,5 @@ from neo4j import Driver -from graph_creation.utils import create_input_nodes_and_relationships, process_source_relationship +from graph_creation.utils import create_input_nodes_and_relationships, process_source_relationship, resolve_relative_path from neo4j_queries.node_queries import ensure_component_node, ensure_data_node, ensure_parameter_node from neo4j_queries.edge_queries import create_data_relationship, create_out_param_relationship from pathlib import Path @@ -15,8 +15,8 @@ def process_cwl_inputs(driver: Driver, cwl_entity: dict) -> None: - a data edge from the data node to the the in-parameter node Parameters: - driver (Driver): the driver used to connect to Neo4j - cwl_entity (dict): the dictionary containing the parsed contents of the CWL component + driver (Driver): the driver used to connect to Neo4j + cwl_entity (dict): the dictionary containing the parsed contents of the CWL component """ component_id = cwl_entity['path'] # Inputs can be defined a 
list or a dictionary @@ -43,8 +43,8 @@ def process_cwl_outputs(driver: Driver, cwl_entity: dict) -> None: - a data edge from the out-parameter node to the data node Parameters: - driver (Driver): the driver used to connect to Neo4j - cwl_entity (dict): the dictionary containing the parsed contents of the CWL component + driver (Driver): the driver used to connect to Neo4j + cwl_entity (dict): the dictionary containing the parsed contents of the CWL component """ component_id = cwl_entity['path'] for output in cwl_entity['outputs']: @@ -66,7 +66,7 @@ def process_cwl_outputs(driver: Driver, cwl_entity: dict) -> None: for source_id in output['outputSource']: process_source_relationship(driver, source_id, component_id, param_node_internal_id) -def process_cwl_steps(driver: Driver, cwl_entity: dict, repo_path: str) -> None: +def process_cwl_steps(driver: Driver, cwl_entity: dict) -> None: """ Processes the steps of a CWL Workflow component( which we will refer to as outer workflow component). A step can be a Workflow, CommandLineTool or ExpressionTool. 
@@ -87,14 +87,16 @@ def process_cwl_steps(driver: Driver, cwl_entity: dict, repo_path: str) -> None: - a data edge from the out-parameter node to the data node Parameters: - driver (Driver): the driver used to connect to Neo4j - cwl_entity (dict): the dictionary containing the parsed contents of the CWL component - repo_path (str): the path of the repository that contains the CWL component + driver (Driver): the driver used to connect to Neo4j + cwl_entity (dict): the dictionary containing the parsed contents of the CWL component """ for step in cwl_entity['steps']: + # Retrieve path of the step - combined_path = Path(repo_path) / step['run'] - step_path = str(combined_path) + workflow_folder = Path(cwl_entity['path']).parent + full_step_path = workflow_folder / Path(step['run']) + step_path = str(resolve_relative_path(full_step_path)) + # Create the step component node with ID equal to the step s_node = ensure_component_node(driver, step_path) s_node_internal_id = s_node[0] diff --git a/graph_creation/repo_processing.py b/graph_creation/repo_processing.py index 9e53fbbdb6195eca995bb1849b75cfb5e83d1cf9..34850abda8a2390d567efc72260bc7ffc3ddda04 100644 --- a/graph_creation/repo_processing.py +++ b/graph_creation/repo_processing.py @@ -9,8 +9,8 @@ def process_repos(repo_list: list[str], driver: Driver) -> None: the function parses the CWL files and turns them into a Neo4j dependency graph. 
Parameters: - repo_list (list[str]): a list of paths to local repositories - driver (Driver): a Neo4j driver + repo_list (list[str]): a list of paths to local repositories + driver (Driver): a Neo4j driver """ cwl_entities = {} for repo in repo_list: diff --git a/graph_creation/utils.py b/graph_creation/utils.py index 86e00181e8d7dc5a81ca534c272d48756a0fa939..0b62a0e43408db6c19e1c2ac3ab465bb9798539a 100644 --- a/graph_creation/utils.py +++ b/graph_creation/utils.py @@ -1,3 +1,4 @@ +from pathlib import Path from neo4j import Driver from neo4j_queries.node_queries import ensure_data_node, ensure_parameter_node from neo4j_queries.edge_queries import create_data_relationship, create_in_param_relationship @@ -12,9 +13,9 @@ def create_input_nodes_and_relationships(driver: Driver, input_id: str, componen - a data edge from the data node to the the in-parameter node Parameters: - driver (Driver): the driver used to connect to Neo4j - input_id (str): the ID of the input as defined in the CWL component - component_id (str): the unique ID of the CWL component (its path) + driver (Driver): the driver used to connect to Neo4j + input_id (str): the ID of the input as defined in the CWL component + component_id (str): the unique ID of the CWL component (its path) """ # Create in-parameter with the parameter ID as defined in the component and component ID equal to the path of the componet param_node = ensure_parameter_node(driver, input_id, component_id, 'in') @@ -36,11 +37,38 @@ def process_source_relationship(driver: Driver, source_id: str, component_id: st - a data edge from the parameter node to the data node Parameters: - driver (Driver): the driver used to connect to Neo4j - source_id (str): the ID of the data that functions as a source for the parameter - component_id (str): the unique ID of the CWL component (its path) - param_node_internal_id (int): the unique ID of the parameter node as defined internally by Neo4j + driver (Driver): the driver used to connect to Neo4j + 
def resolve_relative_path(path: Path) -> Path:
    """
    Simplifies the `.` (current directory) and `..` (parent directory) components
    of a path lexically, without touching the file system and without converting
    a relative path into an absolute one.

    A `..` component cancels the preceding regular component. When there is nothing
    left to cancel, the `..` is kept for relative paths (so `../a` stays `../a`
    instead of silently turning into `a`) and is discarded for rooted paths
    (the parent of the root is the root itself, so `/../a` becomes `/a`).

    Parameters:
        path (Path): the input Path object to be resolved

    Returns:
        Path: a new object representing the simplified path

    Example:
        >>> resolve_relative_path(Path("x/y/../z")).as_posix()
        'x/z'

        >>> resolve_relative_path(Path("./a/./b/c/../d")).as_posix()
        'a/b/d'

        >>> resolve_relative_path(Path("a/../../b")).as_posix()
        '../b'
    """
    # The anchor (drive and/or root) is always the first element of `parts` when present.
    # It is kept aside so that a `..` component can never remove it.
    anchor = path.anchor
    segments = path.parts[1:] if anchor else path.parts

    resolved: list[str] = []
    for segment in segments:
        if segment == ".":
            continue
        if segment != "..":
            resolved.append(segment)
            continue
        if resolved and resolved[-1] != "..":
            # A regular component precedes this `..`: the two cancel each other out
            resolved.pop()
        elif not path.root:
            # Nothing left to cancel in a non-rooted path: the path climbs above its start,
            # so the `..` has to be preserved to keep pointing at the same location
            resolved.append("..")
        # Otherwise the `..` sits directly under the root and is a no-op

    return Path(anchor, *resolved) if anchor else Path(*resolved)
Parameters: - repo_list (list[str]): list of relative paths to ASTRON GitLab repositories - folder_name (str): the name of the folder to clone the repos into + repo_list (list[str]): list of relative paths to ASTRON GitLab repositories + folder_name (str): the name of the folder to clone the repos into """ gl = gitlab.Gitlab('https://git.astron.nl') projects = gl.projects.list(iterator=True, get_all=True) diff --git a/neo4j_queries/edge_queries.py b/neo4j_queries/edge_queries.py index c3970092e8e98489fa2f51a6ada22bbff145997e..a8bd1d2ce528d66a7ad5ef703c03e44af8f3fec8 100644 --- a/neo4j_queries/edge_queries.py +++ b/neo4j_queries/edge_queries.py @@ -10,19 +10,19 @@ def create_in_param_relationship(driver: Driver, prefixed_component_id: str, par before querying Neo4j. Parameters: - driver (Driver): the Neo4j driver - prefixed_component_id (str): the local relative path of the component - parameter_internal_id (int): the internal Neo4j ID of the in-parameter node + driver (Driver): the Neo4j driver + prefixed_component_id (str): the local relative path of the component + parameter_internal_id (int): the internal Neo4j ID of the in-parameter node Returns: - tuple[str,str]: the component ID of the component, the parameter ID of the parameter + tuple[str,str]: the component ID of the component, the parameter ID of the parameter """ component_id = clean_component_id(prefixed_component_id) query = """ MATCH (c:Component {component_id: $component_id}), (p) - WHERE id(p) = $parameter_internal_id + WHERE elementId(p) = $parameter_internal_id MERGE (c)-[:DATA]->(p) - RETURN c.id AS component_id, p.parameter_id AS parameter_id + RETURN c.component_id AS component_id, p.parameter_id AS parameter_id """ with driver.session() as session: result = session.run(query, component_id=component_id, @@ -39,17 +39,17 @@ def create_out_param_relationship(driver: Driver, prefixed_component_id: str, pa before querying Neo4j. 
Parameters: - driver (Driver): the Neo4j driver - prefixed_component_id (str): the local relative path of the component - parameter_internal_id (int): the internal Neo4j ID of the out-parameter node + driver (Driver): the Neo4j driver + prefixed_component_id (str): the local relative path of the component + parameter_internal_id (int): the internal Neo4j ID of the out-parameter node Returns: - tuple[str,str]: the component ID of the component, the parameter ID of the parameter + tuple[str,str]: the component ID of the component, the parameter ID of the parameter """ component_id = clean_component_id(prefixed_component_id) query = """ MATCH (c:Component {component_id: $component_id}), (p) - WHERE id(p) = $parameter_internal_id + WHERE elementId(p) = $parameter_internal_id MERGE (c)<-[:DATA]-(p) RETURN c.component_id AS component_id, p.parameter_id AS parameter_id """ @@ -66,21 +66,22 @@ def create_data_relationship(driver: Driver, from_internal_node_id: int, to_inte to the node with internal ID to_internal_node_id. 
Parameters: - driver (Driver): the Neo4j driver - from_internal_node_id (int): the internal Neo4j ID of the first node - to_internal_node_id (int): the internal Neo4j ID of the second node + driver (Driver): the Neo4j driver + from_internal_node_id (int): the internal Neo4j ID of the first node + to_internal_node_id (int): the internal Neo4j ID of the second node Returns: - tuple[int,int]: from_internal_node_id, to_internal_node_id + tuple[int,int]: from_internal_node_id, to_internal_node_id """ query = """ MATCH (a), (b) - WHERE id(a) = $from_internal_node_id AND id(b) = $to_internal_node_id + WHERE elementId(a) = $from_internal_node_id AND elementId(b) = $to_internal_node_id MERGE (a)-[:DATA]->(b) - RETURN a.id AS id_1, b.id AS id_2 + RETURN elementId(a) AS id_1, elementId(b) AS id_2 """ with driver.session() as session: result = session.run(query, from_internal_node_id=from_internal_node_id, to_internal_node_id=to_internal_node_id) record = result.single() + return record["id_1"], record["id_2"] return record["id_1"], record["id_2"] \ No newline at end of file diff --git a/neo4j_queries/node_queries.py b/neo4j_queries/node_queries.py index b2dbb9f9105885489336f7fb4d1221cc19929e57..21d1f766c6f950c769140c20cb3e92b7c9d780ba 100644 --- a/neo4j_queries/node_queries.py +++ b/neo4j_queries/node_queries.py @@ -9,16 +9,16 @@ def ensure_component_node(driver: Driver, prefixed_component_id: str) -> tuple[i before querying Neo4j. 
Parameters: - driver (Driver): the Neo4j driver - prefixed_component_id (str): the local relative path of the component + driver (Driver): the Neo4j driver + prefixed_component_id (str): the local relative path of the component Returns: - tuple[int,str]: the Neoj4 internal ID of the component node, the component ID of the component + tuple[int,str]: the Neoj4 internal ID of the component node, the component ID of the component """ component_id = clean_component_id(prefixed_component_id) query = """ MERGE (c:Component {component_id: $component_id}) - RETURN id(c) AS node_internal_id, c.component_id AS component_id + RETURN elementId(c) AS node_internal_id, c.component_id AS component_id """ with driver.session() as session: result = session.run(query, component_id=component_id) @@ -34,18 +34,18 @@ def ensure_parameter_node(driver: Driver, node_id: str, prefixed_component_id: s before querying Neo4j. Parameters: - driver (Driver): the Neo4j driver - node_id (str): the ID of the parameter - prefixed_component_id (str): the local relative path of the component - param_type (str): the type of the parameter ('in' or 'out') + driver (Driver): the Neo4j driver + node_id (str): the ID of the parameter + prefixed_component_id (str): the local relative path of the component + param_type (str): the type of the parameter ('in' or 'out') Returns: - tuple[int,str,str, str]: the Neoj4 internal ID of the parameter node, the parameter ID, the component ID, the parameter type + tuple[int,str,str, str]: the Neoj4 internal ID of the parameter node, the parameter ID, the component ID, the parameter type """ component_id = clean_component_id(prefixed_component_id) query = """ MERGE (n:Parameter {parameter_id: $node_id, component_id: $component_id, parameter_type: $param_type}) - RETURN id(n) AS node_internal_id, n.parameter_id AS id_property, n.component_id AS component_id_property, + RETURN elementId(n) AS node_internal_id, n.parameter_id AS id_property, n.component_id AS 
component_id_property, n.parameter_type AS parameter_type_property """ with driver.session() as session: @@ -61,19 +61,19 @@ def ensure_data_node(driver: Driver, node_id: str, prefixed_component_id: str) - before querying Neo4j. Parameters: - driver (Driver): the Neo4j driver - node_id (str): the ID of the data - prefixed_component_id (str): the local relative path of the component + driver (Driver): the Neo4j driver + node_id (str): the ID of the data + prefixed_component_id (str): the local relative path of the component Returns: - tuple[int,str,str, str]: the Neoj4 internal ID of the data node, the data ID, the component ID + tuple[int,str,str, str]: the Neoj4 internal ID of the data node, the data ID, the component ID """ component_id = clean_component_id(prefixed_component_id) query = """ MERGE (n:Data {data_id: $node_id, component_id: $component_id}) - RETURN id(n) AS node_internal_id, n.data_id AS id_property, n.component_id AS component_id_property + RETURN elementId(n) AS node_internal_id, n.data_id AS id_property, n.component_id AS component_id_property """ with driver.session() as session: result = session.run(query, node_id=node_id, component_id=component_id) record = result.single() - return record["node_internal_id"], record["id_property"], record["component_id_property"] \ No newline at end of file + return record["node_internal_id"], record["id_property"], record["component_id_property"]