diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..4c364784856c6b77c19a3e07da43112fb6871ca6 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +.venv/ +repos/ +Neo4j-25ebc0db-Created-2024-11-17.txt \ No newline at end of file diff --git a/graph_creation/__init__.py b/graph_creation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/graph_creation/__pycache__/__init__.cpython-312.pyc b/graph_creation/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dbad6cf95681231d978322d11e618805beb3bb82 Binary files /dev/null and b/graph_creation/__pycache__/__init__.cpython-312.pyc differ diff --git a/graph_creation/__pycache__/cwl_parsing.cpython-312.pyc b/graph_creation/__pycache__/cwl_parsing.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..58d71c29805070e2d59eda693a27cace85366622 Binary files /dev/null and b/graph_creation/__pycache__/cwl_parsing.cpython-312.pyc differ diff --git a/graph_creation/__pycache__/cwl_processing.cpython-312.pyc b/graph_creation/__pycache__/cwl_processing.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..232a17bed36eb80bed003a243e6982549795994b Binary files /dev/null and b/graph_creation/__pycache__/cwl_processing.cpython-312.pyc differ diff --git a/graph_creation/__pycache__/repo_processing.cpython-312.pyc b/graph_creation/__pycache__/repo_processing.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9c48a27062eeca18a32843f0a54b79e7c414a85e Binary files /dev/null and b/graph_creation/__pycache__/repo_processing.cpython-312.pyc differ diff --git a/graph_creation/__pycache__/utils.cpython-312.pyc b/graph_creation/__pycache__/utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..20e22c7ac192b92d0f4fd508e6ec00afe23d91f3 
Binary files /dev/null and b/graph_creation/__pycache__/utils.cpython-312.pyc differ
diff --git a/graph_creation/cwl_parsing.py b/graph_creation/cwl_parsing.py
new file mode 100644
index 0000000000000000000000000000000000000000..9417087be0506af8e2454579a5845213822c0728
--- /dev/null
+++ b/graph_creation/cwl_parsing.py
@@ -0,0 +1,25 @@
+from pathlib import Path
+from cwl_utils.parser import save
+from cwl_utils.parser.cwl_v1_2_utils import load_inputfile
+
+def get_cwl_from_repo(repo_path: str) -> list[dict]:
+    """Load every ``*.cwl`` file found below ``repo_path``.
+
+    Each file is loaded with ``load_inputfile`` and serialised to a plain
+    dictionary with ``save(..., relative_uris=True)``; the location of the
+    file is stored under the additional ``'path'`` key.
+
+    Returns a list with one dictionary per CWL file.
+    """
+    cwl_entities = []
+    # Sorted so the processing order does not depend on the file system.
+    for path in sorted(Path(repo_path).glob('**/*.cwl')):
+        path_in_str = str(path)
+        # NOTE(review): load_inputfile is named after CWL input objects;
+        # confirm it is the intended loader for workflow/tool documents.
+        cwl_obj = load_inputfile(path_in_str)
+        saved_obj = save(cwl_obj, relative_uris=True)
+        saved_obj['path'] = path_in_str
+        cwl_entities.append(saved_obj)
+
+    return cwl_entities
\ No newline at end of file
diff --git a/graph_creation/cwl_processing.py b/graph_creation/cwl_processing.py
new file mode 100644
index 0000000000000000000000000000000000000000..847d7dffdd2987f1841aeb763074d3ee910bfcce
--- /dev/null
+++ b/graph_creation/cwl_processing.py
@@ -0,0 +1,96 @@
+from graph_creation.utils import create_input_nodes_and_relationships, process_source_relationship
+from neo4j_queries.node_queries import ensure_component_node, ensure_data_node, ensure_parameter_node
+from neo4j_queries.edge_queries import create_data_relationship, create_out_param_relationship
+from pathlib import Path
+
+def _as_source_list(source):
+    """Normalise a CWL source field (missing, one id, or a list of ids) to a list."""
+    if isinstance(source, str):
+        return [source]
+    if isinstance(source, list):
+        return source
+    return []
+
+def process_cwl_inputs(driver, cwl_entity: dict):
+    """Create the nodes and edges for every input of a CWL component.
+
+    ``cwl_entity['inputs']`` may be a list of dictionaries carrying an ``'id'``
+    or a dictionary keyed by input id. List entries that are not dictionaries
+    are skipped.
+    """
+    component_id = cwl_entity['path']
+    inputs = cwl_entity['inputs']
+    if isinstance(inputs, list):
+        for cwl_input in inputs:
+            if isinstance(cwl_input, dict):
+                create_input_nodes_and_relationships(driver, cwl_input['id'], component_id)
+    elif isinstance(inputs, dict):
+        for input_id in inputs:
+            create_input_nodes_and_relationships(driver, input_id, component_id)
+
+def process_cwl_outputs(driver, cwl_entity: dict):
+    """Create the out-parameter node of every output of a CWL component.
+
+    Outputs that are not dictionaries are skipped. Every id listed in an
+    output's ``outputSource`` is linked to the out-parameter node through
+    ``process_source_relationship``.
+    """
+    component_id = cwl_entity['path']
+    for output in cwl_entity['outputs']:
+        if not isinstance(output, dict):
+            continue
+        # Create out-parameter node o_node with id = o.id and component_id = c_node.id
+        param_node = ensure_parameter_node(driver, output['id'], component_id, 'out')
+        param_node_internal_id = param_node[0]
+        # Create a directed data edge from o_node to c_node
+        create_out_param_relationship(driver, component_id, param_node_internal_id)
+        for source_id in _as_source_list(output.get('outputSource')):
+            process_source_relationship(driver, source_id, component_id, param_node_internal_id)
+
+def process_cwl_steps(driver, cwl_entity: dict, repo: str):
+    """Create the nodes and edges for every step of a CWL workflow.
+
+    Each step becomes a component whose id is ``repo`` joined with the step's
+    ``run`` path. The step's ``in`` and ``out`` entries become parameter nodes
+    of that component; every step output also gets a data node with id
+    ``step_id/output_id`` that belongs to the workflow.
+    """
+    workflow_id = cwl_entity['path']
+    for step in cwl_entity['steps']:
+        run = step['run']
+        if not isinstance(run, str):
+            # ``run`` may hold an embedded process instead of a file reference;
+            # there is no path to build a component id from, so skip the step.
+            continue
+        # NOTE(review): ``run`` is resolved against the repository root, not the
+        # directory of the workflow file - confirm this matches the saved URIs.
+        step_path = str(Path(repo) / run)
+        # if a component node with the same path (run) as s does not exist then
+        # Create component node s_node unique to s with id equal to run
+        s_node = ensure_component_node(driver, step_path)
+        s_node_internal_id = s_node[0]
+        for step_input in step['in']:
+            # Create in-parameter node i_node with id = i.id and component_id = s.run
+            param_node = ensure_parameter_node(driver, step_input['id'], step_path, 'in')
+            param_node_internal_id = param_node[0]
+            # Create a data edge from s_node to i_node
+            create_data_relationship(driver, s_node_internal_id, param_node_internal_id)
+            for source_id in _as_source_list(step_input.get('source')):
+                process_source_relationship(driver, source_id, workflow_id, param_node_internal_id)
+
+        for step_output in step['out']:
+            o_id = step_output['id'] if isinstance(step_output, dict) else step_output
+            # Create out-parameter node o_node with id = o.id and component_id = s.run
+            param_node = ensure_parameter_node(driver, o_id, step_path, 'out')
+            param_node_internal_id = param_node[0]
+            # Create a data edge from o_node to s_node
+            create_data_relationship(driver, param_node_internal_id, s_node_internal_id)
+            # Workflow-level outputs of a step have an id of the form
+            # "step_id/output_id" (output id as defined in the workflow) and a
+            # component_id property equal to the id of the workflow.
+            # Create data node o_data_node with id = step_id/output_id and component_id = c_node.id
+            output_id = f"{step['id']}/{o_id}"
+            data_node = ensure_data_node(driver, output_id, workflow_id)
+            data_node_internal_id = data_node[0]
+            # Create a data edge from o_node to o_data_node
+            create_data_relationship(driver, param_node_internal_id, data_node_internal_id)
\ No newline at end of file
diff --git a/graph_creation/repo_processing.py b/graph_creation/repo_processing.py
new file mode 100644
index 0000000000000000000000000000000000000000..c22cdfbe5225d49a0ad65ef92338d6867fd2ba57
--- /dev/null
+++ b/graph_creation/repo_processing.py
@@ -0,0 +1,20 @@
+from graph_creation.cwl_parsing import get_cwl_from_repo
+from graph_creation.cwl_processing import process_cwl_inputs, process_cwl_outputs, process_cwl_steps
+from neo4j_queries.node_queries import ensure_component_node
+
+def process_repos(repo_list: list, driver):
+    """Build the graph for every CWL file found in the given repositories.
+
+    For each repository path in ``repo_list`` every CWL entity gets a
+    component node plus the nodes and edges of its inputs and outputs;
+    entities of class ``Workflow`` additionally have their steps processed.
+    """
+    for repo in repo_list:
+        for entity in get_cwl_from_repo(repo):
+            # if a component node with the same path as c does not exist then
+            # create component node c_node unique to c with id equal to path
+            ensure_component_node(driver, entity['path'])
+            process_cwl_inputs(driver, entity)
+            process_cwl_outputs(driver, entity)
+            if entity['class'] == 'Workflow':
+                process_cwl_steps(driver, entity, repo)
diff --git a/graph_creation/utils.py b/graph_creation/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..bf44831f498e6df51e2d72ebf8e0d5f0569c03cb
--- /dev/null
+++
b/graph_creation/utils.py
@@ -0,0 +1,28 @@
+from neo4j_queries.node_queries import ensure_data_node, ensure_parameter_node
+from neo4j_queries.edge_queries import create_data_relationship, create_in_param_relationship
+
+def create_input_nodes_and_relationships(driver, input_id, component_id):
+    """Create the in-parameter node and data node of one component input.
+
+    The component is linked to the parameter node and the data node is linked
+    to the parameter node.
+    """
+    # Create in-parameter node i_node with id = i.id and component_id = c_node.id
+    param_node = ensure_parameter_node(driver, input_id, component_id, 'in')
+    param_node_internal_id = param_node[0]
+    # Create a directed data edge from c_node to i_node
+    create_in_param_relationship(driver, component_id, param_node_internal_id)
+    # Create a data node i_data_node with id = i.id and component_id = c_node.id
+    data_node = ensure_data_node(driver, input_id, component_id)
+    data_node_internal_id = data_node[0]
+    # Create a data edge from i_data_node to i_node
+    create_data_relationship(driver, data_node_internal_id, param_node_internal_id)
+
+def process_source_relationship(driver, source_id, component_id, param_node_internal_id):
+    """Ensure the data node of ``source_id`` exists and link the parameter node to it."""
+    data_node = ensure_data_node(driver, source_id, component_id)
+    data_node_internal_id = data_node[0]
+    # NOTE(review): the edge runs from the parameter node to the source data
+    # node, whereas component inputs are linked data node -> parameter node;
+    # confirm this direction is intended.
+    create_data_relationship(driver, param_node_internal_id, data_node_internal_id)
\ No newline at end of file
diff --git a/main.py b/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a9e5327707c434f5206c26cdb881097b91ff98f
--- /dev/null
+++ b/main.py
@@ -0,0 +1,46 @@
+from graph_creation.repo_processing import process_repos
+from neo4j import GraphDatabase
+import dotenv
+import os
+import gitlab
+import subprocess
+
+def clone_repos(repo_list: list, folder_name: str):
+    """Clone the listed projects from https://git.astron.nl into ``folder_name``.
+
+    ``repo_list`` holds ``path_with_namespace`` values; each matching project
+    is cloned over SSH into a directory of that name below ``folder_name``.
+    """
+    gl = gitlab.Gitlab('https://git.astron.nl')
+    projects = gl.projects.list(iterator=True, get_all=True)
+    for project in projects:
+        repo_name = project.attributes['path_with_namespace']
+        if repo_name in repo_list:
+            git_url = project.ssh_url_to_repo
+            return_code = subprocess.call(['git', 'clone', git_url, f'./{folder_name}/{repo_name}'])
+            if return_code != 0:
+                # Report the failure instead of silently continuing without the repository.
+                print(f'git clone of {repo_name} exited with code {return_code}.')
+
+if __name__ == '__main__':
+    relevant_repos = ['ldv/imaging_compress_pipeline']
+    folder = 'repos'
+    # clone_repos requires the target folder as its second argument.
+    clone_repos(relevant_repos, folder)
+
+    load_status = dotenv.load_dotenv("Neo4j-25ebc0db-Created-2024-11-17.txt")
+    if load_status is False:
+        raise RuntimeError('Environment variables not loaded.')
+
+    URI = os.getenv("NEO4J_URI")
+    AUTH = (os.getenv("NEO4J_USERNAME"), os.getenv("NEO4J_PASSWORD"))
+
+    repo_paths = [f'{folder}/{path}' for path in relevant_repos]
+    print(repo_paths)
+    # The context manager closes the driver, so one driver serves both the
+    # connectivity check and the graph creation.
+    with GraphDatabase.driver(URI, auth=AUTH) as driver:
+        driver.verify_connectivity()
+        print("Connection established.")
+        process_repos(repo_paths, driver)
+
diff --git a/neo4j_queries/__init__.py b/neo4j_queries/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/neo4j_queries/__pycache__/__init__.cpython-312.pyc b/neo4j_queries/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a7189352bf21b6440c0bba245ba0830a5ca09c06
Binary files /dev/null and b/neo4j_queries/__pycache__/__init__.cpython-312.pyc differ
diff --git a/neo4j_queries/__pycache__/edge_queries.cpython-312.pyc b/neo4j_queries/__pycache__/edge_queries.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..41101cd7aaf942db65346904cc0998e1b13494fa
Binary files /dev/null and b/neo4j_queries/__pycache__/edge_queries.cpython-312.pyc differ
diff --git a/neo4j_queries/__pycache__/node_queries.cpython-312.pyc b/neo4j_queries/__pycache__/node_queries.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..26b0b655e6fc677c6b4e3f5e389f1a50654b52d3
Binary files /dev/null and b/neo4j_queries/__pycache__/node_queries.cpython-312.pyc differ
diff --git a/neo4j_queries/__pycache__/utils.cpython-312.pyc b/neo4j_queries/__pycache__/utils.cpython-312.pyc
new file mode 100644
index
0000000000000000000000000000000000000000..1071760f3536644afec6f0c65efebc415c6af69d
Binary files /dev/null and b/neo4j_queries/__pycache__/utils.cpython-312.pyc differ
diff --git a/neo4j_queries/edge_queries.py b/neo4j_queries/edge_queries.py
new file mode 100644
index 0000000000000000000000000000000000000000..f0ee233c7d510eb9d79c34f45e477ead6b06254f
--- /dev/null
+++ b/neo4j_queries/edge_queries.py
@@ -0,0 +1,61 @@
+from neo4j_queries.utils import clean_component_id
+
+def create_in_param_relationship(driver, prefixed_component_id, parameter_internal_id):
+    """Merge a DATA edge from a component to one of its parameter nodes.
+
+    The component is matched on its cleaned ``component_id`` and the parameter
+    on its internal Neo4j id. Returns ``(component_id, parameter_id)``.
+    """
+    component_id = clean_component_id(prefixed_component_id)
+    # Component nodes carry ``component_id`` (there is no ``id`` property), so
+    # that is what is returned, as in create_out_param_relationship.
+    query = """
+    MATCH (c:Component {component_id: $component_id}), (p)
+    WHERE id(p) = $parameter_internal_id
+    MERGE (c)-[:DATA]->(p)
+    RETURN c.component_id AS component_id, p.parameter_id AS parameter_id
+    """
+    with driver.session() as session:
+        result = session.run(query, component_id=component_id,
+                             parameter_internal_id=parameter_internal_id)
+        record = result.single()
+        return record["component_id"], record["parameter_id"]
+
+def create_out_param_relationship(driver, prefixed_component_id, parameter_internal_id):
+    """Merge a DATA edge from a parameter node to its component.
+
+    The component is matched on its cleaned ``component_id`` and the parameter
+    on its internal Neo4j id. Returns ``(component_id, parameter_id)``.
+    """
+    component_id = clean_component_id(prefixed_component_id)
+    query = """
+    MATCH (c:Component {component_id: $component_id}), (p)
+    WHERE id(p) = $parameter_internal_id
+    MERGE (c)<-[:DATA]-(p)
+    RETURN c.component_id AS component_id, p.parameter_id AS parameter_id
+    """
+    with driver.session() as session:
+        result = session.run(query, component_id=component_id,
+                             parameter_internal_id=parameter_internal_id)
+        record = result.single()
+        return record["component_id"], record["parameter_id"]
+
+def create_data_relationship(driver, from_internal_node_id, to_internal_node_id):
+    """Merge a DATA edge between two nodes identified by their internal Neo4j ids.
+
+    Returns the identifying property of both end points.
+    """
+    # None of the nodes created by node_queries has an ``id`` property, so the
+    # label-specific identifier of each end point is returned instead.
+    query = """
+    MATCH (a), (b)
+    WHERE id(a) = $from_internal_node_id AND id(b) = $to_internal_node_id
+    MERGE (a)-[:DATA]->(b)
+    RETURN coalesce(a.data_id, a.parameter_id, a.component_id) AS id_1,
+           coalesce(b.data_id, b.parameter_id, b.component_id) AS id_2
+    """
+    with driver.session() as session:
+        result = session.run(query, from_internal_node_id=from_internal_node_id,
+                             to_internal_node_id=to_internal_node_id)
+        record = result.single()
+        return record["id_1"], record["id_2"]
\ No newline at end of file
diff --git a/neo4j_queries/node_queries.py b/neo4j_queries/node_queries.py
new file mode 100644
index 0000000000000000000000000000000000000000..b78b58bbfec744769c1130ae97da9ac873146192
--- /dev/null
+++ b/neo4j_queries/node_queries.py
@@ -0,0 +1,56 @@
+from neo4j_queries.utils import clean_component_id
+
+def ensure_component_node(driver, prefixed_component_id):
+    """Merge the Component node with the cleaned component id.
+
+    Returns ``(internal node id, component_id)``.
+    """
+    component_id = clean_component_id(prefixed_component_id)
+    # Component nodes are keyed by ``component_id``; returning ``c.id`` would
+    # always yield null because that property is never set.
+    query = """
+    MERGE (c:Component {component_id: $component_id})
+    RETURN id(c) AS node_internal_id, c.component_id AS id_property
+    """
+    with driver.session() as session:
+        result = session.run(query, component_id=component_id)
+        record = result.single()
+        return record["node_internal_id"], record["id_property"]
+
+def ensure_parameter_node(driver, node_id, prefixed_component_id, param_type):
+    """Merge the Parameter node identified by parameter id and component id.
+
+    ``param_type`` is stored as ``parameter_type`` only when the node is
+    created. Returns ``(internal node id, parameter_id, component_id,
+    parameter_type)``.
+    """
+    component_id = clean_component_id(prefixed_component_id)
+    # component_id is part of the MERGE pattern, so only parameter_type needs
+    # to be set on creation.
+    query = """
+    MERGE (n:Parameter {parameter_id: $node_id, component_id: $component_id})
+    ON CREATE SET
+        n.parameter_type = $param_type
+    RETURN id(n) AS node_internal_id, n.parameter_id AS id_property, n.component_id AS component_id_property,
+        n.parameter_type AS parameter_type_property
+    """
+    with driver.session() as session:
+        result = session.run(query, node_id=node_id, component_id=component_id, param_type=param_type)
+        record = result.single()
+        return record["node_internal_id"], record["id_property"], record["component_id_property"], record['parameter_type_property']
+
+def ensure_data_node(driver, node_id, prefixed_component_id):
+    """Merge the Data node identified by data id and component id.
+
+    Returns ``(internal node id, data_id, component_id)``.
+    """
+    component_id = clean_component_id(prefixed_component_id)
+    # Both properties are part of the MERGE pattern, so no ON CREATE clause is needed.
+    query = """
+    MERGE (n:Data {data_id: $node_id, component_id: $component_id})
+    RETURN id(n) AS node_internal_id, n.data_id AS id_property, n.component_id AS component_id_property
+    """
+    with driver.session() as session:
+        result = session.run(query, node_id=node_id, component_id=component_id)
+        record = result.single()
+        return record["node_internal_id"], record["id_property"], record["component_id_property"]
\ No newline at end of file
diff --git a/neo4j_queries/utils.py b/neo4j_queries/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..c00f1b5d8deae743479921dc870b3c698bafa47c
--- /dev/null
+++ b/neo4j_queries/utils.py
@@ -0,0 +1,10 @@
+def clean_component_id(prefixed_component_id: str) -> str:
+    """Strip the leading ``repos`` folder from a component path.
+
+    Both the Windows (``repos\\``) and the POSIX (``repos/``) separator are
+    recognised; ids without the prefix are returned unchanged.
+    """
+    for prefix in ("repos\\", "repos/"):
+        if prefixed_component_id.startswith(prefix):
+            return prefixed_component_id[len(prefix):]
+    return prefixed_component_id
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..31918a854da83fe2026e7db14600ad21ee77a0c0
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,5 @@
+python-gitlab
+python-dotenv
+neo4j
+ruamel.yaml
+cwl-utils
\ No newline at end of file