Skip to content
Snippets Groups Projects
Commit e7fea0fc authored by Chiara Liotta's avatar Chiara Liotta
Browse files

process dockerfile

parent 694fecea
No related branches found
No related tags found
No related merge requests found
from antlr4 import ParserRuleContext from antlr4 import ParserRuleContext
from neo4j import Driver from neo4j import Driver
from neo4j_queries.edge_queries import create_has_child_relationship
from neo4j_queries.node_queries import create_ast_node from neo4j_queries.node_queries import create_ast_node
def traverse_and_create(driver: Driver, component_id: str, tree, parent_node_id=None): def traverse_and_create(driver: Driver, component_id: str, tree, parent_node_id=None):
...@@ -12,8 +11,8 @@ def traverse_and_create(driver: Driver, component_id: str, tree, parent_node_id= ...@@ -12,8 +11,8 @@ def traverse_and_create(driver: Driver, component_id: str, tree, parent_node_id=
current_node_id = create_ast_node(driver, component_id, rule_name, text) current_node_id = create_ast_node(driver, component_id, rule_name, text)
# If there's a parent, create a relationship # If there's a parent, create a relationship
if parent_node_id is not None: # if parent_node_id is not None:
create_has_child_relationship(driver, parent_node_id, current_node_id) # create_has_child_relationship(driver, parent_node_id, current_node_id)
# Recursively process all children # Recursively process all children
for i in range(tree.getChildCount()): for i in range(tree.getChildCount()):
......
...@@ -185,13 +185,74 @@ def process_cwl_steps(driver: Driver, cwl_entity: dict, tool_paths: list[str], s ...@@ -185,13 +185,74 @@ def process_cwl_steps(driver: Driver, cwl_entity: dict, tool_paths: list[str], s
create_data_relationship_with_id(driver, param_node_internal_id, s_node_internal_id, step_path) create_data_relationship_with_id(driver, param_node_internal_id, s_node_internal_id, step_path)
def process_cwl_base_commands(driver, entity, links):
    """
    Links a CWL component to the Git repository that provides its base command.

    The first element of the entity's "baseCommand" decides how the lookup is done:
    - if it has a file extension, it is searched for (as a substring) in the keys of
      links["paths"] that lie under /usr/local/bin; the first hit wins;
    - otherwise it is looked up as a key of links["commands"].
    When a repository is found, a Git node is ensured and a REFERENCES relationship
    carrying the whole space-joined command is created from the component to it.

    Parameters:
        driver: the Neo4j driver
        entity (dict): the CWL entity; "baseCommand" is optional, "path" is read
            only when a reference is actually created
        links (dict): must contain the keys "commands" and "paths", each mapping a
            name/path to a repository URL

    Returns:
        list | None: the base command as a list (a single-string baseCommand is
        returned as a one-element list), or None if the entity has no usable
        "baseCommand"
    """
    link_commands = links["commands"]
    link_paths = links["paths"]

    commands = entity.get("baseCommand")
    # CWL allows baseCommand to be a single string as well as a list of strings
    if isinstance(commands, str):
        commands = [commands]
    if not isinstance(commands, list):
        return None
    if not commands:
        return commands

    first_command = commands[0]
    all_commands = " ".join(commands)

    if Path(first_command).suffix:
        # The command looks like a script file: find it among the installed executables
        for linked_path, repo_url in link_paths.items():
            if is_executable(linked_path) and first_command in linked_path:
                git_internal_node_id = ensure_git_node(driver, repo_url)[0]
                create_references_relationship(driver, entity["path"], git_internal_node_id, all_commands)
                break
    elif first_command in link_commands:
        # The command is a bare name: it may be an alias registered for a repository
        git_internal_node_id = ensure_git_node(driver, link_commands[first_command])[0]
        create_references_relationship(driver, entity["path"], git_internal_node_id, all_commands)

    return commands
def is_executable(path):
    """
    Tells whether the given path lies under /usr/local/bin.

    Both "/" and "\\" are accepted as separators, so the answer does not depend
    on the operating system on which the path string was produced.

    Parameters:
        path (str | Path): the path to check

    Returns:
        bool: True if the leading components of the path are /usr/local/bin
    """
    from pathlib import PurePosixPath

    prefix_parts = PurePosixPath("/usr/local/bin").parts
    # Normalise separators first: a backslash is not a separator for POSIX paths,
    # so without this the whole string would count as one single component
    candidate_parts = PurePosixPath(str(path).replace("\\", "/")).parts
    return candidate_parts[:len(prefix_parts)] == prefix_parts
def get_executable(path):
    """
    Strips the /usr/local/bin prefix from the given path.

    Both "/" and "\\" are accepted as separators, so the result does not depend
    on the operating system on which the path string was produced.

    Parameters:
        path (str | Path): the path to shorten

    Returns:
        Path | None: the part of the path below /usr/local/bin, or None if the
        path does not lie under /usr/local/bin
    """
    from pathlib import PurePosixPath

    prefix_parts = PurePosixPath("/usr/local/bin").parts
    # Normalise separators first: a backslash is not a separator for POSIX paths
    candidate_parts = PurePosixPath(str(path).replace("\\", "/")).parts
    if candidate_parts[:len(prefix_parts)] != prefix_parts:
        return None
    return Path(*candidate_parts[len(prefix_parts):])
def process_cwl_commandline(driver, entity, links):
    """
    Links a CWL CommandLineTool to the Git repositories it references.

    The base command is processed first (see process_cwl_base_commands). Then, for
    every base command element that names a file staged through
    InitialWorkDirRequirement, the content of that file is searched for every known
    link (links["commands"] merged with links["paths"]): once by its POSIX path and,
    for links under /usr/local/bin, once more by the path relative to that folder.
    Each hit ensures a Git node and creates a REFERENCES relationship to it.

    Parameters:
        driver: the Neo4j driver
        entity (dict): the CWL entity; "requirements" may be a dict keyed by class
            name or a list of dicts carrying a 'class' key
        links (dict): must contain the keys "commands" and "paths", each mapping a
            name/path to a repository URL

    Returns:
        None
    """
    commands = process_cwl_base_commands(driver, entity, links)
    listing = _find_initial_workdir_listing(entity)
    if not (commands and listing):
        return

    # Only listing entries that carry both a file name and a content are usable
    entry_map = {
        entry["entryname"]: entry["entry"]
        for entry in listing
        if isinstance(entry, dict) and "entryname" in entry and "entry" in entry
    }
    all_links = links["commands"] | links["paths"]

    for command in commands:
        script = entry_map.get(command)
        # The content must be text to be searched
        if not isinstance(script, str):
            continue
        print(script)
        for key, value in all_links.items():
            path = str(Path(key).as_posix())
            # NOTE(review): when path starts with "/", the leading \b only matches
            # right after a word character, so a path preceded by whitespace is not
            # found by this search - confirm whether that is intended
            if re.search(rf'\b{re.escape(path)}\b', script):
                print(f"created: {value}")
                git_internal_node_id = ensure_git_node(driver, value)[0]
                create_references_relationship(driver, entity["path"], git_internal_node_id, script)
            if is_executable(key):
                executable = str(get_executable(key))
                if re.search(rf'\b{re.escape(executable)}\b', script):
                    print(f"created: {value}")
                    git_internal_node_id = ensure_git_node(driver, value)[0]
                    create_references_relationship(driver, entity["path"], git_internal_node_id, script)
        print()


def _find_initial_workdir_listing(entity):
    """Returns the "listing" of the entity's InitialWorkDirRequirement, or None if there is none."""
    requirements = entity.get("requirements")
    if isinstance(requirements, dict):
        # Map form: requirements are keyed by their class name
        requirement = requirements.get("InitialWorkDirRequirement")
    elif isinstance(requirements, list):
        # List form: every requirement is a dict that names its class
        requirement = next(
            (item for item in requirements
             if isinstance(item, dict) and item.get('class') == 'InitialWorkDirRequirement'),
            None,
        )
    else:
        requirement = None
    if not isinstance(requirement, dict):
        return None
    return requirement.get("listing")
def process_cwl_expression(driver: Driver, entity: dict) -> None:
    """
    Parses the value stored under entity['expression'] with parse_javascript_string.

    The parsed tree is not used at present: the call that would traverse it is
    commented out.

    Parameters:
        driver (Driver): the Neo4j driver
        entity (dict): the CWL entity; must contain the key 'expression'

    Returns:
        None
    """
    expr_tree = parse_javascript_string(entity['expression'])
    # traverse_and_create(driver, entity['path'], expr_tree)
def process_cwl_commandline(driver, entity):
    # Placeholder: does nothing with its arguments and returns None.
    # NOTE(review): a three-parameter definition with the same name appears earlier
    # in this text; this looks like the superseded version - confirm which one applies.
    return
\ No newline at end of file
...@@ -12,25 +12,29 @@ def parse_all_dockerfiles(repo_path): ...@@ -12,25 +12,29 @@ def parse_all_dockerfiles(repo_path):
all_commands.extend(commands) all_commands.extend(commands)
mentioned_repos = handle_git_clone_commands(all_commands) mentioned_repos = handle_git_clone_commands(all_commands)
moves = handle_ln_commands(all_commands) moves = handle_ln_commands(all_commands)
links = {} links = {"paths": {}, "commands": {}}
link_paths = links["paths"]
link_commands = links["commands"]
for repo_url in mentioned_repos["no-checkout"]: for repo_url in mentioned_repos["no-checkout"]:
repo_name = get_repo_name(repo_url) repo_name = get_repo_name(repo_url)
links[repo_name] = repo_url link_commands[repo_name] = repo_url
# print(f"Repository URL with --no-checkout: {repo_url}. Alias: {repo_name}") # print(f"Repository URL with --no-checkout: {repo_url}. Alias: {repo_name}")
for (repo_url, repo_folder) in mentioned_repos["checkout"]: for (repo_url, repo_folder) in mentioned_repos["checkout"]:
paths = list_repo_files_from_api(repo_url) paths = list_repo_files_from_api(repo_url)
for path in paths: for path in paths:
full_path = Path(repo_folder) / Path(path) full_path = Path(repo_folder) / Path(path)
links[str(full_path)] = repo_url link_paths[str(full_path)] = repo_url
for move in moves: for move in moves:
from_position = move[0] from_position = move[0]
to_position = move[1] to_position = move[1]
if from_position in links: if from_position in link_paths:
originating_repo = links[from_position] originating_repo = link_paths[from_position]
links[to_position] = originating_repo link_paths[to_position] = originating_repo
else: else:
print(f"{from_position} is not a recognized path.") print(f"{from_position} is not a recognized path.")
return links
def parse_dockerfile_run_commands(dockerfile_path): def parse_dockerfile_run_commands(dockerfile_path):
"""Parse the Dockerfile and return a list of RUN commands.""" """Parse the Dockerfile and return a list of RUN commands."""
with open(dockerfile_path, "r") as f: with open(dockerfile_path, "r") as f:
......
...@@ -3,6 +3,7 @@ from graph_creation.cwl_parsing import get_cwl_from_repo ...@@ -3,6 +3,7 @@ from graph_creation.cwl_parsing import get_cwl_from_repo
from graph_creation.docker_parsing import parse_all_dockerfiles from graph_creation.docker_parsing import parse_all_dockerfiles
from graph_creation.utils import process_step_lookup from graph_creation.utils import process_step_lookup
from graph_creation.cwl_processing import process_cwl_commandline, process_cwl_inputs, process_cwl_outputs, process_cwl_steps from graph_creation.cwl_processing import process_cwl_commandline, process_cwl_inputs, process_cwl_outputs, process_cwl_steps
from neo4j_queries.edge_queries import clean_relationship
from neo4j_queries.node_queries import ensure_component_node from neo4j_queries.node_queries import ensure_component_node
from neo4j_queries.utils import get_is_workflow from neo4j_queries.utils import get_is_workflow
...@@ -24,12 +25,13 @@ def process_repos(repo_list: list[str], driver: Driver) -> None: ...@@ -24,12 +25,13 @@ def process_repos(repo_list: list[str], driver: Driver) -> None:
None None
""" """
for repo in repo_list: for repo in repo_list:
parse_all_dockerfiles(repo) links = parse_all_dockerfiles(repo)
# Parse CWL files of current repo # Parse CWL files of current repo
workflows, tools = get_cwl_from_repo(repo) workflows, tools = get_cwl_from_repo(repo)
# Extract tool paths for step processing later # Extract tool paths for step processing later
tool_paths = [item["path"] for item in tools] tool_paths = [item["path"] for item in tools]
clean_relationship(driver)
# Combine workflows and tools into one list of entities to process # Combine workflows and tools into one list of entities to process
all_entities = workflows + tools all_entities = workflows + tools
...@@ -48,4 +50,4 @@ def process_repos(repo_list: list[str], driver: Driver) -> None: ...@@ -48,4 +50,4 @@ def process_repos(repo_list: list[str], driver: Driver) -> None:
# elif entity['class'] == 'ExpressionTool': # elif entity['class'] == 'ExpressionTool':
# process_cwl_expression(driver, entity) # process_cwl_expression(driver, entity)
elif entity['class'] == 'CommandLineTool': elif entity['class'] == 'CommandLineTool':
process_cwl_commandline(driver, entity) process_cwl_commandline(driver, entity, links)
\ No newline at end of file \ No newline at end of file
...@@ -110,31 +110,32 @@ def create_control_relationship(driver: Driver, from_internal_node_id: int, to_i ...@@ -110,31 +110,32 @@ def create_control_relationship(driver: Driver, from_internal_node_id: int, to_i
record = result.single() record = result.single()
return record["id_1"], record["id_2"] return record["id_1"], record["id_2"]
def create_has_child_relationship(driver: Driver, parent_internal_node_id: int, child_internal_node_id: int) -> tuple[int,int]:
"""
Creates a "has child" relationship in Neo4j between the two nodes with Neo4j internal IDs given as parameters.
This relationship is an outgoing "has child" edge from the parent node to the child node.
Parameters:
driver (Driver): the Neo4j driver
parent_internal_node_id (int): the internal Neo4j ID of the parent node
child_internal_node_id (int): the internal Neo4j ID of the child node
Returns: def create_references_relationship(driver: Driver, prefixed_component_id: int, git_internal_node_id: int, reference: str) -> tuple[int,int]:
tuple[int,int]: parent_internal_node_id, child_internal_node_id component_id = clean_component_id(prefixed_component_id)
"""
query = """ query = """
MATCH (parent), (child) MATCH (component: Component), (git)
WHERE elementId(parent) = $parent_id AND elementId(child) = $child_id WHERE component.component_id = $component_id AND elementId(git) = $git_internal_node_id
MERGE (parent)-[:HAS_CHILD]->(child) MERGE (component)-[:REFERENCES{component_id: $component_id, reference: $reference}]->(git)
RETURN elementId(parent) AS id_1, elementId(child) AS id_2 RETURN elementId(component) AS id_1, elementId(git) AS id_2
""" """
with driver.session() as session: with driver.session() as session:
result = session.run(query, parent_id=parent_internal_node_id, result = session.run(query, component_id=component_id,
child_id=child_internal_node_id) git_internal_node_id=git_internal_node_id, reference=reference)
record = result.single() record = result.single()
return record["id_1"], record["id_2"] return record["id_1"], record["id_2"]
def clean_relationship(driver: Driver) -> None:
    """
    Deletes every REFERENCES relationship in Neo4j whose "reference" property is not set.

    Parameters:
        driver (Driver): the Neo4j driver

    Returns:
        None
    """
    query = """
    MATCH ()-[r:REFERENCES]-()
    WHERE r.reference IS NULL
    DELETE r
    """
    with driver.session() as session:
        session.run(query)
def simplify_data_and_control_edges(driver: Driver): def simplify_data_and_control_edges(driver: Driver):
with driver.session() as session: with driver.session() as session:
create_data_edges_query = """ create_data_edges_query = """
......
...@@ -25,6 +25,17 @@ def ensure_component_node(driver: Driver, prefixed_component_id: str) -> tuple[i ...@@ -25,6 +25,17 @@ def ensure_component_node(driver: Driver, prefixed_component_id: str) -> tuple[i
record = result.single() record = result.single()
return record["node_internal_id"], record["component_id"] return record["node_internal_id"], record["component_id"]
def ensure_git_node(driver: Driver, git_url: str) -> tuple[int,str]:
    """
    Ensures that a Git node with the given URL exists in Neo4j.
    The node is merged on its git_url property, so calling this twice with the
    same URL yields the same node.

    Parameters:
        driver (Driver): the Neo4j driver
        git_url (str): the URL stored in the git_url property of the node

    Returns:
        tuple[int,str]: the Neo4j internal ID of the Git node, the git_url of the node
    """
    merge_git_query = """
    MERGE (c:Git {git_url: $git_url})
    RETURN elementId(c) AS node_internal_id, c.git_url AS git_url
    """
    with driver.session() as session:
        git_record = session.run(merge_git_query, git_url=git_url).single()
        return git_record["node_internal_id"], git_record["git_url"]
def ensure_in_parameter_node(driver: Driver, node_id: str, prefixed_component_id: str, param_type: str = None) \ def ensure_in_parameter_node(driver: Driver, node_id: str, prefixed_component_id: str, param_type: str = None) \
-> tuple[int,str,str,str]: -> tuple[int,str,str,str]:
""" """
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment