From f1adb477e526cde0fff42b998fc7a72acce11f88 Mon Sep 17 00:00:00 2001
From: Chiara Liotta <liotta@astron.nl>
Date: Thu, 21 Nov 2024 12:42:51 +0100
Subject: [PATCH] first phase algorithm dependenciesdata

---
 .gitignore                                    |   3 +
 graph_creation/__init__.py                    |   0
 .../__pycache__/__init__.cpython-312.pyc      | Bin 0 -> 215 bytes
 .../__pycache__/cwl_parsing.cpython-312.pyc   | Bin 0 -> 973 bytes
 .../cwl_processing.cpython-312.pyc            | Bin 0 -> 3423 bytes
 .../repo_processing.cpython-312.pyc           | Bin 0 -> 1040 bytes
 .../__pycache__/utils.cpython-312.pyc         | Bin 0 -> 1164 bytes
 graph_creation/cwl_parsing.py                 |  15 ++++
 graph_creation/cwl_processing.py              |  72 ++++++++++++++++++
 graph_creation/repo_processing.py             |  17 +++++
 graph_creation/utils.py                       |  19 +++++
 main.py                                       |  37 +++++++++
 neo4j_queries/__init__.py                     |   0
 .../__pycache__/__init__.cpython-312.pyc      | Bin 0 -> 206 bytes
 .../__pycache__/edge_queries.cpython-312.pyc  | Bin 0 -> 2261 bytes
 .../__pycache__/node_queries.cpython-312.pyc  | Bin 0 -> 2624 bytes
 .../__pycache__/utils.cpython-312.pyc         | Bin 0 -> 473 bytes
 neo4j_queries/edge_queries.py                 |  42 ++++++++++
 neo4j_queries/node_queries.py                 |  41 ++++++++++
 neo4j_queries/utils.py                        |   3 +
 requirements.txt                              |   5 ++
 21 files changed, 254 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 graph_creation/__init__.py
 create mode 100644 graph_creation/__pycache__/__init__.cpython-312.pyc
 create mode 100644 graph_creation/__pycache__/cwl_parsing.cpython-312.pyc
 create mode 100644 graph_creation/__pycache__/cwl_processing.cpython-312.pyc
 create mode 100644 graph_creation/__pycache__/repo_processing.cpython-312.pyc
 create mode 100644 graph_creation/__pycache__/utils.cpython-312.pyc
 create mode 100644 graph_creation/cwl_parsing.py
 create mode 100644 graph_creation/cwl_processing.py
 create mode 100644 graph_creation/repo_processing.py
 create mode 100644 graph_creation/utils.py
 create mode 100644 main.py
 create mode 100644 neo4j_queries/__init__.py
 create mode 100644 neo4j_queries/__pycache__/__init__.cpython-312.pyc
 create mode 100644 neo4j_queries/__pycache__/edge_queries.cpython-312.pyc
 create mode 100644 neo4j_queries/__pycache__/node_queries.cpython-312.pyc
 create mode 100644 neo4j_queries/__pycache__/utils.cpython-312.pyc
 create mode 100644 neo4j_queries/edge_queries.py
 create mode 100644 neo4j_queries/node_queries.py
 create mode 100644 neo4j_queries/utils.py
 create mode 100644 requirements.txt

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..4c36478
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+.venv/
+repos/
+Neo4j-25ebc0db-Created-2024-11-17.txt
\ No newline at end of file
diff --git a/graph_creation/__init__.py b/graph_creation/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/graph_creation/__pycache__/__init__.cpython-312.pyc b/graph_creation/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dbad6cf95681231d978322d11e618805beb3bb82
GIT binary patch
literal 215
zcmX@j%ge>Uz`*d8+ddscKL!!#_%jOw1H*KNN(N0vzm*I{ObiSRpFwJVl_gungche3
z6~`DE7+M;b8=J=X=cT$7WtOEX=qiMSD!69mrDWunrRK$iWTX~n7VE;*CFiH4#uQ{0
zq~>JirRt`n7Nq8-q~;}8>L%tT=2RAE7RRI)B^G4FCl{qAmSpDV#l**F=4F<|$LkeT
l-r}&y%}*)KNwq6tWnf@nWME(@1~EP|Gcqz3F*7hQ000}gIpqKV

literal 0
HcmV?d00001

diff --git a/graph_creation/__pycache__/cwl_parsing.cpython-312.pyc b/graph_creation/__pycache__/cwl_parsing.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..58d71c29805070e2d59eda693a27cace85366622
GIT binary patch
literal 973
zcmX@j%ge>Uz`&3oX`jx?#K7<v#DQT}DC4so0|Uc!h7^Vr#vF!R#wbQc5SuB7DVI5l
z8O&zRVaa8UVr67VVM%36VNGRE<&b1xU`S(0VQXQD;;dxX<ah}(Oq1~zOF&{t28b?B
zEKAj7yv3K3pO_M#nO9I+l9rj1dW*9twIDyfAh9Il7F$tjNoi4DG9ySU6ocHv%)r3#
zc^}wK=?t}uC2;u~hSgAZ7Dx?L3yex(gvr)2)iAj*#9A{j)H0WVOav>bVTQ}Il(2(^
z7#J98Sm2_pC7f_k))b~S%&VCo{93jemKwGc78{Uy#w>1#C^A{Y1~Y|`p~semp@uP-
zL6g<5ibG3FUrR5!JSRkx@fL4UYEEKFW?5={X;EhJEf!E9XtEW7ymE^rJtsfu7ISe)
zQ4z?&MJx;q47b=43kp*6QZ(6%*g#yK<no;O)Vz|+lFZcNTO1(mIhn;JMW7hI#SP-d
zXXeEhmlWM%2dT_Y%DTk~iiMQ;{G_a6kdGB0;8$g`RZM7cYEf~Fk%6J5fw{41jDKFL
zOHpQ7s)DXUNT`BqW?o80epzZ>Oh`s*ab~eDTwQX0N@`3&W<hFBW?rgpN-D^Ksd>qj
zx`}y-IhDnk#WCqci3J()$wjG&C7JnoF(9WDBo-BC=B4WuRNfLyPc4ZDiKP|g=f;B)
zLy-sr0|Us@#VHI73=Ir7ghd;C?(hpw@a(R?&aZrtUwMJoWq$nzkJ|zg9V~amBxk5h
z&g|g6Bc!lE`m&I22m2ini5cP>M6QUKb#UI0QSacKpmvc%@&<><bq?u^9MUs9uX3n;
z;ARk%{J_V+BXNO6{4*%({4}|4v4a9FCo>5W2Ld2xm6l}Y6zhQ;oLU5l1$lH~P<+Oh
z8O9sMgB3#}j0Y4$ETH(f#gdYlTmtshN`@j)1_p*AVUQ$;O>TZlX-=wLkqiR^11KRE
vt1>V!d|+l|WW38DahE~yE`!wv7A{7K1;QT~xELi?h^$CiDfI=!1RDbYDXI3f

literal 0
HcmV?d00001

diff --git a/graph_creation/__pycache__/cwl_processing.cpython-312.pyc b/graph_creation/__pycache__/cwl_processing.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..232a17bed36eb80bed003a243e6982549795994b
GIT binary patch
literal 3423
zcmX@j%ge>Uz`)QWX`jALfPvvLhy%kcP{!vR1_p-d3@HpLj5!Rsj8TlaOi@gXAU;zL
zb1q91OD<~^Yc5+98(54vhdq}giX)dZiWAId$>GZ7j^buyNMTLoNn=W3Yhj7vg|gXO
zSfcnq>=cev{xp^p&K8y^fl4k-?w25|HJNUyBp0P7mZZjK<`tBd#OLLwq!z~~=B30J
zrRF4-Waj4;XJi%>-;ypU%1=%$E{-qGFD*(=MUmBHz9pKPS6o_@8lRk>TacfZng=oH
zmH<>ZC9xzCY&2Y9L1Iy2ZfZ$t5m;E0>6Qf48DOQzcHfeP%H)@p#Dle=TF!WjB_OdR
z;}%zPc}{$4UP)$2Wik`UZ&1v}z`(%Fz`*dCixCvWHH;~Yk_-$CwM-?9P!$X{j44bY
zvDHwK61e_s4F-lHJ(#=;L#!$TLoG9sPG-0sSX45A^b|Aph?TIz&1a}#PGJVwP{Xtu
z<Q4`721bS+bryz71|+l4U4dd^ks8buS)4F)5%g*((}f|HpMjy41;uU_6k7#BwpKD|
zviRL%DF6i=8#v~RZ!u-2XtLg7DXA<-y~UD~SzL09B_%VtqzL5TTP)eBmBpH@x7bpO
zGRsnnintjV7;f<(qB}F?7Ax4CTg)JptPBhc3Lx;SCfO<`v^ce>IL64p(9*!%*fhpJ
zFVzKPih`~}NT`BqW?o80epzZ>Oh`s*ab~eDTwQX0N@`3&W<hFBW?rgpN@_uBUP@|S
za;0u!USdvVab|H$dQoCQMm#uuWaj6^fKmW7oo42x>lIYq5`uE$K~iA%6&Hcx0hB0;
z+ZY%a8W?T}%g#u<D6HDxbBCR~!)rptMIMFA?21c5F0pIg5S3rRaYa<K!}Sio@C4WH
z>NzQw`IYVnm~05!5qiL6XY>i3s{-L4SsBE%zOXTH@pWX)khsXFbeU6mNy#NnZIIC!
z7lqZY3+r4I*10CE*WrAJ$6$ll4&fa+S9!cYGBb$ie_&?d()kREK9p3;0ZX+`XsH&K
z3S1asC*e$sDCrlR7C{jLvV@@qDJ`b3tcIqsT9z7?6jo5k)Uu+a9i|#qM2byet6?r<
zC}!-jLrS&GDQqBp@O%JrIoKvpJ}43@0i_f$A8a#NZ4Jm&sLhNFJ*jw1;{j`BV1VVS
z>KdqhE)1~(3=FkwDCV=FnJmP@&@%;;-Zk0%ig-aumOa0;q@c8<xQHLbW6m!vDFOx7
zEgpz?Ft~KnWG@n6U|=W`0ui8GTm;H;x0s7dii$waE)ry5V9;bQ5&_AB{8uCjGK~#l
z#w{*LDFrG@Z;2t;@tJugsYQ8-Iq{h(w;1z_K?Z}eh=PJbkpxJ;Fmhsrm{nY)!oa`)
za!m0iwA6Y-NM=Ul6(RKwwmaPX6WF?K=9paOmVY9tx1wxA*^ZQrH3woY1V>z!jO_4w
zpsK%tabfHQ9;G`xnk&>c=v;6JyTTKGgI{)m*d>1TJHp~K*k;sSl+e2@tiQ$hlCaqw
z5&aEpJJ@#AU9|AK5D;@AuH>pn=^atY8EOmUE=n0(7B$=wa#_^!hOF`ezbmrFGZ^nk
z7;kXf;kcvbf^YIwiIk7b400wP#TkS(iljh3V<}24$WI0pB(UfN1t^IB8C2Fz2bI9!
zQrU%J0%NQ`TDoR}rL7ufNI6}_4KCwA`oZSau)tK*vX&sHbk^0N$OV=5Y&C3<61bMV
zhB<`=lww^NV%r%QYB^BSI|n2e)UYDu4bB=)s9vrTkh>vvGSslwaG~T44shPkMoN(!
zXn6zb8c1p`60d>E*K(JD5+}@D?i$7vc908exM2B{k)bCXkGXtct%zKb3p3e;Ay$}y
zfhhAuSr~dcSwN)&2gq(0hS&@ShFTsZTX|riTmnjvU@O6SoUMij5^A-)c}z7twY(@H
z%3H&l!U6L~Ev~TSs$oZjWeqdLyx9yXoHe|&8RjzA@|6gKEo5K-m0*0Z5Q6Ec<*(s~
z#*qL@9C6hMz|=4@^klIxOknIW0+$F}ez#bQOHvDpZ!s5@=0QpUrp!D@!N3M-=77tL
zB3V!pX4Kc@Dgrg+isV7L9+b+8KsmSwRNg^~3wY50DL6oNjHVz`A)yK~hXrKkE#Bn(
z+@#FB)RcH|70d||gfQ8P!7ZX&LJ%6Y(8w%O1X->GGKdr6#Q4mVB4rStB_C9-a6-xr
zFcaLA0+&Lfa3->O#h_vgS^^;`NLeI^To!>`S6pNcs?!-57>cVH!S$wq)C}{h0?HjM
zcO+zHgv{nz5VAsbVe~}_gAU$1BH}Z6FN>&kaDHK95K>=IwxaT~pk)UesK9WX8F_(2
z>V}}`gp?^#9gdI!t8j<N#_|K4CrmC&26TAcfR|TKcvKdIUC_0^;23^|C*lFW_zc6Y
zT2Oh#I74G5_hn&~HO!ZUwQndWF3?!WJ;4)FZgC#4I+1qKIrKt!?p2XIP_d;r!(*oQ
zWl^;?Dwjp|Zm6m)s8|v`gA-C{c^;^`5EOCIJ@P_y;Z=#EkIW2eh94vugk(O*F$jrG
zNSz|x;Rvez9lOitsC3oe5SN%CGO>Ij%L8$V2`muTm+Xi<!FpNJ4~xs4FSvqS{(*x*
zMqvTVOsg3>3ldkbEGWAmq1WMgM?zzT*apcRUI)r9+W1|T@bB=vp`yDYaiPxz9!03v
zoiDgXUFC`Xq|6{B`%#xcPz79~`e_Q@5<zYK=#`da<`jclYv4-imQ-G9zDZVmVQFen
zW@@n>C?UW&Md0KMN}%90jIJ~_B|R0c6r8?6#ojG;P+HB&OoC(saODUpDZo_}Bv*iv
z$VvvV3n9sf!zMRBr8FniuE>@F)aGSiD0X9DVEDky$jEq~A?ZFt>P3ds`wVe+86-Zi
ta4`BXt}yw`0HQyL@G+(`u5kIx0HQxAXfVbwu2A{F0HQY}e*x2AdjNA=Pf-8>

literal 0
HcmV?d00001

diff --git a/graph_creation/__pycache__/repo_processing.cpython-312.pyc b/graph_creation/__pycache__/repo_processing.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9c48a27062eeca18a32843f0a54b79e7c414a85e
GIT binary patch
literal 1040
zcmX@j%ge>Uz`$@y&^~<vGXuk85C?{tpp4Hp3=9m@8B!Qh7;_kM8KW2(L2RZRrd;MI
z=3JI2mR!~-R<Iaz4qGmJ6gwkB3QH<S8dC~u3riGdC7UMuOOR2TjJE{SQ%mBL%X8w>
zit=;gi&6{nHJNV-6%^$srxq83B{K60N=u4w2_uW;mzID91(5}dOHvDpH5qS-rsfrw
z7Ny1~=jRsW=cVSA#OLLwq~78L86KaLSzM9~^9RTc><kPH%nS?+pO1k3l*v%bRKwuH
z5F5$BP|IAx2vP#VHO#A_>>8#TW|03uv?K!qNUb9SLoG`UO9~^1U&~qoG7qGQp@y-B
zbv4vBMurkrupk2iNSp;p9OMSDR1ISdl6VbE3KPi06y|IV28LqB9@7#|urjC#HO#A_
zHZd~vII}SHM6)ndGH9~+-C`+7EXlaVnw*nZTzrcoJijPAEhoQRleGw>z6fMp5eowY
zLlMY9MQjWV44Q0391IK$x7bpOGRsnnZt;MkBsH%jvm`UM_!bK&=x(ur`IWbL5CNN+
zQVi0q00zHmldWPxi&Kk=V~h+8Ee*_#O=JA?Qe8kMD(EVNgetgZ=A~rhm!;;#gk+=^
zXBO+i)g|Yrq{b9v7Nq86=B4VUq!y&+rKIL1SL!C_CFWEXXBNk#7bO;C#3vV}CYEI8
z=f!|r6c3I4%)E5Hg34RGP%bzX6&DFJFff4pS6slrz|g>OLs+!I=eB@k2g@B9-4!X9
zWsEy`CWKt%kh&qJw7}$wm_~=&2Q~&7r3Gp;?K*t!@F*@&TcLA>$NUbD+>EjXWmkCg
z?(oPjU|Yd<g-8DezvzVY&iVyfm-%(>2!K?oZBV%?VE$2%L0t1AD}$iaXHY!(X>#3?
zLUp$uC|nB?i;6Sz(!r5&OBR~~Nb(1%L5-$cQhBNQCRy=?rKv@ksl|Grgbm|>qxBXG
zD1n3hzLKFx1Qf?Ypm^o5$<0qG%}KQ@l44+B0Hx$&Sq27%56p~=jCUC<?lYuZWJtNo
UVDo{6htXn2$QK44MhmcZ0CCS6s{jB1

literal 0
HcmV?d00001

diff --git a/graph_creation/__pycache__/utils.cpython-312.pyc b/graph_creation/__pycache__/utils.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..20e22c7ac192b92d0f4fd508e6ec00afe23d91f3
GIT binary patch
literal 1164
zcmX@j%ge>Uz`*dF$3A^K3j@Pr5C?{tpp4H33=9m@8B!Qh7;_kM8KW3;nWC5&L42kh
z=3JI2mR!~-Rz`*t<`#x1wiK2YhA8$*R!z2-Anlq=w**r2ic5=9<5Ln#665poQ&Mk<
zLPZJ^ixP8FOHzx#!kSFCB$A6#6HA~<i&AqEOEUBGiZe0`ZplDpGV|iWy5f;#lbJzQ
zK`|Qx0|SWvxe4s*5=N*vLk&X>V+~UZ<7yU&NG)>>a|)9r0|P@XO9@B^Sh$9vhN*^S
zHI&cDP{IlpW?+Eohso5k*04f#vz4$TsbNFbUCE%y?01VPGp`5~n3^m_AiIk|PAmf1
zPy{kull>N3N>OH6YSAr@%)ElqlK9M&TRh46xdr)osd-=?7sSutz!O8T<1_P0Qj78u
zb3i&ck-`_wL{?D@a-{+U{3=eiiU}=FEh>&NGBC6>FgG@h@y|<j0XaiKS0N-+!8J24
zB_qEqH7_P4BegiQSQoA?IX@*erXaH*H77GKRW~KIAT=)~H7~hRH!&|Ur?NP+I3~R)
zu^=NJ9AlaJc`>CWnK{LJ1(ml{z(T3<5MP3PQyia|mx2<P#YKV)3=E)PD)wh!U}#`?
zBB-{YY(>QmnFA%41$`R4Zpf;yV4N9yfko;Gi^2l26*3pheXp?iJrMwLBsSPw7I0|r
zydk5yAaSP81s2ICEHX2~E~r{vunD@#68sqyQOO|FK@1QE#Q-QOK09GeF7W7PsbNWB
z1Sb|&l*GbZ!wOTw$WY0k$>ax#Y9^!zW-H=kU|_h#S)5;5l$;u$nNq~fz`#(%3nKVH
z1U~}<Lop~-Kw_sz0Hj>HpeR2%wYWGQstq~C7l|@3Fo0AQYk&jk4%qjS8+<McI3oN0
z4#fA?7i>YkFA`#4VDQuAxFwaBns1U7Us#%2l$lzr2Z{+82jXl{2%)P=O-WCMs{#cZ
zBprjI3+$Yg3}Evij_0t+%}*)KNwq5ig&8Oh7K<}5FnnNUWMsV0;CG)P_7e*iV+iAC
L31-F+MzBTzSYI(5

literal 0
HcmV?d00001

diff --git a/graph_creation/cwl_parsing.py b/graph_creation/cwl_parsing.py
new file mode 100644
index 0000000..9417087
--- /dev/null
+++ b/graph_creation/cwl_parsing.py
@@ -0,0 +1,15 @@
+from pathlib import Path
+from cwl_utils.parser import save
+from cwl_utils.parser.cwl_v1_2_utils import load_inputfile
+
+def get_cwl_from_repo(repo_path: str) -> list[dict]:
+    cwl_entities = []
+    pathlist = Path(repo_path).glob('**/*.cwl')
+    for path in pathlist:
+        path_in_str = str(path)   
+        cwl_obj = load_inputfile(path_in_str)
+        saved_obj = save(cwl_obj,  relative_uris=True)
+        saved_obj['path'] = path_in_str
+        cwl_entities.append(saved_obj)
+
+    return cwl_entities
\ No newline at end of file
diff --git a/graph_creation/cwl_processing.py b/graph_creation/cwl_processing.py
new file mode 100644
index 0000000..847d7df
--- /dev/null
+++ b/graph_creation/cwl_processing.py
@@ -0,0 +1,72 @@
+from graph_creation.utils import create_input_nodes_and_relationships, process_source_relationship
+from neo4j_queries.node_queries import ensure_component_node, ensure_data_node, ensure_parameter_node
+from neo4j_queries.edge_queries import create_data_relationship, create_out_param_relationship
+from pathlib import Path
+
+def process_cwl_inputs(driver, cwl_entity: dict):
+    component_id = cwl_entity['path']
+    if type(cwl_entity['inputs']) == list:
+        for input in cwl_entity['inputs']:
+            if type(input) == dict:
+                create_input_nodes_and_relationships(driver, input['id'], component_id)
+    elif type(cwl_entity['inputs']) == dict:
+        for key in cwl_entity['inputs'].keys():
+            create_input_nodes_and_relationships(driver, key, component_id)
+
+def process_cwl_outputs(driver, cwl_entity: dict):
+    component_id = cwl_entity['path']
+    for output in cwl_entity['outputs']:
+        if type(output) == dict:
+            # Create out-parameter node o_node with id = o.id and component_id = c_node.id
+            param_node = ensure_parameter_node(driver, output['id'], component_id, 'out')
+            # Create a directed data edge from o_node to c_node
+            param_node_internal_id = param_node[0]
+            create_out_param_relationship(driver, component_id, param_node_internal_id)
+            if 'outputSource' in output:
+                if type(output['outputSource']) == str:
+                    process_source_relationship(driver, output['outputSource'], component_id, param_node_internal_id)
+                elif type(output['outputSource']) == list:
+                    for o in output['outputSource']:
+                        process_source_relationship(driver, o, component_id, param_node_internal_id)
+                        
+def process_cwl_steps(driver, cwl_entity: dict, repo: str):
+    """Create component, parameter and data nodes plus DATA edges for every step of ``cwl_entity``."""
+    for step in cwl_entity['steps']:
+        # NOTE(review): assumes 'run' is a path string relative to the repo root - confirm for nested workflows / inline tools.
+        combined_path = Path(repo) / step['run']
+        step_path = str(combined_path)
+        # Ensure a component node for the step, identified by the joined 'run' path.
+        s_node = ensure_component_node(driver, step_path)
+        s_node_internal_id = s_node[0]
+        for i in step['in']:
+            # Ensure the in-parameter node (parameter id = i['id'], component id = step path).
+            param_node = ensure_parameter_node(driver, i['id'], step_path, 'in')
+            param_node_internal_id = param_node[0]
+            # Data edge from the step's component node to the in-parameter node.
+            create_data_relationship(driver, s_node_internal_id, param_node_internal_id)
+
+            if 'source' in i:
+                if type(i['source']) == str:
+                    source_id = i['source']
+                    process_source_relationship(driver, source_id, cwl_entity['path'], param_node_internal_id)
+                elif type(i['source']) == list:
+                    for source_id in i['source']:
+                        process_source_relationship(driver, source_id, cwl_entity['path'], param_node_internal_id)
+
+        for o in step['out']:
+            if type(o) == dict:
+                o_id = o['id']
+            else:
+                o_id = o
+            # Ensure the out-parameter node (parameter id = output id, component id = step path).
+            param_node = ensure_parameter_node(driver, o_id, step_path, 'out')
+            param_node_internal_id = param_node[0]
+            # Data edge from the out-parameter node to the step's component node.
+            create_data_relationship(driver, param_node_internal_id, s_node_internal_id)
+            # The step output is also represented at workflow level by a data node whose id is
+            # "<step id>/<output id>" and whose component id is the workflow's own path.
+            output_id = f"{step['id']}/{o_id}"
+            data_node = ensure_data_node(driver, output_id, cwl_entity['path'])
+            data_node_internal_id = data_node[0]
+            # Data edge from the out-parameter node to that workflow-level data node.
+            create_data_relationship(driver, param_node_internal_id, data_node_internal_id)
\ No newline at end of file
diff --git a/graph_creation/repo_processing.py b/graph_creation/repo_processing.py
new file mode 100644
index 0000000..c22cdfb
--- /dev/null
+++ b/graph_creation/repo_processing.py
@@ -0,0 +1,17 @@
+from graph_creation.cwl_parsing import get_cwl_from_repo
+from graph_creation.cwl_processing import process_cwl_inputs, process_cwl_outputs, process_cwl_steps
+from neo4j_queries.node_queries import ensure_component_node
+
+def process_repos(repo_list: list, driver):
+    cwl_entities = {}
+    for repo in repo_list:
+        cwl_entities[repo]= get_cwl_from_repo(repo)
+        for entity in cwl_entities[repo]:
+            # if a component node with the same path as c does not exist then
+            # create component node c_node unique to c with id equal to path and alias equal to a empty dictionary
+            component_id = entity['path']
+            ensure_component_node(driver, component_id)
+            process_cwl_inputs(driver, entity)
+            process_cwl_outputs(driver, entity)
+            if entity['class'] == 'Workflow':
+                process_cwl_steps(driver, entity, repo)
diff --git a/graph_creation/utils.py b/graph_creation/utils.py
new file mode 100644
index 0000000..bf44831
--- /dev/null
+++ b/graph_creation/utils.py
@@ -0,0 +1,19 @@
+from neo4j_queries.node_queries import ensure_data_node, ensure_parameter_node
+from neo4j_queries.edge_queries import create_data_relationship, create_in_param_relationship
+
+def create_input_nodes_and_relationships(driver, input_id, component_id):
+    # Create in-parameter node i_node with id = i.id and component_id = c_node.id
+    param_node = ensure_parameter_node(driver, input_id, component_id, 'in')
+    param_node_internal_id = param_node[0]
+    # Create a directed data edge from c_node to i_node
+    create_in_param_relationship(driver, component_id, param_node_internal_id)
+    # Create a data node i_data_node with id = i.id and component_id = c_node.id
+    data_node = ensure_data_node(driver, input_id, component_id)
+    data_node_internal_id = data_node[0]
+    # Create a data edge from i_data_node to i_node
+    create_data_relationship(driver, data_node_internal_id, param_node_internal_id)
+
+def process_source_relationship(driver, source_id, component_id, param_node_internal_id):
+    data_node = ensure_data_node(driver, source_id, component_id)
+    data_node_internal_id = data_node[0]
+    create_data_relationship(driver, param_node_internal_id, data_node_internal_id)
\ No newline at end of file
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..1a9e532
--- /dev/null
+++ b/main.py
@@ -0,0 +1,37 @@
+from graph_creation.repo_processing import process_repos
+from neo4j import GraphDatabase
+import dotenv
+import os
+import gitlab
+import subprocess
+
+def clone_repos(repo_list: list, folder_name: str):
+    gl = gitlab.Gitlab('https://git.astron.nl')
+    projects = gl.projects.list(iterator=True, get_all=True)
+    for project in projects:
+        repo_name = project.attributes['path_with_namespace']
+        if repo_name in repo_list:
+            git_url = project.ssh_url_to_repo
+            subprocess.call(['git', 'clone', git_url, f'./{folder_name}/{repo_name}'])
+
+if __name__ == '__main__':
+    relevant_repos = ['ldv/imaging_compress_pipeline']
+    folder = 'repos'
+    clone_repos(relevant_repos)
+
+    load_status = dotenv.load_dotenv("Neo4j-25ebc0db-Created-2024-11-17.txt")
+    if load_status is False:
+        raise RuntimeError('Environment variables not loaded.')
+
+    URI = os.getenv("NEO4J_URI")
+    AUTH = (os.getenv("NEO4J_USERNAME"), os.getenv("NEO4J_PASSWORD"))
+
+    repo_paths = [f'{folder}/{path}' for path in relevant_repos]
+    print(repo_paths)
+    with GraphDatabase.driver(URI, auth=AUTH) as driver:
+        driver.verify_connectivity()
+        print("Connection established.")
+        driver = GraphDatabase.driver(URI, auth=AUTH)
+        process_repos(repo_paths, driver)
+        driver.close()
+
diff --git a/neo4j_queries/__init__.py b/neo4j_queries/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/neo4j_queries/__pycache__/__init__.cpython-312.pyc b/neo4j_queries/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a7189352bf21b6440c0bba245ba0830a5ca09c06
GIT binary patch
literal 206
zcmX@j%ge>Uz`$^p+ddscKL!!#_%jOw1H*KNN(N0vzm*I{ObiSRpFwJV<tAIjgche3
z6~`DE7+M;b8=J=X=cT$7WtOEX=qiMSD!69mrDWunrRK$iWTX~n7VE;*CFiH4#uQ{0
zq~>JirRt`n7Nq8-q~;}8>L%tT=2RAE7RThJ=9^^2#K&jmWtPOp>lIYq;;_lhPbtkw
cwJTy}U|?WmU|=W)F+MUgGBOr1GcYg!0ITFR&Hw-a

literal 0
HcmV?d00001

diff --git a/neo4j_queries/__pycache__/edge_queries.cpython-312.pyc b/neo4j_queries/__pycache__/edge_queries.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..41101cd7aaf942db65346904cc0998e1b13494fa
GIT binary patch
literal 2261
zcmX@j%ge>Uz`$^V&py4Hoq^#ohy%k+P{wB^1_p-d3@HpLj5!Rsj8Tk?3@J=43{gxe
z%q<L2%qc7_3{fnVteR{uK^im}ZwV#mq$cLYC+FuD<maX4mBeSJBr}8LpqPz;fdRz-
zd<1N82_saTp@wlalwHf5!dS~v!;l412a|@<tD&?hLoI6!YZfb%i$vA1)G*gDtzlXX
z)m_V0!<NO4qyWl<nN-VO!=Az{$-ux+!=AzdqSKg?nNmPrW>^gk21bTp24x0AhGNEY
zCPs!x21W)(hDwGC=17KeMom_~sv}$q3JMCojv>w-3L42)&M?m_R3nFjm4XT~w^ma}
zL8CwutTo)jHON&VGex67Q^8h2r692=F*mg&wJ1I_uOzi7FEJ-RGX<>B*EPu9RY4<J
zQ#abm#WBP&R@V+@Mv!YrXpo;mvR-D2f@81(vQ;_?1$szkg5{C<V53(u-Qq#kbW0RT
z3BnmgpcuTxgCv)-lBEa~*tgh=Q;Ul;^Yd;o7nSDSVk^$fOV3GN$#IJ<r6{v3wdj^;
zK~ZX2W<_cWO7bWIMGnMN*22=%qRJvrvb@Dslv-SxQv#xs^NUi7IT#oiK%or6zbcch
zVnT~ki;8263=Ay|%#BTB{PR*>K-MVeDuje8xMt?1WaO8n=Ea0$q!wou>%!F~=clB`
z6l4~p=49rj>ZYU?q~@ih<|S9^CgvsPR2F9z$K<8vn`Fg<oRXPZ9Fv-oo(kjW6;$4m
zNiIrFEJ=;e%!>zyL3~kaPGU)BeqM1#W<ik%EP1CfFfcSQ+!0p2pk{qp*rvhf0~0GR
zM}zAJ9tJLfez{J$E1Z%u!fyylOqZT0JtK63%nf6Y56lcgvY#0kg}fL$*d7RqPnVr2
zJ6XPi?FPT-guILVstd|i<Xu#^y})n#os)q}>N=<7B~Hm-pSc+r80P6YiZLG)V|A2g
zAO6|x95}twHM`ls^BYEXL(Vf~WxOI#DF)5}MW94p#L2+GP{hl?z<`qGi-Z{%7;ecz
zQ(t~*3AW@X!N9;E2+}SJBE%RN7_g^?b>PA`g|U{oh9L`D>_W>$Fa;|dO&MxgYFMxp
z#5K${j3~t{j^Y?*5~vu47PD;R7PE_y<2?~nd?uk4pNZh&Gp#5;7g2KN<)@^^gUUod
z7f=bC1Xfa#kD&l!IHY_{L@r;UmO#tbL}>Y%nG$cPqmYE)8G#MZWV$7a)&5(;SnRyT
z0x||n8fh|v3%(*yb}r&!U|`T>FX975qZEjc1`z@v0#rH{fg@Q6ltw_MC#XzRP*8v*
z5eaA#Nl7e8L@k`;7#JAjL3*OVg%kb~>4KI24MP`rk>rM1B&BuM%_v)tHnZ*mzb<lt
zR1EULWIac2=7Zd<jw0-Snyj~kQ3_SP(vr-aVsNDaE-{Nh>9z>uyp;@}K?xAz6Aqi)
y{FKt1RJ$TjD+H8Gi^Ulj7(OsFGBVy}F#5#8!)W(OhKJGavkW(*UXdaL0|Nk`5*l{^

literal 0
HcmV?d00001

diff --git a/neo4j_queries/__pycache__/node_queries.cpython-312.pyc b/neo4j_queries/__pycache__/node_queries.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..26b0b655e6fc677c6b4e3f5e389f1a50654b52d3
GIT binary patch
literal 2624
zcmX@j%ge>Uz`(G8&p!PyI|IXG5C?{tpp4H-3=9m@8B!Qh7;_kM8KW2(8B&;97^0X`
zm|Ga4m{V9<7@}AzSvA>Sf;4C{-V#dANlna)PtMOR$j?j7D~ZocNro8)Qo_c-z`)GF
z!0`DH*x(XIkRS-xFs_EOYnf6QYnf{pvOwxUQb>3;lx@mT%TmLV#R}yjQ8mmpOlz1{
zL+z|(tzpe#N0NpzVdm7b)v%>7OENGp)Uc(nfao-)WTq65hZ$gIFfs%)C^Hx`6f>4H
zF)~ClFfuSQR5DaBM>3Q%YO?xOm2xR4C@A>42D!T`Xe3)X!@RCgjT{PA3M$ClT1~L7
zAlH!4AU}o76pds}1;=28y!@2Z_{_YL)S|q^ocPQX9ff4Q%oLDJW=ecPQGP*cQAs6O
zrzYbq9%KV=383q}#f_wAB})+~IB&5Rrxq7y=I7mFE-KBt#a5h|m!6ZV$$pD1r6{v3
zwdj^;K~ZX2W<_cWN)#4>0{RwfVQFenWf3O>1H&!0qSWHjoDvY7oL`hu3<_gVFeoS}
z{Hjd0iU}=FEh>&NGBC6>FgG@h@y|<j0a>7+s}K^Z;F_72l96AQnimt2ky@NttP59{
zoS%{!Q;=DZnv<EAs+*EpkeZj0nwMOuo0ylFQ(2r@9Fv!tZ;}-ca!F=taSYhcP>x<f
z<t@?FyyDWLR3yiPR1^s@Ffgz%FfbIyF)%PRFx(MVy`W}&S=gq*=K~WnFGqvx2Mz`<
zfquD8xhtHKGs15ON=%oYC_N)|gUkf!%Yt?tY&QhOr^`;1oh;wMc7tE^f|$lde$5qT
z8+0#fJ6zy*_|DG2C3T%s@)D=yuVQWn28MaMj-t#5MOhu?*por_LQ@?BD3n0a@c9~g
z>SuzdewGx*T2_LopRI-sYwBmMVXR@PVP3<u8ftGXdks6@9Kcb-0nGs%kQ`9Mk-`ef
z2{jxkY#=&~g`^z8?w1(O$iM(jq|i)}XBCiGl$e`Zl3JutT>xXoXQo&wsDR=JlF(uK
z1|p7}wxOBD-%r6g$kj2#RUz0lL;-98D9PpNA*)reMa^hBNa_$)msA#{f>eS<;z2yH
zGDLRF!;u~H^uSs`UWv!Zm^un*ZgC7&Kz18E2jg*+W3U3k29U$xn!qm9WWL1?4Khfk
z<N~V(d!YzaxE6uZ*ey}4*4`2aD+W6r#pogq1_lOA&LSQL28JRL5Wx!~L_q{7>lg8X
zSfGqw#1CQ#fbtTE3krWQE)oQl3b5>lum<GTA_)ct29Unudd%#{4$gi8ShL>>_X9c?
zqEkV6Fs%b$9z39S!875aQ{si>%nu9<*^Dks9~l@@7+sk@Ffa(XGGXRVb4PjRgYvA7
z8tloS{0PgRpa2B%KcB&!Kbcb)Ygq{9Pf)!B3OSg`P`ZYth7p`QLAEe3Fo5bGyt$LT
zh8>za*`c|U9g;iKm`Tc&Y<^X*!3hYSC0!Cr5*4ab5=#;x84eUvw9JTL+lk3wP$!{P
zdPL_fusxbgkYo&s0z|eb5(6b(a9Za;PwQZn*pfL&IXH<cC@4Tuxd6110Xq~FY(-KG
z3=DFhJduH!z*)fwoCjwD2c>R*;!-z%B4$c9a+G2|D8=fi%<iYjdP^9kdekc|$;>GR
zSNz~A5S$Ugj$O&{8I&R+{^796%}*)KNwq5i`2o}lD+YP!12ZEd<5LEsPb{2_j*Oqg
Uco|)pKFe`4CNUPtGB7Xz0JH6h)&Kwi

literal 0
HcmV?d00001

diff --git a/neo4j_queries/__pycache__/utils.cpython-312.pyc b/neo4j_queries/__pycache__/utils.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1071760f3536644afec6f0c65efebc415c6af69d
GIT binary patch
literal 473
zcmX@j%ge>Uz`&p@WS>5jfq~&Mhy%kcP{wBw1_p-d3@Hq$3@MDM3~4MWOf4)?jFrrq
zEVo1pic-@uD^gS9lk;;6^7B&jO5!t9Zm|`mmXsFdB{M>e08z{g3=AOrSp#fb4MP^P
zEJy_d149bqYN%)}V+~_4gC>(-6<bkiL4I+JCgUxhqSV~{vQ&tjnoLDZ3=9mnc#v%_
z1_>!B{3=SeiU}=FEh>&NGBC6>FgG@h@y|<jDatHMRnS!k2~}{-%uC70FH6mf3CTz;
z&Mel2t4q#LNsTGUEJ)4C%uCfxNi9gtOG(X3uGCG;OU$V(&Mc0}OU*aQiZ3ipEy_$S
zjwvn4%qi9@sJtbVoRgZEhY}D)91IK$AP*HwFfcGQFx(K<xGrpZQP^}t=4D}r2A><;
z;ul!NKZ88)r^$4SxwxbVtYsxb5hnu!LlHa3gB&)w`6;D2sdhy?$fkW@W@Kdiz{JET
Tu_E~c0~4dfhU70G3ak?V=Pr4}

literal 0
HcmV?d00001

diff --git a/neo4j_queries/edge_queries.py b/neo4j_queries/edge_queries.py
new file mode 100644
index 0000000..f0ee233
--- /dev/null
+++ b/neo4j_queries/edge_queries.py
@@ -0,0 +1,42 @@
+from neo4j_queries.utils import clean_component_id
+
+def create_in_param_relationship(driver, prefixed_component_id, parameter_internal_id):
+    component_id = clean_component_id(prefixed_component_id)
+    query = """
+    MATCH (c:Component {component_id: $component_id}), (p)
+    WHERE id(p) = $parameter_internal_id
+    MERGE (c)-[:DATA]->(p)
+    RETURN c.id AS component_id, p.parameter_id AS parameter_id
+    """
+    with driver.session() as session:
+        result = session.run(query, component_id=component_id, 
+                             parameter_internal_id=parameter_internal_id)
+        record = result.single()
+        return record["component_id"], record["parameter_id"]
+    
+def create_out_param_relationship(driver, prefixed_component_id, parameter_internal_id):
+    component_id = clean_component_id(prefixed_component_id)
+    query = """
+    MATCH (c:Component {component_id: $component_id}), (p)
+    WHERE id(p) = $parameter_internal_id
+    MERGE (c)<-[:DATA]-(p)
+    RETURN c.component_id AS component_id, p.parameter_id AS parameter_id
+    """
+    with driver.session() as session:
+        result = session.run(query, component_id=component_id, 
+                             parameter_internal_id=parameter_internal_id)
+        record = result.single()
+        return record["component_id"], record["parameter_id"]
+    
+def create_data_relationship(driver, from_internal_node_id, to_internal_node_id):
+    query = """
+    MATCH (a), (b)
+    WHERE id(a) = $from_internal_node_id AND id(b) = $to_internal_node_id
+    MERGE (a)-[:DATA]->(b)
+    RETURN a.id AS id_1, b.id AS id_2
+    """
+    with driver.session() as session:
+        result = session.run(query, from_internal_node_id=from_internal_node_id,
+                             to_internal_node_id=to_internal_node_id)
+        record = result.single()
+        return record["id_1"], record["id_2"]
\ No newline at end of file
diff --git a/neo4j_queries/node_queries.py b/neo4j_queries/node_queries.py
new file mode 100644
index 0000000..b78b58b
--- /dev/null
+++ b/neo4j_queries/node_queries.py
@@ -0,0 +1,41 @@
+
+from neo4j_queries.utils import clean_component_id
+
+def ensure_component_node(driver, prefixed_component_id):
+    component_id = clean_component_id(prefixed_component_id)
+    query = """
+    MERGE (c:Component {component_id: $component_id})
+    RETURN id(c) AS node_internal_id, c.id AS id_property
+    """
+    with driver.session() as session:
+        result = session.run(query, component_id=component_id)
+        record = result.single()
+        return record["node_internal_id"], record["id_property"]
+
+def ensure_parameter_node(driver, node_id, prefixed_component_id, param_type):
+    component_id = clean_component_id(prefixed_component_id)
+    query = """
+    MERGE (n:Parameter {parameter_id: $node_id, component_id: $component_id})
+    ON CREATE SET 
+        n.component_id = $component_id,
+        n.parameter_type = $param_type
+    RETURN id(n) AS node_internal_id, n.parameter_id AS id_property, n.component_id AS component_id_property,
+        n.parameter_type AS parameter_type_property
+    """
+    with driver.session() as session:
+        result = session.run(query, node_id=node_id, component_id=component_id, param_type=param_type)
+        record = result.single()
+        return record["node_internal_id"], record["id_property"], record["component_id_property"], record['parameter_type_property']
+    
+def ensure_data_node(driver, node_id, prefixed_component_id):
+    component_id = clean_component_id(prefixed_component_id)
+    query = """
+    MERGE (n:Data {data_id: $node_id, component_id: $component_id})
+    ON CREATE SET 
+        n.component_id = $component_id
+    RETURN id(n) AS node_internal_id, n.data_id AS id_property, n.component_id AS component_id_property
+    """
+    with driver.session() as session:
+        result = session.run(query, node_id=node_id, component_id=component_id)
+        record = result.single()
+        return record["node_internal_id"], record["id_property"], record["component_id_property"]
\ No newline at end of file
diff --git a/neo4j_queries/utils.py b/neo4j_queries/utils.py
new file mode 100644
index 0000000..c00f1b5
--- /dev/null
+++ b/neo4j_queries/utils.py
@@ -0,0 +1,3 @@
+def clean_component_id(prefixed_component_id: str) -> str:
+    component_id = prefixed_component_id.removeprefix("repos\\")
+    return component_id
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..31918a8
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,5 @@
+python-gitlab
+python-dotenv
+neo4j
+ruamel.yaml
+cwl-utils
\ No newline at end of file
-- 
GitLab