diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
new file mode 100644
index 0000000000000000000000000000000000000000..47d62666edd086b26b9327a09285f9a795162d86
--- /dev/null
+++ b/.gitlab-ci.yml
@@ -0,0 +1,18 @@
+docker-build:
+  image: docker:stable
+  stage: build
+  services:
+    - docker:dind
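+    # note: the docker:dind service typically needs a runner with privileged mode enabled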
+  before_script:
+    - docker login -u "$CI_REGISTRY_USER" -p "$CI_REGISTRY_PASSWORD" $CI_REGISTRY
+  # The image is always built and pushed with the 'latest' tag;
+  # per-branch tags (e.g. $CI_COMMIT_REF_SLUG) are not produced here
+  script:
+    - docker build --pull -t "$CI_REGISTRY_IMAGE:latest" .
+    - docker push "$CI_REGISTRY_IMAGE:latest"
+  # Run this job only for branches that contain a Dockerfile
+  rules:
+    - if: $CI_COMMIT_BRANCH
+      exists:
+        - Dockerfile
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..638cab4d47c597167c297370a262b1f4697c80a2
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,8 @@
+FROM python:3.10
+
+COPY . /src
+RUN cd /src/bf_pulp_utils && pip install .
+
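+# Smoke test: verify that both console-script entry points were installed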
+RUN collect_unspecified_metadata --help && \
+    double_tgz_elimination --help
diff --git a/bf_pulp_utils/bf_pulp_utils/__init__.py b/bf_pulp_utils/bf_pulp_utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d36c6618127061f6331844e079146f836a9a4f76
--- /dev/null
+++ b/bf_pulp_utils/bf_pulp_utils/__init__.py
@@ -0,0 +1,7 @@
+from importlib.metadata import version, PackageNotFoundError
+
+try:
+    __version__ = version("bf_pulp_utils")
+except PackageNotFoundError:
+    # package is not installed
+    pass
\ No newline at end of file
diff --git a/scripts/collect-unspecified-metadata.py b/bf_pulp_utils/bf_pulp_utils/collect_unspecified_metadata.py
similarity index 70%
rename from scripts/collect-unspecified-metadata.py
rename to bf_pulp_utils/bf_pulp_utils/collect_unspecified_metadata.py
index e0caa39fa591ee3647e8a04da6d6aa23ba9a3f68..110120f435f70889349c249acde8b33cdd137539 100755
--- a/scripts/collect-unspecified-metadata.py
+++ b/bf_pulp_utils/bf_pulp_utils/collect_unspecified_metadata.py
@@ -5,33 +5,34 @@
 #
 # (c) Vlad Kondratiev - 01.11.2021
 #
-import os, sys, re, glob
-from argparse import ArgumentParser
+import glob
 import json
-import datetime as dt
-from datetime import datetime
-import time
-from ldv_obs import *
-from ldv_pulp import *
+import os
+import re
+import sys
+from argparse import ArgumentParser
+
+from bf_pulp_utils.ldv_obs import *
+from bf_pulp_utils.ldv_pulp import *
+
 
 # directory with all PulP log- and feedback files
 # when executed on Spider in folder '/project/ldv/Data/beamformed/' the path is relative
 DEFAULT_ROOTDIR = "./pulp-logs"
 
-# dictionaries
-observation = {}
-pulp = {}
 
-# to save to JSON file
-global_dict = {
-        "Observation" : observation,
-        "Pulsar Pipeline" : pulp
-        }
-# suffix for the filename of the output JSON file
-json_filename_suffix = "_unspecified.json"
+def main():
+    # dictionaries
+    observation = {}
+    pulp = {}
 
-# main
-if __name__ == "__main__":
+    # to save to JSON file
+    global_dict = {
+        "Observation": observation,
+        "Pulsar Pipeline": pulp
+    }
+    # suffix for the filename of the output JSON file
+    json_filename_suffix = "_unspecified.json"
 
     parser = ArgumentParser(description='Collect metadata from pulp-log files')
     parser.add_argument('obs_id')
@@ -39,24 +40,18 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     if args.rootdir is None:
-        print("No rootdir as option is given, use default path - %s\n", DEFAULT_ROOTDIR)
+        print("No rootdir as option is given use default path")
         rootdir = DEFAULT_ROOTDIR
     else:
         rootdir = args.rootdir
-        
-    if args.parsetdir None:
-        print("No parsetdir as option is given, use default path - %s\n", DEFAULT_PARSETDIR)
-        parsetsdir = DEFAULT_PARSETDIR
-    else:
-        parsetsdir = args.parsetdir
-        
-        
+
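+    # parset files now live in a 'parsets' subfolder of the root directory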
+    parsetsdir = os.path.join(rootdir, 'parsets')
 
     # TODO use are argument for this like obs_id is 'all' ??
     # if ObsID is not given, then will be collecting all ObsIDs from the corresponding directories in the "root" directory
     # for now will be printing the message and exiting...
 
-
     sasid = args.obs_id
 
     # checking if directory <ObsID> exists
@@ -73,21 +67,22 @@ if __name__ == "__main__":
         if not os.path.exists(feedbackfile):
             feedbackfile = "%s/L%s/pulpL%s_feedback" % (rootdir, sasid, sasid)
             if not os.path.exists(feedbackfile):
-                print("ERROR: Feedback file is not found (neither '%s/Observation%s_feedback' nor '%s')" % (rootdir, sasid, feedbackfile))
+                print("ERROR: Feedback file is not found (neither '%s/Observation%s_feedback' nor '%s')" % (
+                    rootdir, sasid, feedbackfile))
                 sys.exit(1)
 
     # checking if parset file exists
-    foundparsets=glob.glob("%s/parsets/*%s*" % (rootdir, sasid))
+    foundparsets = glob.glob("%s/*%s*" % (parsetsdir, sasid))
 
-    parset=""
-    if len(foundparsets) > 0: 
+    parset = ""
+    if len(foundparsets) > 0:
         parset = sorted(foundparsets, key=len)[0]
 
     # reading log-file file into the list of lines
     f = open(logfile, 'r')
     # ignoring empty lines
-    #comments_and_empty=re.compile(r"(^\s*#+.*$)|(^\s*$)")
-    empty=re.compile(r"(^\s*$)")
+    # comments_and_empty=re.compile(r"(^\s*#+.*$)|(^\s*$)")
+    empty = re.compile(r"(^\s*$)")
     loglines = [ff for ff in f.read().splitlines() if empty.search(ff) is None]
     f.close()
     # reading feedback-file file into the list of lines
@@ -98,15 +93,16 @@ if __name__ == "__main__":
     # reading parset-file file into the list of lines
     if parset != "":
         f = open(parset, 'r')
-        empty=re.compile(r"(^\s*$)")
+        empty = re.compile(r"(^\s*$)")
         parsetlines = [ff for ff in f.read().splitlines() if empty.search(ff) is None]
         f.close()
-    else: parsetlines = []
+    else:
+        parsetlines = []
 
     # populating observation info
-    observation = populating_observation (sasid, observation, parsetlines, loglines, feedlines)
-    observation["Parset"] = parset.split("/")[-1] # removing the path
-    
+    observation = populating_observation(sasid, observation, parsetlines, loglines, feedlines)
+    observation["Parset"] = parset.split("/")[-1]  # removing the path
+
     # populating pipeline info
     pulp = populating_pipeline(sasid, pulp, loglines, feedlines)
     pulp["Project"] = observation["Project"]
@@ -120,3 +116,7 @@ if __name__ == "__main__":
         json.dump(global_dict, outfile)
 
     print("File %s created" % outname)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/scripts/double_tgz_elimination.py b/bf_pulp_utils/bf_pulp_utils/double_tgz_elimination.py
similarity index 80%
rename from scripts/double_tgz_elimination.py
rename to bf_pulp_utils/bf_pulp_utils/double_tgz_elimination.py
index 80ed50c614bddcf73582cf0e64a96d45cbfb1287..ebd2c3444908c71ccaeac0dc25c9f98d1977a0b5 100644
--- a/scripts/double_tgz_elimination.py
+++ b/bf_pulp_utils/bf_pulp_utils/double_tgz_elimination.py
@@ -14,29 +14,27 @@
 
 import os, sys, re
 import tarfile
-import glob
 import fnmatch
-import numpy as np
 import optparse as opt
 import shutil
 
 # files that are not in the duplicate tarballs but we still need to keep them
-excluded=["*_pulp.log", \
-          "*all_bestRMs.out", \
-          "*StokeStats.out", \
-          "*rmfit_results.out"]
+excluded = ["*_pulp.log", \
+            "*all_bestRMs.out", \
+            "*StokeStats.out", \
+            "*rmfit_results.out"]
 
 # main
-if __name__=="__main__":
+def main():
 
     usage = "Usage: %prog [options] <input LTA tarball> <output tarball>"
     cmdline = opt.OptionParser(usage)
     # adding options
-    #cmdline.add_option('-o', '--output', dest='outfile', metavar='OUTPUT TARBALL NAME', help='If not given, \
-#then "_new" suffix will be added to the name of the input tarball', default="", type='str')
+    # cmdline.add_option('-o', '--output', dest='outfile', metavar='OUTPUT TARBALL NAME', help='If not given, \
+    # then "_new" suffix will be added to the name of the input tarball', default="", type='str')
 
     # reading cmd options
-    (opts,args) = cmdline.parse_args()
+    (opts, args) = cmdline.parse_args()
 
     # check if input file is given
     if len(args) == 0:
@@ -48,15 +46,15 @@ if __name__=="__main__":
 
     # output tarball
     if len(args) < 2:
-        print ("The name of the output tarball is not given!")
+        print("The name of the output tarball is not given!")
         sys.exit(1)
 
     output_tarball = args[1]
 
-#    if opts.outfile == "":
-#        output_tarball = input_tarball.split(".tar")[0] + "_new" + ".tar"
-#     else:
-#        output_tarball = opts.outfile
+    # if opts.outfile == "":
+    #     output_tarball = input_tarball.split(".tar")[0] + "_new" + ".tar"
+    # else:
+    #     output_tarball = opts.outfile
 
     # getting all *.tar.gz in the LTA tarball
     matches = []
@@ -66,6 +64,11 @@ if __name__=="__main__":
         matches.append(filename)
     inputtar.close()
 
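+    # write the input listing now so the *_filecontent.txt output exists even on an early exit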
+    ascii_name = output_tarball + "_filecontent.txt"
+    with open(ascii_name, "w") as outfile:
+        outfile.write("\n".join(dircontent))
+
     # checking whether _all_ files in dircontent have the same prefix dirname in the path
     dirprefix = dircontent[0].split("/")[0]
     for ii in dircontent:
@@ -75,10 +78,10 @@ if __name__=="__main__":
 
     # checking if dirprefix has any one of these patterns: "_CVplots", "_CSplots", "_ISplots", "_redIS", "_red_locus"
     # if this is the case, then we will create a new tarball with this prefix removed
-    is_restructured=False
-    pattern=re.compile(r"(_CVplots)|(_CSplots)|(_ISplots)|(_redIS)|(_red_locus)")
-    if pattern.search(dirprefix) is not None: # i.e. we need to rewrite the tarball
-        is_restructured=True
+    is_restructured = False
+    pattern = re.compile(r"(_CVplots)|(_CSplots)|(_ISplots)|(_redIS)|(_red_locus)")
+    if pattern.search(dirprefix) is not None:  # i.e. we need to rewrite the tarball
+        is_restructured = True
         inputtar = tarfile.open(input_tarball, "r")
         outputtar = tarfile.open(output_tarball, "w")
         for member in inputtar.getmembers():
@@ -87,12 +90,12 @@ if __name__=="__main__":
             outputtar.addfile(member, obj)
         outputtar.close()
         inputtar.close()
-    else: # making the tarball copy
+    else:  # making the tarball copy
         shutil.copyfile(input_tarball, output_tarball)
 
     # if no *.tar.gz found, then there is nothing to do
     if len(matches) == 0:
-        print ("No duplicate tarballs found. Output tarball is the same as input")
+        print("No duplicate tarballs found. Output tarball is the same as input")
         inputtar.close()
         sys.exit(0)
 
@@ -101,7 +104,7 @@ if __name__=="__main__":
         dircontent = [ii.split("%s/" % (dirprefix))[-1] for ii in dircontent]
 
     # merging contents of all tarballs together in a single list
-    flist=[]
+    flist = []
     inputtar = tarfile.open(input_tarball, "r")
     for ff in matches:
         tgz = inputtar.extractfile(ff)
@@ -114,9 +117,9 @@ if __name__=="__main__":
 
     # cross-checking files in the LTA tarball with files in the internal *.tar.gz files
     # extra files will be added to to_delete list for removal
-    to_delete=[]
+    to_delete = []
     for ff in dircontent:
-        is_excluded=[]
+        is_excluded = []
         for jj in excluded:
             is_excluded.extend(fnmatch.filter([ff], jj))
         if len(is_excluded) != 0: continue
@@ -125,19 +128,19 @@ if __name__=="__main__":
 
     # getting files that are possibly only in the internal *.tar.gz, but not in the LTA tarball.
     # these files will be extracted and kept
-    to_extract=list(set(flist)-set(flist).intersection(set(dircontent)))
+    to_extract = list(set(flist) - set(flist).intersection(set(dircontent)))
     if len(to_extract) == 0:
-        print ("No extra files to extract from *.tar.gz")
+        print("No extra files to extract from *.tar.gz")
     else:
         # opening output tarball for writing
         outputtar = tarfile.open(output_tarball, "a")
-        print ("To extract:")
+        print("To extract:")
         # if we have only one internal *.tar.gz (usually)
         if len(matches) == 1:
             jj = inputtar.extractfile(matches[0])
             tar = tarfile.open(matches[0], "r:gz", fileobj=jj)
             for ii in to_extract:
-                print (ii)
+                print(ii)
                 # !!! here we should extract from an internal tgz into the LTA tarball !!!
                 # if dirprefix != "", then it should be added also for the extracted file
                 if dirprefix != "":
@@ -155,12 +158,12 @@ if __name__=="__main__":
         else:
             # if we have several *.tar.gz we need first to know which tarball this file belong to
             for ii in to_extract:
-                print (ii, "[from: ", end='')
+                print(ii, "[from: ", end='')
                 for tgz in matches:
                     jj = inputtar.extractfile(tgz)
                     with tarfile.open(tgz, "r:gz", fileobj=jj) as tar:
                         if ii in tar.getmembers():
-                            print (tgz, "]")
+                            print(tgz, "]")
                             # !!!! here we should extract from an internal tgz into the LTA tarball !!!
                             # if dirprefix != "", then it should be added also for the extracted file
                             if dirprefix != "":
@@ -183,9 +186,9 @@ if __name__=="__main__":
     if dirprefix != "":
         if not is_restructured:
             to_delete = [dirprefix + "/" + ii for ii in to_delete]
-    print ("To delete: ")
+    print("To delete: ")
     for ii in to_delete:
-        print (ii)
+        print(ii)
         # Python tarfile module does not allow to delete files from the archives, using system calls for now
         os.system("tar -vf %s --delete %s" % (output_tarball, ii))
 
@@ -193,6 +196,10 @@ if __name__=="__main__":
     outputtar = tarfile.open(output_tarball, "r")
     newtarcontent = [ii.name for ii in outputtar.getmembers()]
     outputtar.close()
-    ascii_name=output_tarball + "_filecontent.txt"
+    ascii_name = output_tarball + "_filecontent.txt"
     with open(ascii_name, "w") as outfile:
         outfile.write("\n".join(newtarcontent))
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/scripts/ldv_obs.py b/bf_pulp_utils/bf_pulp_utils/ldv_obs.py
similarity index 96%
rename from scripts/ldv_obs.py
rename to bf_pulp_utils/bf_pulp_utils/ldv_obs.py
index 55ea3eb5e7716ab022d000adb7edb9de4c3a9454..0b8cf4c9007d85f7494b712a26deaafa306e0773 100644
--- a/scripts/ldv_obs.py
+++ b/bf_pulp_utils/bf_pulp_utils/ldv_obs.py
@@ -2,6 +2,7 @@
 #
 import datetime as dt
 from datetime import datetime
+import sys
 
 # populating observation info
 def populating_observation (sasid, observation, parsetlines, loglines, feedlines):
@@ -12,7 +13,7 @@ def populating_observation (sasid, observation, parsetlines, loglines, feedlines
         project=res[-1].split("Project:", 1)[-1].split("PI:", 1)[0].strip()
         observation["Project"] = project
     except:
-        print "(E) Bad logfile for SASid %s. Pipelines has probably failed" % (sasid)
+        print("(E) Bad logfile for SASid %s. Pipelines has probably failed" % (sasid))
         sys.exit(1)
 
     # Creator - it is always AWTIER0  ??
@@ -71,7 +72,7 @@ def populating_observation (sasid, observation, parsetlines, loglines, feedlines
             res=[ii for ii in parsetlines if "Observation.channelsPerSubband" in ii]
             nchan_per_sub=int(res[0].split("=")[1].strip())
         else: 
-            print "(W) Parset file is not available for SASid %s" % (sasid)
+            print("(W) Parset file is not available for SASid %s" % (sasid))
             nchan_per_sub = 1
 
     chanwidth = subwidth / nchan_per_sub
@@ -98,12 +99,16 @@ def populating_observation (sasid, observation, parsetlines, loglines, feedlines
     # Duration [s]
     res = [ii for ii in loglines if "Start UTC:" in ii]
     dur = res[0].split("Duration:")[1].strip()
+
     if dur[-1] == "s":
         duration = float(dur.split("s")[0])
-    if dur[-1] == "m":
+    elif dur[-1] == "m":
         duration = float(dur.split("m")[0]) * 60.
-    if dur[-1] == "h":
+    elif dur[-1] == "h":
         duration = float(dur.split("h")[0]) * 3600.
+    else:
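+        # unrecognised duration unit: fall back to 0 instead of raising NameError later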
+        duration = 0
     observation["Duration [s]"] = duration
     
     # End Time
diff --git a/scripts/ldv_pulp.py b/bf_pulp_utils/bf_pulp_utils/ldv_pulp.py
similarity index 95%
rename from scripts/ldv_pulp.py
rename to bf_pulp_utils/bf_pulp_utils/ldv_pulp.py
index 1be6ee2ee75970ac2221582c8ed11f0a731bdc41..548ad3317fe350f5cc21c8524f53221ffe29a832 100644
--- a/scripts/ldv_pulp.py
+++ b/bf_pulp_utils/bf_pulp_utils/ldv_pulp.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python
 #
+import sys
 from datetime import datetime
 
 
@@ -110,13 +111,13 @@ def populating_pipeline(sasid, pulp, loglines, feedlines):
     pulp["skipDynamicSpectrum"] = skipDynamicSpectrum
 
     # pulp start, stop times and wall time
-    res=[ii for ii in loglines if "UTC" in ii and "Start" not in ii]
+    res = [ii for ii in loglines if "UTC" in ii and "Start" not in ii]
     try:
-        st=res[-2].split("is:", 1)[-1].strip()
-    except: 
-        print "(E) Pipeline for the SASid %s has probably failed" % (sasid)
+        st = res[-2].split("is:", 1)[-1].strip()
+    except:
+        print("(E) Pipeline for the SASid %s has probably failed" % (sasid))
         sys.exit(1)
-        
+
     # Start Time
     starttime = datetime.strptime(st, '%a %b  %d %H:%M:%S %Y').strftime('%Y-%m-%d %H:%M:%S')
     pt = datetime.strptime(st, '%a %b  %d %H:%M:%S %Y')
diff --git a/bf_pulp_utils/pyproject.toml b/bf_pulp_utils/pyproject.toml
new file mode 100644
index 0000000000000000000000000000000000000000..e2baac484ff89020f8f4abbbd765044594db7c9c
--- /dev/null
+++ b/bf_pulp_utils/pyproject.toml
@@ -0,0 +1,24 @@
+[build-system]
+requires = ["setuptools>=61.0", "wheel",  "setuptools_scm[toml]>=6.2"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "bf_pulp_utils"
+description = "Small set of utils to process PULP data"
+dynamic = ['version']
+requires-python = ">=3.7"
+classifiers = [
+    "Programming Language :: Python :: 3",
+    "Operating System :: OS Independent"
+]
+dependencies = [
+    "numpy"
+]
+
+[tool.setuptools_scm]
+root=".."
+
+[project.scripts]
+collect_unspecified_metadata = "bf_pulp_utils.collect_unspecified_metadata:main"
+double_tgz_elimination = "bf_pulp_utils.double_tgz_elimination:main"
\ No newline at end of file
diff --git a/setup.py b/setup.py
deleted file mode 100644
index f6d09a405da84004b4284d123919ba01eadc9cd9..0000000000000000000000000000000000000000
--- a/setup.py
+++ /dev/null
@@ -1,19 +0,0 @@
-from setuptools import setup, find_packages
-
-def readme():
-    with open('README.rst') as f:
-        return f.read()
-
-setup(name='bcf_pipeline',
-      version='1.0.0',
-      description='Beamformed data compression pipeline',
-      url='https://git.astron.nl/ldv/bf_double_tgz/',
-      author='Vlad Kondratiev',
-      author_email='kondratiev@astron.nl',
-      license='Apache 2',
-      install_requires=[
-          'cwltool'],
-      packages=find_packages(),
-      include_package_data=True,
-      scripts=['scripts/double_tgz_elimination.py']
-      )
diff --git a/steps/double_tgz_elimination.cwl b/steps/double_tgz_elimination.cwl
index 1cebf66afea89d5bce0a2db9fc90d9b5da2b7a2b..4ad25f5cf030dd97c294b9556c5245840901fa60 100644
--- a/steps/double_tgz_elimination.cwl
+++ b/steps/double_tgz_elimination.cwl
@@ -6,20 +6,28 @@ inputs:
     type: File
     inputBinding:
       position: 1
-  - id: output_filename
-    doc: output filename for the tar archive
-    default: 'output' 
-    type: string?
-    inputBinding:
-      position: 2
+
+arguments:
+  - valueFrom: $(inputs.source_tgz.basename.replace(/_\w{8}\.tar/gm, '.tar'))
+    position: 2
     
 outputs:
   - id: output_tar
     doc: output tar archive
     type: File
     outputBinding:
-      glob: $(inputs.output_filename)
-baseCommand: 
-  - double_tgz_elimination.py 
+      glob: "*.tar"
+  - id: file_content
+    doc: file_content
+    type: string[]
+    outputBinding:
+      glob: "*_filecontent.txt"
+      loadContents: true
+      outputEval: $(self[0].contents.split('\n'))
+baseCommand:
+  - double_tgz_elimination
+
+hints:
+  - class: DockerRequirement
+    dockerPull: git.astron.nl:5000/ldv/bf_double_tgz:latest
 
-requirements: []
diff --git a/steps/extract_metadata.cwl b/steps/extract_metadata.cwl
index 66058c4da6981eb0d3926301acee970e4615a093..c46aceedb34f879bd5af4c6f795ff4ddcdf0318c 100644
--- a/steps/extract_metadata.cwl
+++ b/steps/extract_metadata.cwl
@@ -1,7 +1,7 @@
 class: CommandLineTool
 cwlVersion: v1.2
 baseCommand:
-  - collect-unspecified-metadata.py
+  - collect_unspecified_metadata
 inputs:
   - id: sas_id
     doc: SAS ID (ObservationID)
@@ -17,8 +17,15 @@ inputs:
 outputs:
   - id: output_json
     doc: Resulting json file with metadata of given SAS_ID
-    type: File
+    type: Any
     outputBinding:
       glob: 'L$(inputs.sas_id)_unspecified.json'
+      loadContents: true
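+      # parse the JSON file into an object; evaluates to null when no file matched the glob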
+      outputEval: | 
+        $(self[0] ? JSON.parse(self[0].contents): null)
 requirements:
   - class: InlineJavascriptRequirement
+hints:
+  - class: DockerRequirement
+    dockerPull: git.astron.nl:5000/ldv/bf_double_tgz:latest
diff --git a/steps/fetch_data.cwl b/steps/fetch_data.cwl
new file mode 100644
index 0000000000000000000000000000000000000000..600ea407b762f277d49e04db6ee1cc330907b228
--- /dev/null
+++ b/steps/fetch_data.cwl
@@ -0,0 +1,33 @@
+id: fetchdata
+label: fetch_data
+class: CommandLineTool
+cwlVersion: v1.1
+inputs: 
+  - id: surl_link
+    type: string
+    inputBinding:
+      position: 0
+    
+outputs: 
+  - id: tar_archive
+    type: File
+    outputBinding:
+      glob: 'out/*'
+baseCommand: 
+ - 'bash'
+ - 'fetch.sh'
+doc: 'Download a tar archive from a SURL via GridFTP'
+requirements:
+  InlineJavascriptRequirement: {}
+  InitialWorkDirRequirement:
+    listing:
+      - entryname: 'fetch.sh' 
+        entry: |
+          #!/bin/bash
+          mkdir out
+          cd out
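+          # rewrite the SRM locator into a GridFTP TURL that globus-url-copy can use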
+          turl=`echo $1 | awk '{gsub("srm://srm.grid.sara.nl[:0-9]*","gsiftp://gridftp.grid.sara.nl"); print}'`
+          file_name=$(inputs.surl_link.split('/').pop())
+          echo "Downloading $turl to $file_name"
+          globus-url-copy $turl file://$PWD/$file_name
diff --git a/steps/format_ingest.cwl b/steps/format_ingest.cwl
new file mode 100644
index 0000000000000000000000000000000000000000..8e6013f35206320f914c16d6488f996c1dce85d8
--- /dev/null
+++ b/steps/format_ingest.cwl
@@ -0,0 +1,27 @@
+cwlVersion: v1.2
+class: ExpressionTool
+inputs:
+  - id: metadata
+    type: Any
+  - id: file_content
+    type: string[]
+  - id: output_name
+    type: string
+  - id: file_name
+    type: string
+outputs:
+  - id: ingest
+    type: Any
+requirements:
+  - class: InlineJavascriptRequirement
+expression: |
+  ${
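+    // assumes metadata is a JSON object; it is null when extract_metadata found no file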
+    inputs.metadata['fileContent'] = inputs.file_content
+    return { "ingest": {
+        "path": inputs.output_name,
+        "file_name": inputs.file_name,
+        "metadata": inputs.metadata
+      }
+    }
+  }
diff --git a/workflow/bf_remove_double_tgz.cwl b/workflow/bf_remove_double_tgz.cwl
new file mode 100644
index 0000000000000000000000000000000000000000..98afd76c205901546a6b5da38d70770620be5c3a
--- /dev/null
+++ b/workflow/bf_remove_double_tgz.cwl
@@ -0,0 +1,54 @@
+cwlVersion: v1.2
+class: Workflow
+inputs:
+  - id: bf_tar_archive
+    doc: Tar archive with the BeamFormed dataset
+    type: File
+  - id: pulp_log_folder
+    type: Directory
+
+outputs:
+  - id: tar_archive
+    type: File
+    outputSource: eliminate_double_tgz/output_tar
+  - id: ingest
+    type: Any
+    outputSource: format_ingest/ingest
+
+steps:
+- id: eliminate_double_tgz
+  run: ../steps/double_tgz_elimination.cwl
+  in: 
+  - id: source_tgz
+    source: bf_tar_archive
+  out:
+  - output_tar
+  - file_content
+- id: extract_metadata
+  run: ../steps/extract_metadata.cwl
+  in:
+  - id: sas_id
+    source: bf_tar_archive
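+    # pull the numeric SAS ID out of the archive name, e.g. "L123456_bf.tar" -> "123456"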
+    valueFrom: $(self.basename.match('L([0-9]+)')[1])
+  - id: log_root_folder
+    source: pulp_log_folder
+  out:
+  - id: output_json
+- id: format_ingest
+  run: ../steps/format_ingest.cwl
+  in:
+  - id: metadata
+    source: extract_metadata/output_json
+  - id: file_content
+    source: eliminate_double_tgz/file_content
+  - id: output_name
+    default: tar_archive
+  - id: file_name
+    source: eliminate_double_tgz/output_tar
+    valueFrom: $(self.basename)
+  out:
+  - id: ingest
+requirements:
+- class: StepInputExpressionRequirement
+- class: InlineJavascriptRequirement
diff --git a/workflow/download_and_run_bf_remove.cwl b/workflow/download_and_run_bf_remove.cwl
new file mode 100644
index 0000000000000000000000000000000000000000..6d116c13b140ec6d09b9169888fbf0d87a4826b7
--- /dev/null
+++ b/workflow/download_and_run_bf_remove.cwl
@@ -0,0 +1,40 @@
+cwlVersion: v1.2
+class: Workflow
+inputs:
+  - id: surls
+    type: string[]
+  - id: pulp_log_folder
+    doc: Pulp Log Folder
+    type: Directory
+
+outputs: 
+- id: tar_archive
+  type: File[]
+  outputSource: bf_process/tar_archive
+- id: ingest
+  type: Any
+  outputSource: bf_process/ingest
+
+steps:
+- id: fetch_data
+  in:
+  - id: surl_link
+    source: surls
+  out:
+  - id: tar_archive
+  scatter: surl_link
+  run: ../steps/fetch_data.cwl
+- id: bf_process
+  in: 
+  - id: bf_tar_archive
+    source: fetch_data/tar_archive
+  - id: pulp_log_folder
+    source: pulp_log_folder
+  out:
+  - id: tar_archive
+  - id: ingest
+  run: ./bf_remove_double_tgz.cwl
+  scatter: bf_tar_archive
+requirements:
+- class: ScatterFeatureRequirement
+- class: SubworkflowFeatureRequirement
\ No newline at end of file