From e2c65a3d174d9371f8a837c16ab55544d6c2d420 Mon Sep 17 00:00:00 2001
From: Vlad Kondratiev <vlad.kondratiev@gmail.com>
Date: Wed, 5 Feb 2025 12:49:01 +0100
Subject: [PATCH 1/7] added initial CWL workflow, digitize CWL description,
 Dockerfile and test json for CEP4

---
 docker/Dockerfile                            |  27 ++
 scripts/digitize3.py                         | 354 +++++++++++++++++++
 steps/digitize.cwl                           |  51 +++
 tests/digitize_input.json                    |  18 +
 workflows/pulp2-xxyy-8bit-requantisation.cwl |  35 ++
 5 files changed, 485 insertions(+)
 create mode 100755 scripts/digitize3.py
 create mode 100644 tests/digitize_input.json

diff --git a/docker/Dockerfile b/docker/Dockerfile
index e69de29..a34c420 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -0,0 +1,27 @@
+FROM ubuntu:24.04
+
+#
+# Lightweight container with minimum to run digitize3.py
+#
+ENV INSTALL_DIR=/usr/local
+
+RUN export DEBIAN_FRONTEND=noninteractive && \
+    apt-get -y update && apt-get -y upgrade && \
+    apt-get install -y  python-is-python3 && \
+    apt-get autoremove --purge && \
+#    apt-get install -y wget python3 python3-pip python3-numpy git vim \
+#        python3-pkgconfig libhdf5-dev hdf5-tools python3-setuptools cython3 && \
+    apt-get install -y wget python3 python3-numpy git vim \
+        libhdf5-dev hdf5-tools python3-h5py cwltool && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install cwltool and toil
+#RUN python3 -m pip install cwltool toil[cwl]
+
+#RUN cd ${INSTALL_DIR} && wget https://files.pythonhosted.org/packages/cc/0c/5c2b0a88158682aeafb10c1c2b735df5bc31f165bfe192f2ee9f2a23b5f1/h5py-3.12.1.tar.gz && \
+#    tar xvfz h5py-3.12.1.tar.gz && cd h5py-3.12.1 && python setup.py install --prefix=${INSTALL_DIR}
+	
+RUN cd ${INSTALL_DIR} && mkdir -p src && cd src/ && git clone https://git.astron.nl/kondratiev/lofar-scripts-repo && \
+    cd lofar-scripts-repo && cp digitize3.py ${INSTALL_DIR}/bin
+
+RUN digitize3.py -h
diff --git a/scripts/digitize3.py b/scripts/digitize3.py
new file mode 100755
index 0000000..eed993f
--- /dev/null
+++ b/scripts/digitize3.py
@@ -0,0 +1,354 @@
+#!/usr/bin/env python3
+#
+# -*- coding: utf-8 -*-
+""" Routines to convert raw LOFAR data between 4-byte floats and 1-byte integers.
+
+Version 1.4
+
+ Main routine is convert_dtype
+
+ Command Line usage: digitize3.py [-h] [-o OUTDIR] [-s NSIG] [--refloat] [-nc] [-v]
+                   file_SAPXXX_BEAMXXX_SX_name.h5
+                   [file_SAPXXX_BEAMXXX_SX_name.h5 ...]
+
+ Notes:
+ * we can only process files with one SUB_ARRAY_POINTING, one Beam, and one STOKES
+ * we assume the .raw files can be found from file.replace('.h5', '.raw')
+ * the float32 to int8 conversion creates a new h5 Dataset at the Beam level
+   with the name convention STOKES_X_i2f.
+   This array provides the scaling/mapping from int8 back to float32.
+   The dataset also has attributes 'STOKES_X_nsig', preserving the NSIG argument
+                                   'STOKES_X_recsize', preserving the read buffer size
+   The dataset is deleted if direction='i2f', i..e. 'refloat'ing the byte stream
+ * digitization clips extreme fluctuations (nsig), so you may not recover the original
+   float stream. Noise will be clipped.
+ * We modify the STOKES_X 'DATATYPE' attribute, changing it from 'float' to 'int8' if direction == 'f2i'
+                                                                 'int8' to 'float' if direction == 'i2f'
+ * We also change the dtype of the dataset in the HDF5 file, so that tools can directly
+   read the integer data. Unless those tools are aware of the STOKES_X_i2f table, the
+   rescaling will not be applied. For typical pulsar observations this just results in
+   forcible channel equalization.
+"""
+
+#from __future__ import division, print_function
+#from __future__ import with_statement
+
+import argparse
+import numpy as np
+import os, sys
+import re
+import shutil
+
+import h5py, h5py.h5d, h5py.h5t
+
+_lofar_dtypes = {'float':'>f4', 'int8':'>i1'}
+
+def convert_dtype(files, outdir, nsig=5., direction='f2i', check=True, verbose=None):
+    """
+    Given a list of lofar h5 files, digitize the raw data to 'int8',
+    outputting it in outdir.
+
+    Args:
+    files : list of h5 files
+    outdir : directory to output the digitized raw data and the h5 file
+             with the processing history
+
+    nsig : digitize/map raw data stream from
+             (-Nsig*std(), +Nsig*std())
+           to
+             [-128, 128) (signed int8)
+           This process clips outliers.
+           (default: 5.)
+
+    direction : One of 'f2i' or 'i2f'. The 'i2f' routine undoes the original
+                digitization and should reproduce the original data.
+            (deafult: 'f2i')
+
+    check : Check the h5 structure is consistent with the filename.
+            (default: True)
+
+    Notes:
+     * we can only process files with one SUB_ARRAY_POINTING, one Beam, and one STOKES
+     * we assume the .raw files can be found from file.replace('.h5', '.raw')
+     * the float32 to int8 conversion creates a new h5 Dataset at the Beam level
+       with the name convention STOKES_X_i2f.
+       This array provides the scaling/mapping from int8 back to float32.
+       The dataset also has attributes 'STOKES_X_nsig', preserving the NSIG argument
+                                       'STOKES_X_recsize', preserving the read buffer size
+       The dataset is deleted if direction='i2f', i..e. 'refloat'ing the byte stream
+     * digitization clips extreme fluctuations (nsig), so you may not recover the original
+       float stream. Noise will be clipped.
+     * We modify the STOKES_X 'DATATYPE' attribute, changing it from 'float' to 'int8' if direction == 'f2i'
+                                                                     'int8' to 'float' if direction == 'i2f'
+
+    """
+    if not isinstance(files, list):
+        files = [files]
+
+    assert direction in ['f2i', 'i2f']
+
+    for fname in files:
+        fraw = fname.replace('.h5', '.raw')
+        fraw_out = os.path.join(outdir, os.path.basename(fraw))
+        if direction == 'f2i':
+            print("Digitizing %s to %s" % (fraw, fraw_out))
+        elif direction == 'i2f':
+            print("unDigitizing %s to %s" % (fraw, fraw_out))
+
+        if os.path.abspath(fraw) == os.path.abspath(fraw_out):
+            print("Warning, this will overwrite input files")
+
+        # copy the original h5 file to the outdir
+        fnew = os.path.join(outdir, os.path.basename(fname))
+        shutil.copy2(fname, fnew)
+
+        # get necessary processing information, possibly checking for consistency
+        nchan, ntint, dtype, nsamples, sap, beam, stokes =  lofar_h5info(fnew, check=check, verbose=verbose)
+        if verbose:
+            print("\t Input data type is %s" % dtype)
+            print("\t Processing %s_%s_%s" % (sap, beam, stokes))
+        if dtype=='int8' and direction=='f2i':
+            raise ValueError("input data is already integers but digitization requested")
+        if dtype=='float32' and direction=='i2f':
+            raise ValueError("input data is already floats but undigitization requested")
+
+
+        recsize = ntint * nchan
+        itemsize_i = np.dtype(dtype).itemsize
+        if direction == 'f2i':
+            itemsize_o = np.dtype('>i1').itemsize
+        elif direction == 'i2f':
+            itemsize_o = np.dtype('<f4').itemsize
+
+        with open(fraw_out, 'wb', recsize*itemsize_o) as fh1out:
+            with open(fraw, 'rb', recsize*itemsize_i) as fh1:
+                with h5py.File(fnew, 'a') as h5:
+                    if verbose >= 1:
+                        print("\t",fraw," has dtype", dtype)
+                        print("\t",fraw," has DATATYPE",h5[sap][beam][stokes].attrs['DATATYPE'])
+                    # create/access dataset used to convert back to floats from int8's
+                    if direction == 'f2i':
+                        convert_dataset_dtype(h5[sap][beam][stokes],'i1')
+                        h5[sap][beam][stokes].attrs['DATATYPE'] = 'int8'
+                        # create the scaling data set
+                        diginfo = h5[sap][beam].create_dataset(
+                            "%s_i2f" % stokes,
+                            (0, nchan),
+                            maxshape=(None, nchan),
+                            dtype='f')
+                        diginfo.attrs['%s_recsize' % stokes] = recsize
+                        diginfo.attrs['%s_nsig' % stokes] = nsig
+                        out_dtype = h5[sap][beam][stokes].dtype
+                    elif direction == 'i2f':
+                        convert_dataset_dtype(h5[sap][beam][stokes],'<f4')
+                        out_dtype = h5[sap][beam][stokes].dtype
+                        h5[sap][beam][stokes].attrs['DATATYPE'] = 'float'
+                        diginfo = h5[sap][beam]["%s_i2f" % stokes]
+                    h5.flush()
+                    if verbose >= 1:
+                        print("\tInput dtype", dtype)
+                        print("\tOutput dtype", out_dtype)
+                    idx = 0
+                    while True:
+                        raw1 = np.fromfile(fh1,count=recsize,dtype=dtype)
+                        if len(raw1) == 0:
+                            break
+
+                        raw1 = raw1.reshape(-1,nchan)
+
+                        if direction == 'f2i':
+                            # record the stddev for possible recovery
+                            #std = raw1.std(axis=0) # FIXME: assumes mean is zero
+                            std = np.sqrt(np.mean(raw1.astype(np.float32)**2,axis=0))
+                            if np.any(np.isnan(std)):
+                                raise ValueError("NaNs found in input data set; possible endianness or data type problems")
+                            if np.any(std==0):
+                                print("Warning: {0} channels in block {1} contain all zeros".format(np.sum(std==0), idx))
+                                std[std==0] = nsig # Avoid division by zero later
+                            scale = nsig*std
+                            digshape = diginfo.shape
+                            diginfo.resize(digshape[0] + 1, axis=0)
+
+                            # apply the digitization
+                            # by variance in individual channels
+                            scale = nsig*std
+                            intmap = (128*(raw1/scale))
+                            diginfo[idx] = scale/128.
+                            # clip
+                            if verbose >= 2:
+                                nclip = np.sum(intmap> 127) + np.sum(intmap<-128)
+                                print("Read# {0}, Clipping {1}/{2} samples".format(idx, nclip, intmap.size))
+                            raw1_o = np.clip(intmap, -128, 127).astype('>i1')
+
+                        elif direction == 'i2f':
+                            # convert back to float32
+                            scale = diginfo[idx]
+                            raw1_o = (raw1*scale).astype(out_dtype)
+
+                        raw1_o.tofile(fh1out)
+
+                        if verbose >= 3:
+                            print("f2i rescale = {0}".format(scale))
+                        idx += 1
+
+                    if direction == 'i2f':
+                        del h5[sap][beam]["%s_i2f" % stokes]
+
+def convert_dataset_dtype(dataset, dtype):
+    # delete and recreate the dataset with a new dtype, preserving everything including external storage
+    #name = dataset.name
+    name = dataset.name.encode("utf-8")
+    parent = dataset.parent
+    space = dataset.id.get_space()
+    plist = dataset.id.get_create_plist()
+    attrs = dict(dataset.attrs)
+    del parent[name]
+    h5py.h5d.create(parent.id,
+                    name,
+                    h5py.h5t.py_create(dtype),
+                    space,
+                    dcpl=plist
+                    )
+    for k, v in attrs.items():
+        parent[name].attrs[k] = v
+    
+
+def lofar_h5info(fname, check=True, verbose=None):
+    """
+    open the lofar hdf5 file and return:
+    (nchan, ntint, recsize, dtype, SUBARRRAY_POINTING, BEAM, STOKES)
+
+    Note: we only process files with one SUB_ARRAY_POINTING
+    and one BEAM per SUB_ARRAY_POINTING
+
+    Args:
+    fname : h5 file
+    check : check that some of the h5 structure is consistent with
+            the filename. For instance, there is only one
+            SUB_ARRAY_POINTING and only one BEAM per file.
+            *_SAP???_B???_S?_*
+
+    """
+    f5 = h5py.File(fname, 'r')
+
+    # get SAP, beam, stokes from filename
+    info = re.search(r'SAP\d{3}_B\d{3}_S\d{1}_', fname)
+    if info is None:
+        print("small warning, filename conventions not met. ", end='')
+        #if not check:
+        #    print("Processing first SAP, BEAM, and STOKES.")
+
+    # get SAP, beam, stokes from hdf5 file.
+    # recall, we assume there is only one SAP, one BEAM and one STOKES per file
+    h5saps = sorted([i for i in f5.keys() if 'SUB_ARRAY_POINTING' in i])
+    s0 = f5[h5saps[0]]
+    h5beams = sorted([i for i in s0.keys() if 'BEAM' in i])
+    b0 = s0[h5beams[0]]
+    h5stokes = sorted([i for i in b0.keys() if 'STOKES' in i and 'i2f' not in i])
+    st0 = b0[h5stokes[0]]
+
+    if check:
+        force_fileconsistency(fname, h5saps, h5beams, h5stokes)
+    if verbose >= 1:
+        print("File produced by %s" % f5.attrs['BF_VERSION'])
+
+    # future fix: currently only handle NOF_SUBBANDS
+    # with 1 channel in each subband
+    nchan = st0.attrs['NOF_SUBBANDS']
+    if st0.attrs['DATATYPE']=='int8':
+        # some files exist where the dtype is set wrong
+        # in particular, the dtype can still be <f4 but the data
+        # might be integers
+        dtype = np.dtype('int8')
+    else:
+        # Try to infer the correct byte order from the dtype
+        #dtype = _lofar_dtypes[st0.attrs['DATATYPE']]
+        dtype = st0.dtype
+    # process sets of ~32 MB, hence // np.log2(nchan)
+    ntint = 2**25//4//int(np.log2(nchan))  # 16=~nchan -> power of 2, but sets of ~32 MB
+    nsamples = b0.attrs['NOF_SAMPLES']
+    f5.close()
+    return (nchan, ntint, dtype, nsamples, h5saps[0], h5beams[0], h5stokes[0])
+
+
+def force_fileconsistency(fname, h5saps, h5beams, h5stokes):
+    """ Raise Warning if the inferred sap, beam, and stokes
+        differ from the content of the hdf5 file
+    """
+    # get subarray_pointing information from the filename
+    info = re.search(r'SAP\d{3}_B\d{3}_S\d{1}_', fname)
+    if info is None:
+        txt = "Filename %s doesn't adhere to convention *_SAP???_B???_S?_* "
+        txt += "processing may proceed with the -nc switch"
+        raise Warning(txt)
+    fsap = re.search(r'SAP\d{3}', info.group()).group().strip('SAP')
+    fbeam = re.search(r'_B\d{3}', info.group()).group().strip('_B')
+    fstokes = re.search(r'_S\d', info.group()).group().strip('_S')
+
+    # parse the info from the hdf5 file
+    hsap = re.search(r'\d{3}', h5saps[0]).group()
+    hbeam = re.search(r'\d{3}', h5beams[0]).group()
+    hstokes = re.search(r'\d', h5stokes[0]).group()
+
+    # compare the results, raising warnings as necessary
+    if len(h5saps) == 0:
+        raise Exception("No SUB_ARRAY_POINTINGS found")
+    elif len(h5saps) > 1:
+        raise Warning("Too many SUB_ARRAY_POINTINGS in %s." % fname, "Processing first.")
+    elif fsap != hsap:
+        raise Warning("Filename %s had SAP %s" % (fname, hsap))
+
+    if len(h5beams) == 0:
+        raise Exception("No BEAMS found in SAP%s" % saps[0])
+    elif len(h5beams) > 1:
+        raise Warning("Too many BEAMS in SAP%s." % saps[0], "Processing first")
+    elif fbeam != hbeam:
+        raise Warning("Filename %s had BEAM %s" % (fname, hbeam))
+
+    if len(h5stokes) == 0:
+        raise Exception("No STOKES found in SAP%s_B%s" % (saps[0],beams[0]))
+    elif len(h5stokes) > 1:
+        raise Warning("Too many STOKES found in SAP%s_B%s." % (saps[0], beams[0]), "Processing first")
+    elif fstokes != hstokes:
+        raise Warning("Filename %s had STOKES %s" % (fname, hstokes))
+
+
+def CL_parser():
+    parser = argparse.ArgumentParser(prog='digitize3.py',
+                                     description='digitize beamformed lofar data to int8, or convert back to original float32. '
+                                     "Takes argument list of lofar hdf5 header files, raw files determined by filename.replace('.h5','.raw'). "
+                                     'The digitized output copies the hdf5 file to ODIR, but with the processing information. '
+                                     "Note, we can only process files with one SUB_ARRAY_POINTING, one BEAM, and one STOKES",
+                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('files', metavar='file_SAPXXX_BEAMXXX_SX_name.h5', nargs='+',
+                        help='lofar hdf5 header files to process.')
+
+    parser.add_argument('-o', '--output_dir', type=str, dest='outdir',
+                        default='./',
+                        help='output directory for the digitized files')
+
+    parser.add_argument('-s', '--nsigma', type=float, dest='nsig',
+                        default=5.0,
+                        help='Clip raw data above this many stddevs.')
+
+    parser.add_argument('--refloat', action='store_true',
+                        help='Convert int8 back to float32 ("i2f").'
+                             'This only works on files having gone through the digitization process.')
+
+    parser.add_argument('-nc', '--nocheck', action='store_true',
+                        help='do not enforce filename consistency of *_SAP???_B???_S?_* with the actual hdf5 '
+                        'structure for SUB_ARRAY_POINTING_???, BEAM_???, and STOKES_? .')
+
+    parser.add_argument('-v', '--verbose', action='append_const', const=1)
+#    return parser.parse_args()
+    return parser.parse_args(args=None if sys.argv[1:] else ['--help'])
+
+if __name__ == '__main__':
+    args = CL_parser()
+    check = not args.nocheck
+    args.verbose = 0 if args.verbose is None else sum(args.verbose)
+    if args.refloat:
+        direction = 'i2f'
+    else:
+        direction = 'f2i'
+    convert_dtype(args.files, args.outdir, args.nsig, direction=direction, check=check, verbose=args.verbose)
diff --git a/steps/digitize.cwl b/steps/digitize.cwl
index e69de29..692d66a 100644
--- a/steps/digitize.cwl
+++ b/steps/digitize.cwl
@@ -0,0 +1,51 @@
+cwlVersion: v1.2
+class: CommandLineTool
+
+label: Converts 32-bit raw LOFAR BF XXYY data to 8-bit
+doc: |
+  This tool only performs digitizes raw 32-bit LOFAR BF XXYY data to int8.
+  Takes argument list of HDF5 header files, raw files determined by filename.replace('.h5','.raw').
+  The digitized output copies the HDF5 file to output directory, but with the processing information.
+
+requirements:
+  - class: InlineJavascriptRequirement
+  - class: InitialWorkDirRequirement
+    listing: $(inputs.h5in)
+  
+baseCommand: digitize3.py
+
+#arguments: ["-o", $(runtime.outdir)]
+arguments: ["--verbose"]
+#arguments:
+#  - prefix: --output_dir
+#    $(runtime.outdir)
+#  - prefix: --verbose
+
+#arguments:
+#  - prefix: --verbose
+
+inputs:
+  - id: h5in
+    type: File[]
+    inputBinding:
+      position: 1
+    doc: Input raw 32-bit LOFAR BF HDF5 files
+
+  - id: output_dir
+    type: string
+    inputBinding:
+      prefix: --output_dir=
+      separate: false
+    doc: Output directory for the digitized files
+    
+  - id: nsigma
+    type: float?
+    inputBinding:
+      prefix: --nsigma=
+      separate: false
+    doc: Clip raw data above this many stddevs
+    
+outputs:
+  - id: h5out
+    type: File[]
+    doc: Output 8-bit HDF5
diff --git a/tests/digitize_input.json b/tests/digitize_input.json
new file mode 100644
index 0000000..e9cd690
--- /dev/null
+++ b/tests/digitize_input.json
@@ -0,0 +1,18 @@
+{
+"h5in": [
+{
+"class": "File", "path": "/home/kondratiev/pulp2-data/digitize/L2049631/32-bit/L2049631_SAP000_B000_S0_P000_bf.h5"
+},
+{
+"class": "File", "path": "/home/kondratiev/pulp2-data/digitize/L2049631/32-bit/L2049631_SAP000_B000_S1_P000_bf.h5"
+},
+{
+"class": "File", "path": "/home/kondratiev/pulp2-data/digitize/L2049631/32-bit/L2049631_SAP000_B000_S2_P000_bf.h5"
+},
+{
+"class": "File", "path": "/home/kondratiev/pulp2-data/digitize/L2049631/32-bit/L2049631_SAP000_B000_S3_P000_bf.h5"
+}
+],
+"output_dir": "/home/kondratiev/pulp2-data/digitize/L2049631/32-bit/8-bit",
+"nsigma": 5.1
+}
diff --git a/workflows/pulp2-xxyy-8bit-requantisation.cwl b/workflows/pulp2-xxyy-8bit-requantisation.cwl
index e69de29..af9b351 100644
--- a/workflows/pulp2-xxyy-8bit-requantisation.cwl
+++ b/workflows/pulp2-xxyy-8bit-requantisation.cwl
@@ -0,0 +1,35 @@
+cwlVersion: v1.2
+class: Workflow
+
+requirements:
+  - class: InlineJavascriptRequirement
+  - class: InitialWorkDirRequirement
+    listing: $(inputs.h5in)
+  
+inputs:
+  - id: h5in
+    type: File[]
+    doc: Input raw 32-bit LOFAR BF HDF5 files
+
+  - id: output_dir
+    type: string
+    doc: Output directory for the digitized files
+    
+  - id: nsigma
+    type: float?
+    doc: Clip raw data above this many stddevs
+
+steps:
+  digitize:
+    run: digitize.cwl
+    in:
+       h5in: h5in
+       output_dir: output_dir
+       nsigma: nsigma
+    out: [h5out]
+    
+outputs:
+  - id: h5out
+    type: File[]
+    outputSource: digitize/h5out
+    doc: Output 8-bit HDF5
-- 
GitLab


From f64b3b8641ddcd069697de57f3c4f106d678d28a Mon Sep 17 00:00:00 2001
From: Vlad Kondratiev <vlad.kondratiev@gmail.com>
Date: Thu, 6 Feb 2025 22:56:53 +0100
Subject: [PATCH 2/7] updated to make things work. Test does work in CEP4
 environment

---
 scripts/digitize3.py                         |  4 +++
 steps/digitize.cwl                           | 37 +++++++++++++-------
 tests/digitize_input.json                    | 17 ++++++++-
 workflows/pulp2-xxyy-8bit-requantisation.cwl | 31 ++++++++++++----
 4 files changed, 69 insertions(+), 20 deletions(-)

diff --git a/scripts/digitize3.py b/scripts/digitize3.py
index eed993f..d432f96 100755
--- a/scripts/digitize3.py
+++ b/scripts/digitize3.py
@@ -87,6 +87,10 @@ def convert_dtype(files, outdir, nsig=5., direction='f2i', check=True, verbose=N
 
     assert direction in ['f2i', 'i2f']
 
+    # check if outdir directory exists
+    if not os.path.isdir(outdir):
+        os.makedirs(outdir)
+
     for fname in files:
         fraw = fname.replace('.h5', '.raw')
         fraw_out = os.path.join(outdir, os.path.basename(fraw))
diff --git a/steps/digitize.cwl b/steps/digitize.cwl
index 692d66a..4f30c99 100644
--- a/steps/digitize.cwl
+++ b/steps/digitize.cwl
@@ -10,26 +10,23 @@ doc: |
 requirements:
   - class: InlineJavascriptRequirement
   - class: InitialWorkDirRequirement
-    listing: $(inputs.h5in)
-  
-baseCommand: digitize3.py
+    listing:
+      - entry: $(inputs.h5in)
+      - entry: $(inputs.raw_in)
 
-#arguments: ["-o", $(runtime.outdir)]
+baseCommand: digitize3.py
 arguments: ["--verbose"]
-#arguments:
-#  - prefix: --output_dir
-#    $(runtime.outdir)
-#  - prefix: --verbose
-
-#arguments:
-#  - prefix: --verbose
 
 inputs:
   - id: h5in
     type: File[]
     inputBinding:
       position: 1
-    doc: Input raw 32-bit LOFAR BF HDF5 files
+    doc: Input LOFAR BF .h5 files
+
+  - id: raw_in
+    type: File[]
+    doc: Input raw 32-bit LOFAR BF .raw files
 
   - id: output_dir
     type: string
@@ -46,6 +43,20 @@ inputs:
     doc: Clip raw data above this many stddevs
     
 outputs:
+  - id: odir
+    type: Directory
+    doc: directory for output 8-bit .h5 and .raw files
+    outputBinding:
+      glob: "$(inputs.output_dir)"
+
   - id: h5out
     type: File[]
-    doc: Output 8-bit HDF5
+    doc: Output 8-bit .h5 files
+    outputBinding:
+      glob: "$(inputs.output_dir)/*.h5"
+
+  - id: raw_out
+    type: File[]
+    doc: Output 8-bit .raw files
+    outputBinding:
+      glob: "$(inputs.output_dir)/*.raw"
diff --git a/tests/digitize_input.json b/tests/digitize_input.json
index e9cd690..c031506 100644
--- a/tests/digitize_input.json
+++ b/tests/digitize_input.json
@@ -13,6 +13,21 @@
 "class": "File", "path": "/home/kondratiev/pulp2-data/digitize/L2049631/32-bit/L2049631_SAP000_B000_S3_P000_bf.h5"
 }
 ],
-"output_dir": "/home/kondratiev/pulp2-data/digitize/L2049631/32-bit/8-bit",
+"raw_in": [
+{
+"class": "File", "path": "/home/kondratiev/pulp2-data/digitize/L2049631/32-bit/L2049631_SAP000_B000_S0_P000_bf.raw"
+},
+{
+"class": "File", "path": "/home/kondratiev/pulp2-data/digitize/L2049631/32-bit/L2049631_SAP000_B000_S1_P000_bf.raw"
+},
+{
+"class": "File", "path": "/home/kondratiev/pulp2-data/digitize/L2049631/32-bit/L2049631_SAP000_B000_S2_P000_bf.raw"
+},
+{
+"class": "File", "path": "/home/kondratiev/pulp2-data/digitize/L2049631/32-bit/L2049631_SAP000_B000_S3_P000_bf.raw"
+}
+],
+#"output_dir": "/home/kondratiev/pulp2-data/digitize/L2049631/32-bit/8-bit",
+"output_dir": "8-bit",
 "nsigma": 5.1
 }
diff --git a/workflows/pulp2-xxyy-8bit-requantisation.cwl b/workflows/pulp2-xxyy-8bit-requantisation.cwl
index af9b351..113ce60 100644
--- a/workflows/pulp2-xxyy-8bit-requantisation.cwl
+++ b/workflows/pulp2-xxyy-8bit-requantisation.cwl
@@ -3,16 +3,19 @@ class: Workflow
 
 requirements:
   - class: InlineJavascriptRequirement
-  - class: InitialWorkDirRequirement
-    listing: $(inputs.h5in)
   
 inputs:
   - id: h5in
     type: File[]
-    doc: Input raw 32-bit LOFAR BF HDF5 files
+    doc: Input LOFAR BF .h5 files
+
+  - id: raw_in
+    type: File[]
+    doc: Input raw 32-bit LOFAR BF .raw files
 
   - id: output_dir
     type: string
+    default: "8-bit"
     doc: Output directory for the digitized files
     
   - id: nsigma
@@ -21,15 +24,31 @@ inputs:
 
 steps:
   digitize:
-    run: digitize.cwl
+    run: ../steps/digitize.cwl
+    label: Converts 32-bit raw LOFAR BF XXYY data to 8-bit
+    doc: |
+      This tool only performs digitizes raw 32-bit LOFAR BF XXYY data to int8.
+      Takes argument list of HDF5 header files, raw files determined by filename.replace('.h5','.raw').
+      The digitized output copies the HDF5 file to output directory, but with the processing information.
     in:
        h5in: h5in
+       raw_in: raw_in
        output_dir: output_dir
        nsigma: nsigma
-    out: [h5out]
-    
+    out: [odir, h5out, raw_out]
+
 outputs:
   - id: h5out
     type: File[]
     outputSource: digitize/h5out
     doc: Output 8-bit HDF5
+
+  - id: raw_out
+    type: File[]
+    outputSource: digitize/raw_out
+    doc: Output 8-bit .raw files
+
+  - id: odir
+    type: Directory
+    outputSource: digitize/odir
+    doc: directory for output 8-bit .h5 and .raw files
-- 
GitLab


From 7723aebb4bc57da0f4ba12e7f785f09c96488bf0 Mon Sep 17 00:00:00 2001
From: Vlad Kondratiev <kondratiev@astron.nl>
Date: Mon, 17 Mar 2025 13:23:20 +0000
Subject: [PATCH 3/7] Apply 1 suggestion(s) to 1 file(s)

Co-authored-by: Mick Veldhuis <veldhuis@astron.nl>
---
 scripts/digitize3.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/digitize3.py b/scripts/digitize3.py
index d432f96..ff6965f 100755
--- a/scripts/digitize3.py
+++ b/scripts/digitize3.py
@@ -199,7 +199,7 @@ def convert_dtype(files, outdir, nsig=5., direction='f2i', check=True, verbose=N
                         del h5[sap][beam]["%s_i2f" % stokes]
 
 def convert_dataset_dtype(dataset, dtype):
-    # delete and recreate the dataset with a new dtype, preserving everything including external storage
+    """delete and recreate the dataset with a new dtype, preserving everything including external storage"""
     #name = dataset.name
     name = dataset.name.encode("utf-8")
     parent = dataset.parent
-- 
GitLab


From 6177d7256ec83b98848ade1efa7f3af27cf535fa Mon Sep 17 00:00:00 2001
From: Vlad Kondratiev <kondratiev@astron.nl>
Date: Mon, 17 Mar 2025 13:36:19 +0000
Subject: [PATCH 4/7] Apply 1 suggestion(s) to 1 file(s)

Co-authored-by: Mick Veldhuis <veldhuis@astron.nl>
---
 scripts/digitize3.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/digitize3.py b/scripts/digitize3.py
index ff6965f..cb4fe74 100755
--- a/scripts/digitize3.py
+++ b/scripts/digitize3.py
@@ -233,7 +233,7 @@ def lofar_h5info(fname, check=True, verbose=None):
             *_SAP???_B???_S?_*
 
     """
-    f5 = h5py.File(fname, 'r')
+    with h5py.File(fname, 'r') as f5:
 
     # get SAP, beam, stokes from filename
     info = re.search(r'SAP\d{3}_B\d{3}_S\d{1}_', fname)
-- 
GitLab


From f63f35a46e8faef976ab6fd1a710a9257ffcb795 Mon Sep 17 00:00:00 2001
From: Vlad Kondratiev <vlad.kondratiev@gmail.com>
Date: Mon, 24 Mar 2025 20:39:26 +0100
Subject: [PATCH 5/7] comments by Mick Veldhuis are addressed from MR 
 https://git.astron.nl/RD/pulp2-cwl/-/merge_requests/1

---
 docker/Dockerfile                            |  4 +-
 scripts/digitize3.py                         | 86 ++++++++++----------
 tests/digitize_input.json                    | 61 +++++++-------
 workflows/pulp2-xxyy-8bit-requantisation.cwl |  1 +
 4 files changed, 76 insertions(+), 76 deletions(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index a34c420..3f9614d 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -21,7 +21,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \
 #RUN cd ${INSTALL_DIR} && wget https://files.pythonhosted.org/packages/cc/0c/5c2b0a88158682aeafb10c1c2b735df5bc31f165bfe192f2ee9f2a23b5f1/h5py-3.12.1.tar.gz && \
 #    tar xvfz h5py-3.12.1.tar.gz && cd h5py-3.12.1 && python setup.py install --prefix=${INSTALL_DIR}
 	
-RUN cd ${INSTALL_DIR} && mkdir -p src && cd src/ && git clone https://git.astron.nl/kondratiev/lofar-scripts-repo && \
-    cd lofar-scripts-repo && cp digitize3.py ${INSTALL_DIR}/bin
+RUN cd ${INSTALL_DIR} && mkdir -p src && cd src/ && git clone https://git.astron.nl/RD/pulp2-cwl && \
+    cd pulp2-cwl/scripts && cp digitize3.py ${INSTALL_DIR}/bin
 
 RUN digitize3.py -h
diff --git a/scripts/digitize3.py b/scripts/digitize3.py
index cb4fe74..9e14196 100755
--- a/scripts/digitize3.py
+++ b/scripts/digitize3.py
@@ -30,9 +30,6 @@ Version 1.4
    forcible channel equalization.
 """
 
-#from __future__ import division, print_function
-#from __future__ import with_statement
-
 import argparse
 import numpy as np
 import os, sys
@@ -66,6 +63,8 @@ def convert_dtype(files, outdir, nsig=5., direction='f2i', check=True, verbose=N
 
     check : Check the h5 structure is consistent with the filename.
             (default: True)
+   
+    verbose : if True provides more verbose output
 
     Notes:
      * we can only process files with one SUB_ARRAY_POINTING, one Beam, and one STOKES
@@ -199,8 +198,9 @@ def convert_dtype(files, outdir, nsig=5., direction='f2i', check=True, verbose=N
                         del h5[sap][beam]["%s_i2f" % stokes]
 
 def convert_dataset_dtype(dataset, dtype):
-    """delete and recreate the dataset with a new dtype, preserving everything including external storage"""
-    #name = dataset.name
+    """
+    delete and recreate the dataset with a new dtype, preserving everything including external storage    
+    """
     name = dataset.name.encode("utf-8")
     parent = dataset.parent
     space = dataset.id.get_space()
@@ -231,47 +231,48 @@ def lofar_h5info(fname, check=True, verbose=None):
             the filename. For instance, there is only one
             SUB_ARRAY_POINTING and only one BEAM per file.
             *_SAP???_B???_S?_*
+    verbose : if True provides more verbose output
 
     """
     with h5py.File(fname, 'r') as f5:
 
-    # get SAP, beam, stokes from filename
-    info = re.search(r'SAP\d{3}_B\d{3}_S\d{1}_', fname)
-    if info is None:
-        print("small warning, filename conventions not met. ", end='')
-        #if not check:
-        #    print("Processing first SAP, BEAM, and STOKES.")
-
-    # get SAP, beam, stokes from hdf5 file.
-    # recall, we assume there is only one SAP, one BEAM and one STOKES per file
-    h5saps = sorted([i for i in f5.keys() if 'SUB_ARRAY_POINTING' in i])
-    s0 = f5[h5saps[0]]
-    h5beams = sorted([i for i in s0.keys() if 'BEAM' in i])
-    b0 = s0[h5beams[0]]
-    h5stokes = sorted([i for i in b0.keys() if 'STOKES' in i and 'i2f' not in i])
-    st0 = b0[h5stokes[0]]
-
-    if check:
-        force_fileconsistency(fname, h5saps, h5beams, h5stokes)
-    if verbose >= 1:
-        print("File produced by %s" % f5.attrs['BF_VERSION'])
-
-    # future fix: currently only handle NOF_SUBBANDS
-    # with 1 channel in each subband
-    nchan = st0.attrs['NOF_SUBBANDS']
-    if st0.attrs['DATATYPE']=='int8':
-        # some files exist where the dtype is set wrong
-        # in particular, the dtype can still be <f4 but the data
-        # might be integers
-        dtype = np.dtype('int8')
-    else:
-        # Try to infer the correct byte order from the dtype
-        #dtype = _lofar_dtypes[st0.attrs['DATATYPE']]
-        dtype = st0.dtype
-    # process sets of ~32 MB, hence // np.log2(nchan)
-    ntint = 2**25//4//int(np.log2(nchan))  # 16=~nchan -> power of 2, but sets of ~32 MB
-    nsamples = b0.attrs['NOF_SAMPLES']
-    f5.close()
+        # get SAP, beam, stokes from filename
+        info = re.search(r'SAP\d{3}_B\d{3}_S\d{1}_', fname)
+        if info is None:
+            print("small warning, filename conventions not met. ", end='')
+            #if not check:
+            #    print("Processing first SAP, BEAM, and STOKES.")
+
+        # get SAP, beam, stokes from hdf5 file.
+        # recall, we assume there is only one SAP, one BEAM and one STOKES per file
+        h5saps = sorted([i for i in f5.keys() if 'SUB_ARRAY_POINTING' in i])
+        s0 = f5[h5saps[0]]
+        h5beams = sorted([i for i in s0.keys() if 'BEAM' in i])
+        b0 = s0[h5beams[0]]
+        h5stokes = sorted([i for i in b0.keys() if 'STOKES' in i and 'i2f' not in i])
+        st0 = b0[h5stokes[0]]
+
+        if check:
+            force_fileconsistency(fname, h5saps, h5beams, h5stokes)
+        if verbose >= 1:
+            print("File produced by %s" % f5.attrs['BF_VERSION'])
+
+        # future fix: currently only handle NOF_SUBBANDS
+        # with 1 channel in each subband
+        nchan = st0.attrs['NOF_SUBBANDS']
+        if st0.attrs['DATATYPE']=='int8':
+            # some files exist where the dtype is set wrong
+            # in particular, the dtype can still be <f4 but the data
+            # might be integers
+            dtype = np.dtype('int8')
+        else:
+            # Try to infer the correct byte order from the dtype
+            #dtype = _lofar_dtypes[st0.attrs['DATATYPE']]
+            dtype = st0.dtype
+        # process sets of ~32 MB, hence // np.log2(nchan)
+        ntint = 2**25//4//int(np.log2(nchan))  # 16=~nchan -> power of 2, but sets of ~32 MB
+        nsamples = b0.attrs['NOF_SAMPLES']
+        
     return (nchan, ntint, dtype, nsamples, h5saps[0], h5beams[0], h5stokes[0])
 
 
@@ -344,7 +345,6 @@ def CL_parser():
                         'structure for SUB_ARRAY_POINTING_???, BEAM_???, and STOKES_? .')
 
     parser.add_argument('-v', '--verbose', action='append_const', const=1)
-#    return parser.parse_args()
     return parser.parse_args(args=None if sys.argv[1:] else ['--help'])
 
 if __name__ == '__main__':
diff --git a/tests/digitize_input.json b/tests/digitize_input.json
index c031506..67dd7b1 100644
--- a/tests/digitize_input.json
+++ b/tests/digitize_input.json
@@ -1,33 +1,32 @@
 {
-"h5in": [
-{
-"class": "File", "path": "/home/kondratiev/pulp2-data/digitize/L2049631/32-bit/L2049631_SAP000_B000_S0_P000_bf.h5"
-},
-{
-"class": "File", "path": "/home/kondratiev/pulp2-data/digitize/L2049631/32-bit/L2049631_SAP000_B000_S1_P000_bf.h5"
-},
-{
-"class": "File", "path": "/home/kondratiev/pulp2-data/digitize/L2049631/32-bit/L2049631_SAP000_B000_S2_P000_bf.h5"
-},
-{
-"class": "File", "path": "/home/kondratiev/pulp2-data/digitize/L2049631/32-bit/L2049631_SAP000_B000_S3_P000_bf.h5"
-}
-],
-"raw_in": [
-{
-"class": "File", "path": "/home/kondratiev/pulp2-data/digitize/L2049631/32-bit/L2049631_SAP000_B000_S0_P000_bf.raw"
-},
-{
-"class": "File", "path": "/home/kondratiev/pulp2-data/digitize/L2049631/32-bit/L2049631_SAP000_B000_S1_P000_bf.raw"
-},
-{
-"class": "File", "path": "/home/kondratiev/pulp2-data/digitize/L2049631/32-bit/L2049631_SAP000_B000_S2_P000_bf.raw"
-},
-{
-"class": "File", "path": "/home/kondratiev/pulp2-data/digitize/L2049631/32-bit/L2049631_SAP000_B000_S3_P000_bf.raw"
-}
-],
-#"output_dir": "/home/kondratiev/pulp2-data/digitize/L2049631/32-bit/8-bit",
-"output_dir": "8-bit",
-"nsigma": 5.1
+    "h5in": [
+        {
+            "class": "File", "path": "/data/kondratiev/pulp2-data/digitize/L2049631/32-bit/L2049631_SAP000_B000_S0_P000_bf.h5"
+        },
+        {
+            "class": "File", "path": "/data/kondratiev/pulp2-data/digitize/L2049631/32-bit/L2049631_SAP000_B000_S1_P000_bf.h5"
+        },
+        {
+            "class": "File", "path": "/data/kondratiev/pulp2-data/digitize/L2049631/32-bit/L2049631_SAP000_B000_S2_P000_bf.h5"
+        },
+        {
+            "class": "File", "path": "/data/kondratiev/pulp2-data/digitize/L2049631/32-bit/L2049631_SAP000_B000_S3_P000_bf.h5"
+        }
+    ],
+    "raw_in": [
+        {
+            "class": "File", "path": "/data/kondratiev/pulp2-data/digitize/L2049631/32-bit/L2049631_SAP000_B000_S0_P000_bf.raw"
+        },
+        {
+            "class": "File", "path": "/data/kondratiev/pulp2-data/digitize/L2049631/32-bit/L2049631_SAP000_B000_S1_P000_bf.raw"
+        },
+        {
+            "class": "File", "path": "/data/kondratiev/pulp2-data/digitize/L2049631/32-bit/L2049631_SAP000_B000_S2_P000_bf.raw"
+        },
+        {
+            "class": "File", "path": "/data/kondratiev/pulp2-data/digitize/L2049631/32-bit/L2049631_SAP000_B000_S3_P000_bf.raw"
+        }
+    ],
+    "output_dir": "8-bit",
+    "nsigma": 5.1
 }
diff --git a/workflows/pulp2-xxyy-8bit-requantisation.cwl b/workflows/pulp2-xxyy-8bit-requantisation.cwl
index 113ce60..f03a8b3 100644
--- a/workflows/pulp2-xxyy-8bit-requantisation.cwl
+++ b/workflows/pulp2-xxyy-8bit-requantisation.cwl
@@ -20,6 +20,7 @@ inputs:
     
   - id: nsigma
     type: float?
+    default: 5.0
     doc: Clip raw data above this many stddevs
 
 steps:
-- 
GitLab


From 856ef587dd955ccfc79637eed4b6570b75283436 Mon Sep 17 00:00:00 2001
From: Vlad Kondratiev <vlad.kondratiev@gmail.com>
Date: Tue, 25 Mar 2025 10:09:56 +0100
Subject: [PATCH 6/7] using COPY to put digitize3.py to bin/ instead of cloning
 the repo

---
 docker/Dockerfile | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index 3f9614d..301597e 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -21,7 +21,8 @@ RUN export DEBIAN_FRONTEND=noninteractive && \
 #RUN cd ${INSTALL_DIR} && wget https://files.pythonhosted.org/packages/cc/0c/5c2b0a88158682aeafb10c1c2b735df5bc31f165bfe192f2ee9f2a23b5f1/h5py-3.12.1.tar.gz && \
 #    tar xvfz h5py-3.12.1.tar.gz && cd h5py-3.12.1 && python setup.py install --prefix=${INSTALL_DIR}
 	
-RUN cd ${INSTALL_DIR} && mkdir -p src && cd src/ && git clone https://git.astron.nl/RD/pulp2-cwl && \
-    cd pulp2-cwl/scripts && cp digitize3.py ${INSTALL_DIR}/bin
+#RUN cd ${INSTALL_DIR} && mkdir -p src && cd src/ && git clone https://git.astron.nl/RD/pulp2-cwl && \
+#    cd pulp2-cwl/scripts && cp digitize3.py ${INSTALL_DIR}/bin
+COPY ./scripts/digitize3.py ${INSTALL_DIR}/bin
 
 RUN digitize3.py -h
-- 
GitLab


From 2c04e2db708d33b96248d7c6accfbde62d1aa1ea Mon Sep 17 00:00:00 2001
From: Vlad Kondratiev <vlad.kondratiev@gmail.com>
Date: Tue, 25 Mar 2025 10:17:25 +0100
Subject: [PATCH 7/7] changed odir to out_dir in outputs for both digitze.cwl
 step and the workflow

---
 steps/digitize.cwl                           | 2 +-
 workflows/pulp2-xxyy-8bit-requantisation.cwl | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/steps/digitize.cwl b/steps/digitize.cwl
index 4f30c99..ec7d415 100644
--- a/steps/digitize.cwl
+++ b/steps/digitize.cwl
@@ -43,7 +43,7 @@ inputs:
     doc: Clip raw data above this many stddevs
     
 outputs:
-  - id: odir
+  - id: out_dir
     type: Directory
     doc: directory for output 8-bit .h5 and .raw files
     outputBinding:
diff --git a/workflows/pulp2-xxyy-8bit-requantisation.cwl b/workflows/pulp2-xxyy-8bit-requantisation.cwl
index f03a8b3..b2856ae 100644
--- a/workflows/pulp2-xxyy-8bit-requantisation.cwl
+++ b/workflows/pulp2-xxyy-8bit-requantisation.cwl
@@ -49,7 +49,7 @@ outputs:
     outputSource: digitize/raw_out
     doc: Output 8-bit .raw files
 
-  - id: odir
+  - id: out_dir
     type: Directory
-    outputSource: digitize/odir
+    outputSource: digitize/out_dir
     doc: directory for output 8-bit .h5 and .raw files
-- 
GitLab