From b066493cce4032320b4fc1ec4f348eff19b246cc Mon Sep 17 00:00:00 2001
From: mevius <mevius@astron.nl>
Date: Fri, 12 May 2023 22:29:24 +0200
Subject: [PATCH] Read data in chunks for large (raw) data files
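
Keep a reference to the HDF5 dataset and slice out each time window on
demand instead of loading the full DATA / STOKES_0 array into memory.
For raw beamformed data (NOF_CHANNELS > 1), the channels of each subband
are reduced with a median so that downstream processing sees one value
per subband. The TARGET fallback now takes the last entry of TARGETS,
truncated at the first underscore.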

---
 scintillation/averaging.py | 28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/scintillation/averaging.py b/scintillation/averaging.py
index 9c67824..bb3d263 100644
--- a/scintillation/averaging.py
+++ b/scintillation/averaging.py
@@ -169,7 +169,7 @@ def extract_root_metadata(dataset):
     metadata['OBSERVATION_END_UTC'] = metadata['OBSERVATION_END_UTC'].split(' ')[0]
     if not 'TARGET' in list(metadata.keys()):
         if 'TARGETS' in list(metadata.keys()):
-            metadata['TARGET']=metadata['TARGETS'][0]
+            metadata['TARGET'] = metadata['TARGETS'][-1].split("_")[0]
     return metadata
 
 
@@ -517,9 +517,12 @@ def split_samples(dynspec_name,
         station_name, *_ = metadata['OBSERVATION_STATIONS_LIST']
 
     if 'DATA' in dataset[dynspec_name]:
-        data_array = dataset[dynspec_name]['DATA'][:, :, 0]
+        data_array = dataset[dynspec_name]['DATA']
+        nofch = 1  # TODO: check whether DYNSPEC data always has a single channel per subband
     else:
-        data_array = dataset[dynspec_name]['BEAM_000']['STOKES_0'][:, :]
+        data_array = dataset[dynspec_name]['BEAM_000']['STOKES_0']
+        # raw data: the median over the channels of each subband is taken later, per time chunk
+        nofch = dataset[dynspec_name]['BEAM_000']['STOKES_0'].attrs['NOF_CHANNELS'][0]
     averaging_window_in_samples = int(numpy.ceil(averaging_window / time_delta))
     averaging_window_in_seconds = averaging_window_in_samples * time_delta
     S4_60s_window_in_samples = int(60. / time_delta)
@@ -533,7 +536,6 @@ def split_samples(dynspec_name,
     time_obs = numpy.linspace(obs_start_time.timestamp(), obs_end_time.timestamp(),
                               total_time_samples)
     n_samples = int((end_obs_datetime - start_obs_datetime).seconds // sample_window)
-
     for i in range(n_samples):
         start_sample_datetime = round_down_datetime(
             start_obs_datetime + timedelta(seconds=sample_window * i),
@@ -567,10 +569,21 @@ def split_samples(dynspec_name,
         compute_start_end_azimuth_elevation(sample_info)
         sample_rate = int(
             averaging_window_in_samples * 3. / averaging_window_in_seconds)
+        if nofch > 1:
+            logging.info("taking median over %d channels per subband", nofch)
+            nsb = data_array.shape[1] // nofch
+            tmp_data = data_array[start_index:end_index]
+            # fill again chunk by chunk: slicing the whole block at once sometimes yields zero-filled data (cause not understood)
+            for ich in range(nofch):
+                tmp_data[:, ich * nsb:(ich + 1) * nsb] = data_array[start_index:end_index, ich * nsb:(ich + 1) * nsb]
+            tmp_data = numpy.median(tmp_data.reshape((-1, nsb, nofch)), axis=-1)
+        else:
+            tmp_data = data_array[start_index:end_index]
+        tmp_data = tmp_data.squeeze()
         frequency_axis = numpy.linspace(start_frequency, end_frequency,
-                                        data_array.shape[1])
+                                        tmp_data.shape[1])
         filtered_data, flags, flux, bandpass = apply_bandpass(
-            data_array[start_index:end_index], frequency_axis,
+            tmp_data, frequency_axis,
             freqaxis=1, timeaxis=0, target=metadata["TARGET"],
             sample_rate=sample_rate,
             # sample every 3 seconds
@@ -590,8 +603,7 @@ def split_samples(dynspec_name,
                                  axis=0, has_nan=False)
         averaged_data_array, time_axis, frequency_axis, flags, bandpass = \
             create_averaged_dataset(sample_info,
-                                    data_array[
-                                    start_index:end_index],
+                                    tmp_data,
                                     flags,
                                     bandpass)  # make sure the data is shaped to contain integer of window
 
-- 
GitLab
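
A minimal standalone sketch of the chunked-read-plus-channel-median
pattern this patch introduces. The file path and group names
("DYNSPEC_000") are illustrative, and it assumes the frequency axis is
ordered with the NOF_CHANNELS channels of each subband contiguous, as
the patch's reshape does:

    import h5py
    import numpy

    def read_sample_median(path, start_index, end_index):
        """Illustrative sketch: read one time chunk of raw STOKES_0 data
        and reduce each subband's channels with a median."""
        with h5py.File(path, "r") as dataset:
            # h5py Dataset handle; no data is read yet
            stokes = dataset["DYNSPEC_000"]["BEAM_000"]["STOKES_0"]
            nofch = stokes.attrs["NOF_CHANNELS"][0]   # channels per subband
            data = stokes[start_index:end_index]      # reads only this window
            if nofch > 1:
                nsb = data.shape[1] // nofch          # number of subbands
                # (time, nsb * nofch) -> (time, nsb, nofch); median over channels
                data = numpy.median(data.reshape((-1, nsb, nofch)), axis=-1)
            return data

Slicing the h5py Dataset per time window is what keeps memory bounded
for large raw files; the patch's chunk-by-chunk re-read workaround is
omitted here for brevity.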