From b066493cce4032320b4fc1ec4f348eff19b246cc Mon Sep 17 00:00:00 2001 From: mevius <mevius@astron.nl> Date: Fri, 12 May 2023 22:29:24 +0200 Subject: [PATCH] data reading in chunks for large (raw) data files --- scintillation/averaging.py | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/scintillation/averaging.py b/scintillation/averaging.py index 9c67824..bb3d263 100644 --- a/scintillation/averaging.py +++ b/scintillation/averaging.py @@ -169,7 +169,7 @@ def extract_root_metadata(dataset): metadata['OBSERVATION_END_UTC'] = metadata['OBSERVATION_END_UTC'].split(' ')[0] if not 'TARGET' in list(metadata.keys()): if 'TARGETS' in list(metadata.keys()): - metadata['TARGET']=metadata['TARGETS'][0] + metadata['TARGET']=metadata['TARGETS'][-1].split("_")[0] return metadata @@ -517,9 +517,12 @@ def split_samples(dynspec_name, station_name, *_ = metadata['OBSERVATION_STATIONS_LIST'] if 'DATA' in dataset[dynspec_name]: - data_array = dataset[dynspec_name]['DATA'][:, :, 0] + data_array = dataset[dynspec_name]['DATA'] + nofch = 1 #TODO check if this is always the case for DYNSPEC data else: - data_array = dataset[dynspec_name]['BEAM_000']['STOKES_0'][:, :] + data_array = dataset[dynspec_name]['BEAM_000']['STOKES_0'] + #take median over channels for raw data + nofch = dataset[dynspec_name]['BEAM_000']['STOKES_0'].attrs['NOF_CHANNELS'][0] averaging_window_in_samples = int(numpy.ceil(averaging_window / time_delta)) averaging_window_in_seconds = averaging_window_in_samples * time_delta S4_60s_window_in_samples = int(60. / time_delta) @@ -533,7 +536,6 @@ def split_samples(dynspec_name, time_obs = numpy.linspace(obs_start_time.timestamp(), obs_end_time.timestamp(), total_time_samples) n_samples = int((end_obs_datetime - start_obs_datetime).seconds // sample_window) - for i in range(n_samples): start_sample_datetime = round_down_datetime( start_obs_datetime + timedelta(seconds=sample_window * i), @@ -567,10 +569,21 @@ def split_samples(dynspec_name, compute_start_end_azimuth_elevation(sample_info) sample_rate = int( averaging_window_in_samples * 3. / averaging_window_in_seconds) + logging.info("taking median over %d channels",nofch) + if nofch>1: + nsb = data_array.shape[1]//nofch + tmp_data = data_array[start_index:end_index] + #fill again in loop since otherwise some data is filled with zeros, not understood why + for ich in range(nofch): + tmp_data[:,ich*nsb:(ich+1)*nsb] = data_array[start_index:end_index,ich*nsb:(ich+1)*nsb] + tmp_data = numpy.median(tmp_data.reshape((-1,nsb,nofch)),axis=-1) + else: + tmp_data = data_array[start_index:end_index] + tmp_data = tmp_data.squeeze() frequency_axis = numpy.linspace(start_frequency, end_frequency, - data_array.shape[1]) + tmp_data.shape[1]) filtered_data, flags, flux, bandpass = apply_bandpass( - data_array[start_index:end_index], frequency_axis, + tmp_data, frequency_axis, freqaxis=1, timeaxis=0, target=metadata["TARGET"], sample_rate=sample_rate, # sample every 3 seconds @@ -590,8 +603,7 @@ def split_samples(dynspec_name, axis=0, has_nan=False) averaged_data_array, time_axis, frequency_axis, flags, bandpass = \ create_averaged_dataset(sample_info, - data_array[ - start_index:end_index], + tmp_data, flags, bandpass) # make sure the data is shaped to contain integer of window -- GitLab