Commit a8af4dae authored by Jan David Mol

bug 1362: paper update

parent 73f6d27c
@@ -11,4 +11,4 @@ V 12 -> 16 int factor in graphs and text
 V c vs asm beamformer speedup = 16
-- ionproc stats & graph
+V ionproc stats & graph
@@ -7,13 +7,13 @@
 5 = IQUV, 64s, 10b
 6 = I, 4i, 64s, 42b
-pre_bf_dsp_&_I/O -> 3.44 % 4.69 % 7.70 % 4.07 % 6.72 % 7.04 %
-beam_forming -> 14.36 % 49.19 % 67.27 % 2.19 % 4.56 % 17.50 %
-coh_dedispersion -> 0 % 0 % 0 % 2.76 % 2.12 % 9.18 %
-stokes -> 45.21 % 26.98 % 13.32 % 0.27 % 0.20 % 0.66 %
-2nd_xpose -> 5.87 % 2.73 % 0.85 % 2.88 % 3.40 % 3.37 %
-stokes_reorder -> 4.99 % 4.44 % 2.11 % 5.77 % 4.45 % 4.68 %
 output_I/O -> 1.45 % 0.48 % 0.20 % 0.80 % 0.36 % 0.38 %
+stokes_reorder -> 4.99 % 4.44 % 2.11 % 5.77 % 4.45 % 4.68 %
+2nd_xpose -> 5.87 % 2.73 % 0.85 % 2.88 % 3.40 % 3.37 %
+stokes -> 45.21 % 26.98 % 13.32 % 0.27 % 0.20 % 0.66 %
+coh_dedispersion -> 0 % 0 % 0 % 2.76 % 2.12 % 9.18 %
+beam_forming -> 14.36 % 49.19 % 67.27 % 2.19 % 4.56 % 17.50 %
+pre_bf_dsp_&_I/O -> 3.44 % 4.69 % 7.70 % 4.07 % 6.72 % 7.04 %
 The above data is read from this file and accumulated using the following command, where N is the line number we need (each curve plots the running total of lines 1..N, so the bars stack):
@@ -53,55 +53,61 @@ Case (see Table 1)
 *)
 newcurve
-(* output core pure I/O *)
+(* input core I/O + prebf dsp *)
-marktype xbar marksize 0.8 fill 0 pattern solid cfill 1 0 0
+marktype xbar marksize 0.8 fill 0 pattern solid cfill 0.494117618 0 0.129411757
 pts
 shell : awk -v N=7 '/->/ {a+=$3;b+=$5;c+=$7;d+=$9;e+=$11;f+=$13;n++; if(n==N) print "1 " a " 2 " b " 3 " c " 4 " d " 5 " e " 6 " f;}' <execution_times.jgr
-label : Output to I/O node
+label : 1st all-to-all exchange & \
+input handling
 newcurve
-(* output reorder *)
+(* beam forming *)
-marktype xbar marksize 0.8 fill 0.8 pattern solid cfill 1 1 0
+marktype xbar marksize 0.8 fill 0.8 pattern solid cfill 1 .5 1
 pts
 shell : awk -v N=6 '/->/ {a+=$3;b+=$5;c+=$7;d+=$9;e+=$11;f+=$13;n++; if(n==N) print "1 " a " 2 " b " 3 " c " 4 " d " 5 " e " 6 " f;}' <execution_times.jgr
-label : Output reordering
+label : Beam forming
 newcurve
-(* 2nd xpose *)
+(* coh dd *)
-marktype xbar marksize 0.8 fill 1 pattern solid cfill 0 0.7 0
+marktype xbar marksize 0.8 fill 1 pattern solid cfill 0 0.270588219 0.525490165
 pts
 shell : awk -v N=5 '/->/ {a+=$3;b+=$5;c+=$7;d+=$9;e+=$11;f+=$13;n++; if(n==N) print "1 " a " 2 " b " 3 " c " 4 " d " 5 " e " 6 " f;}' <execution_times.jgr
-label : 2nd all-to-all exchange
+label : Channel dedispersion \
+(cases D-F)
 newcurve
 (* stokes calculation *)
-marktype xbar marksize 0.8 pattern stripe 45 cfill 0 1 1
+marktype xbar marksize 0.8 pattern solid cfill 1 0.258823514 0.0549019575
 pts
 shell : awk -v N=4 '/->/ {a+=$3;b+=$5;c+=$7;d+=$9;e+=$11;f+=$13;n++; if(n==N) print "1 " a " 2 " b " 3 " c " 4 " d " 5 " e " 6 " f;}' <execution_times.jgr
 label : Stokes calculations
 newcurve
-(* coh dd *)
+(* 2nd xpose *)
-marktype xbar marksize 0.8 fill 0.2 pattern solid cfill 1 0 1
+marktype xbar marksize 0.8 fill 0.2 pattern solid cfill 1 0.827450931 0.125490189
 pts
 shell : awk -v N=3 '/->/ {a+=$3;b+=$5;c+=$7;d+=$9;e+=$11;f+=$13;n++; if(n==N) print "1 " a " 2 " b " 3 " c " 4 " d " 5 " e " 6 " f;}' <execution_times.jgr
-label : Channel dedispersion \
-(cases D-F)
+label : 2nd all-to-all exchange
 newcurve
-(* beam forming *)
+(* output reorder *)
-marktype xbar marksize 0.8 fill 0.5 pattern solid cfill 0 0 1
+marktype xbar marksize 0.8 fill 0.5 pattern solid cfill 0.34117645 0.615686238 0.109803915
 pts
 shell : awk -v N=2 '/->/ {a+=$3;b+=$5;c+=$7;d+=$9;e+=$11;f+=$13;n++; if(n==N) print "1 " a " 2 " b " 3 " c " 4 " d " 5 " e " 6 " f;}' <execution_times.jgr
-label : Beam forming
+label : Output reordering
 newcurve
-(* input core I/O + prebf dsp *)
+(* output core pure I/O *)
-marktype xbar marksize 0.8 fill 0.5 pattern stripe -45 cfill 0 0.7 0
+marktype xbar marksize 0.8 fill 0.5 pattern solid cfill 0.51372546 0.792156816 1
 pts
 shell : awk -v N=1 '/->/ {a+=$3;b+=$5;c+=$7;d+=$9;e+=$11;f+=$13;n++; if(n==N) print "1 " a " 2 " b " 3 " c " 4 " d " 5 " e " 6 " f;}' <execution_times.jgr
-label : 1st all-to-all exchange & \
-input handling
+label : Output to I/O node
 legend top defaults hjl linelength 75 x 4 y 110
...
@@ -154,35 +154,40 @@ Case (see Table 1)
 newcurve
 (* station->ion *)
-marktype xbar marksize 0.8 fill 0.5 pattern stripe -45 cfill 0 0.7 0
+marktype xbar marksize 0.8 fill 0.5 pattern solid cfill 0 0.270588219 0.525490165
 pts
 shell : awk -v N=5 '/->/ {a+=$3;b+=$5;c+=$7;d+=$9;e+=$11;f+=$13;n++; if(n==N) print "1 " a " 2 " b " 3 " c " 4 " d " 5 " e " 6 " f;}' <ionperf.jgr
 label hjl x 1 y 105 : Input from station
 newcurve
 (* delaycomp *)
-marktype xbar marksize 0.8 fill 0.5 pattern solid cfill 0 0 1
+marktype xbar marksize 0.8 fill 0.5 pattern solid cfill 1 0.258823514 0.0549019575
 pts
 shell : awk -v N=4 '/->/ {a+=$3;b+=$5;c+=$7;d+=$9;e+=$11;f+=$13;n++; if(n==N) print "1 " a " 2 " b " 3 " c " 4 " d " 5 " e " 6 " f;}' <ionperf.jgr
 label hjl x 1 y 100 : Weight computations
 newcurve
 (* ion<->cn *)
-marktype xbar marksize 0.8 pattern stripe 45 cfill 0 1 1
+marktype xbar marksize 0.8 pattern solid cfill 1 0.827450931 0.125490189
 pts
 shell : awk -v N=3 '/->/ {a+=$3;b+=$5;c+=$7;d+=$9;e+=$11;f+=$13;n++; if(n==N) print "1 " a " 2 " b " 3 " c " 4 " d " 5 " e " 6 " f;}' <ionperf.jgr
 label hjl x 1 y 95 : I/O with compute cores
 newcurve
 (* ion->storage *)
-marktype xbar marksize 0.8 fill 0.2 pattern solid cfill 1 0 1
+marktype xbar marksize 0.8 fill 0.2 pattern solid cfill 0.34117645 0.615686238 0.109803915
 pts
 shell : awk -v N=2 '/->/ {a+=$3;b+=$5;c+=$7;d+=$9;e+=$11;f+=$13;n++; if(n==N) print "1 " a " 2 " b " 3 " c " 4 " d " 5 " e " 6 " f;}' <ionperf.jgr
 label hjl x 5 y 105 : Output to storage
 newcurve
 (* interrupts *)
-marktype xbar marksize 0.8 fill 1 pattern solid cfill 0 0.7 0
+marktype xbar marksize 0.8 fill 1 pattern solid cfill 0.51372546 0.792156816 1
 pts
 shell : awk -v N=1 '/->/ {a+=$3;b+=$5;c+=$7;d+=$9;e+=$11;f+=$13;n++; if(n==N) print "1 " a " 2 " b " 3 " c " 4 " d " 5 " e " 6 " f;}' <ionperf.jgr
 label hjl x 5 y 100 : IRQ handling
...
@@ -132,15 +132,7 @@ The beam former combines the chunks from all stations, producing a chunk for eac
 %The delays are applied to the station data through complex multiplications and additions, programmed in assembly. In order to take full advantage of the L1 cache and the available registers, data is processed in sets of 6 stations, producing 3 beams, in portions of 128 samples, or a subset thereof to cover the remainders. While the exact ideal set size in which the data is to be processed is platform specific, we have shown in previous work that similar tradeoffs exist for similar problems across different architectures~\cite{Nieuwpoort:09}.
-\comment{
 \begin{listing}
-\caption{Pseudo code for the processing loops around the beam former assembly.}
-\label{lst:beam-forming}
-\end{listing}
-}
-All time-consuming pipeline components are written in assembly, to achieve maximum performance. The assembly code minimises the number of memory accesses, minimises load delays, minimises FPU pipeline stalls, and maximises instruction-level parallelism. We learnt that optimal performance is often achieved by combining multiple iterations of a multi-dimensional loops:
 \lstset{language=pseudo}
 \begin{lstlisting}{}
 FOR Channel IN 1 .. NrChannels DO
@@ -149,8 +141,11 @@ FOR Channel IN 1 .. NrChannels DO
 FOR Beam IN 1 .. NrBeams STEP 3 DO
 BeamForm6StationsAnd128TimesTo3BeamsAssembly(...)
 \end{lstlisting}
+\caption{Pseudo code for the processing loops around the beam former assembly.}
+\label{lst:beam-forming}
+\end{listing}
-This is much more efficient than to create all beams one at a time, due to better reuse of data loaded from main memory. Finding the most efficient way to group work is a combination of careful analysis and, unfortunately, trial-and-error. The coherent beam former achieves 85\% of the FPU peak performance, not as high as the 96\% of the correlator~\cite{Romein:10a}, but still 16 times more than the C++ reference implementation.
+All time-consuming pipeline components are written in assembly, to achieve maximum performance. The assembly code minimises the number of memory accesses, minimises load delays, minimises FPU pipeline stalls, and maximises instruction-level parallelism. We learnt that optimal performance is often achieved by combining multiple iterations of a multi-dimensional loop, as shown in Listing~\ref{lst:beam-forming}. This is much more efficient than creating all beams one at a time, due to better reuse of data loaded from main memory. Finding the most efficient way to group work is a combination of careful analysis and, unfortunately, trial-and-error. The coherent beam former achieves 86\% of the FPU peak performance, not as high as the 96\% of the correlator~\cite{Romein:10a}, but still 16 times more than the C++ reference implementation.
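As a rough illustration of the loop blocking that the pseudo code describes, a plain-C sketch might look as follows. This is illustrative only: the production kernel is hand-written BG/P assembly, and the array shapes, sizes, and names below are invented for this sketch.

/* Illustrative C sketch of the blocked beam-forming loops in the pseudo code
 * above.  The production kernel is hand-written BG/P assembly; the array
 * shapes, sizes and names here are invented for illustration only. */
#include <complex.h>

enum { CHANNELS = 16, STATIONS = 12, BEAMS = 6, TIMES = 256, TBLK = 128 };

/* samples[channel][station][time], weights[beam][station],
 * beams[channel][beam][time] */
static void beamform(const float complex samples[CHANNELS][STATIONS][TIMES],
                     const float complex weights[BEAMS][STATIONS],
                     float complex beams[CHANNELS][BEAMS][TIMES])
{
    for (int ch = 0; ch < CHANNELS; ch++)
      for (int t0 = 0; t0 < TIMES; t0 += TBLK)      /* 128-sample portions  */
        for (int s0 = 0; s0 < STATIONS; s0 += 6)    /* 6 stations at a time */
          for (int b0 = 0; b0 < BEAMS; b0 += 3)     /* 3 beams at a time    */
            /* One iteration of this innermost block corresponds to one call
             * of the assembly kernel: the samples of six stations stay in
             * cache and are reused for three beams, which is the data reuse
             * the paragraph above credits for the speedup. */
            for (int b = b0; b < b0 + 3; b++)
              for (int t = t0; t < t0 + TBLK; t++) {
                float complex acc = (s0 == 0) ? 0 : beams[ch][b][t];
                for (int s = s0; s < s0 + 6; s++)
                  acc += weights[b][s] * samples[ch][s][t];
                beams[ch][b][t] = acc;
              }
}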
 %Because each beam is an accumulation of the data from all stations, the bandwidth of each beam is equal to the bandwidth of data from a single station, which is 6.2~Gb/s now that the samples are 32-bit floats. Once the beams are formed, they are kept as complex voltages or transformed into the Stokes IQUV or the Stokes I parameters. In the latter case, the beams can also be integrated temporally to reduce the resulting data rate. Finally, an incoherent beam can be created in parallel, and converted into either Stokes I or Stokes IQUV parameters.
 %Our beam former supports several pipelines: \emph{complex voltages}, \emph{Stokes IQUV}, and \emph{Stokes I}. The complex voltages pipeline outputs the raw tied-array beams, which consist of two 3.1~Gb/s streams of 32-bit complex floating points numbers (floats), one stream for each polarisation. The Stokes IQUV pipeline applies a domain transformation to each sample of the raw tied-array beams, which is useful for polarisation-related studies. The four Stokes parameters, calculated through $I = X\overline{X} + Y\overline{Y}$, $Q = X\overline{X} - Y\overline{Y}$, $U = 2\mathrm{Re}(X\overline{Y})$, $V = 2\mathrm{Im}(X\overline{Y})$, with each parameter being a 32-bit float, resulting in four 1.5~Gb/s streams. The Stokes I pipeline calculates only the first Stokes parameter, which represents the signal strength in both polarisations. The Stokes I pipeline supports temporal integration to trade time resolution for a reduced bandwidth per beam, allowing more beams to be created.
@@ -217,7 +212,6 @@ We will focus our performance analysis on the most challenging cases that are of
 \vspace{-1cm}
 \end{wrapfigure}
-% TODO: numbers do not add up.. 13 beams is 80.6 Gb/s, and with 70 Gb/s we should be able to handle 11 beams
 Figure \ref{fig:stations-beams} shows the maximum number of beams that can be created when using various numbers of stations, in each of the three pipelines: complex voltages, Stokes IQUV, and Stokes I. Both the complex voltages and the Stokes IQUV pipelines are I/O bound. Each beam is 6.2~Gb/s wide. We can make at most 13 beams without exceeding the available 81~Gb/s to our storage cluster. If 64 stations are used, the available bandwidth is 70~Gb/s, because an I/O node can only output 1.1~Gb/s if it also has to process station data. The granularity with which the output can be distributed over the I/O nodes, as well as scheduling details, determine the actual number of beams that can be created, but in all cases, the beam former can create at least 10 beams at LOFAR's full observational bandwidth.
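As a back-of-the-envelope check of these I/O limits, using only the bandwidth figures quoted above:
\[
  \left\lfloor \tfrac{81~\mathrm{Gb/s}}{6.2~\mathrm{Gb/s}} \right\rfloor = 13 \text{ beams},
  \qquad
  \left\lfloor \tfrac{70~\mathrm{Gb/s}}{6.2~\mathrm{Gb/s}} \right\rfloor = 11 \text{ beams},
\]
which matches the 13-beam limit and, once the output-distribution granularity mentioned above is taken into account, the guaranteed minimum of 10 beams with 64 stations.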
 In the Stokes I pipeline, we applied several integration factors (1, 2, 4, 8, and 16) to show the trade-off between beam quality and the number of beams. Integration factors higher than 16 do not allow significantly more beams to be created, but could be used to further reduce the total output rate. For low integration factors, the beam former is again limited by the available output bandwidth. Once the Stokes I streams are integrated sufficiently, the system becomes bounded by the compute nodes: if only signals from a few stations have to be combined, the beam former is limited by the amount of available memory required to store the beams. If more input has to be combined, the beam former becomes limited by the CPU power available in the compute cores. For observations for which a high integration factor is acceptable, the beam former can create between 155 and 543 tied-array beams, depending on the number of stations used. For observations that need a high time resolution and thus a low integration factor, the beam former is still able to create at least 42 tied-array beams.
@@ -276,7 +270,7 @@ Case & Mode & Channel & Int. & Stations & Beams & Input & Output & Bound & Used
 \circlenumber{C} & Stokes I & N & 8 & 64 & 155 & 198 Gb/s & 30 Gb/s & CPU & Surveys \\
 \circlenumber{D} & Stokes IQUV & Y & - & 24 & 13 & 74 Gb/s & 81 Gb/s & I/O & Known sources \\
 \circlenumber{E} & Stokes IQUV & Y & - & 64 & 10 & 198 Gb/s & 62 Gb/s & I/O & Known sources \\
-\circlenumber{F} & Stokes I & Y & 1 & 64 & 42 & 198 Gb/s & 65 Gb/s & CPU & Known sources
+\circlenumber{F} & Stokes I & Y & 1 & 64 & 42 & 198 Gb/s & 65 Gb/s & I/O & Known sources
 \end{tabular}
 \caption{Several highlighted cases.}
 \label{table:cases}
@@ -296,7 +290,7 @@ The costs for both the first and the second all-to-all exchange are mostly hidde
 For the I/O-bound cases \circlenumber{D}\circlenumber{E}\circlenumber{F}, only a few tied-array beams are formed and transformed into Stokes I(QUV) parameters, which produces a lot of data but requires little CPU time. Enough CPU time is therefore available to include channel-level dedispersion, which scales with the number of beams and is an expensive operation.
-Figure \ref{fig:ionperf} shows the workload for the busiest I/O nodes in each case, including the time spent in the kernel to handle IRQs. Processing the station data and communicating with the compute cores cause most of the load. For cases \circlenumber{A} and \circlenumber{B}, the output is handled by other I/O nodes. Both cases form many beams, and thus require time to compute beam former weights. Cases \circlenumber{C}\circlenumber{D}\circlenumber{E}\circlenumber{F} do handle output, and show nearly identical behaviour.
+Figure \ref{fig:ionperf} shows the workload for the busiest I/O nodes in each case, including the time spent in the kernel to handle IRQs. Processing the station data and communicating with the compute cores cause most of the load. For cases \circlenumber{A} and \circlenumber{B}, the output is handled by other I/O nodes. Both cases form many beams, and thus require time to compute the positional weights that are required by the beam former. Cases \circlenumber{C}\circlenumber{D}\circlenumber{E}\circlenumber{F} do handle output, and show nearly identical behaviour.
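As background (our own assumption, not stated in the text, but consistent with the delay compensation by complex multiplication described earlier): the per-station weights are presumably phase rotations of the form
\[ w_s = e^{2\pi i f \tau_s}, \]
where $f$ is the observing frequency and $\tau_s$ is the geometric delay of station $s$ towards the beam direction. These delays drift as the Earth rotates, and each beam direction needs its own set, which is why cases that form many beams spend noticeable I/O-node time on weight computations.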
 \section{Related Work}
 \label{Sec:related-work}
@@ -317,7 +311,7 @@ The use of a software solution on powerful interconnected hardware is a key aspe
 The BG/P supercomputer provides us with enough computing power and powerful networks to be able to implement the signal processing and all-to-all-exchanges that we require, without having to resort to a dedicated system which inevitably curbs the design freedom that the supercomputer provides. As with any system, platform-specific parameters nevertheless become important when maximal performance is desired. Although a C reference implementation allowed us to quickly develop and test features, we needed handcrafted assembly to keep the double FPUs of each compute core busy.
-The architecture of the BG/P makes some tasks more difficult as well. The fact that an I/O node can only communicate with its own compute cores prevents us from freely scheduling the workload. Instead, we had to manually route the data using two all-to-all exchanges in order to stream the data from and to the right I/O nodes. To achieve maximum performance, we tuned the distribution of the workload over the cores to avoid network collisions.
+The architecture of the BG/P makes some tasks more difficult as well. The fact that an I/O node can only communicate with its own compute cores prevents us from freely scheduling the workload. Instead, we have to manually route the data using two all-to-all exchanges in order to stream the data from and to the right I/O nodes. To achieve maximum performance, we tuned the distribution of the workload over the cores to avoid network collisions.
 \bibliographystyle{plain}
 \bibliography{lofar}
...
@@ -26,7 +26,6 @@ newgraph
 hash_label at 17 : 289
 hash_label at 19 : 361
 hash_label at 21 : 441
-hash_label at 23 : 529
 (*
 hash_label at 1 : 1
 hash_label at 2 : 4
@@ -54,7 +53,7 @@ newgraph
 hash_label at 24 : 576
 *)
 min 0
-max 24
+max 22
 newline
 linetype dotted
@@ -188,6 +187,7 @@ newstring : Stokes I, \
 x 2 y 18.2
 hjl vjc
+(*
 newline
 linetype dotted
 linethickness 2.0
@@ -197,15 +197,19 @@ newline
 8 20.15 (* 406 *)
 12 19.60 (* 384 *)
 16 19.60 (* 384 *)
+*)
 newline
 linetype solid
 linethickness 2.0
 color 1 0 1
 pts
+24 17.03 (* 290 *)
+(*
 16 19.60 (* 384 *)
 20 18.97 (* 360 *)
 24 18.08 (* 327 *)
+*)
 28 17.26 (* 298 *)
 32 16.43 (* 270 *)
 36 15.84 (* 251 *)
@@ -221,6 +225,7 @@ newstring : Stokes I, 16x integration
 x 15 y 22
 hjl vjc
+(*
 newline
 linetype dotted
 linethickness 2.0
@@ -228,16 +233,23 @@ newline
 pts
 4 23.26 (* 541 *)
 8 23.17 (* 537 *)
+*)
 newline
 linetype solid
 linethickness 2.0
 color 0 1 1
 pts
+4 21.21 (* 450 *)
+8 21.21 (* 450 *)
+12 20.62 (* 425 *)
+16 19.75 (* 390 *)
+(*
 8 23.17 (* 537 *)
 12 21.84 (* 477 *)
 16 20.37 (* 415 *)
 20 18.97 (* 360 *)
+*)
 24 18.08 (* 327 *)
 28 17.26 (* 298 *)
 32 16.43 (* 270 *)
...