diff --git a/doc/papers/2010/ppopp/CN-processing.pdf b/doc/papers/2010/ppopp/CN-processing.pdf index 036f72d7ca78c5199837ce803e84217e15942c8c..f6704c7334e71ff36db24061db4ef8dd74fe9632 100644 Binary files a/doc/papers/2010/ppopp/CN-processing.pdf and b/doc/papers/2010/ppopp/CN-processing.pdf differ diff --git a/doc/papers/2010/ppopp/CN-processing.svg b/doc/papers/2010/ppopp/CN-processing.svg index f827dd1c3619185c04443b4bb346715dbe8b695b..594f26879d3379a6b94ab8890b660e1d74140c60 100644 --- a/doc/papers/2010/ppopp/CN-processing.svg +++ b/doc/papers/2010/ppopp/CN-processing.svg @@ -58,7 +58,7 @@ inkscape:window-width="1600" inkscape:window-height="1125" inkscape:window-x="0" - inkscape:window-y="25" /> + inkscape:window-y="0" /> <metadata id="metadata8414"> <rdf:RDF> @@ -324,7 +324,7 @@ font-style="normal" y="515.58063" x="261.26437" - xml:space="preserve">IONProc</text> + xml:space="preserve">CNProc</text> <text style="font-size:5.89105511px;font-style:normal;font-weight:bold;text-anchor:middle;fill:#a14000;stroke:#a14000;stroke-width:0.0007671in;font-family:Helvetica" id="text8539" diff --git a/doc/papers/2010/ppopp/cn-performance.jgr b/doc/papers/2010/ppopp/cn-performance.jgr index 9de326b48281a6d64c97e2b28358f3e469d31c59..1b21424cc09e553d68eb73cf272b76bca6b0571b 100644 --- a/doc/papers/2010/ppopp/cn-performance.jgr +++ b/doc/papers/2010/ppopp/cn-performance.jgr @@ -101,7 +101,7 @@ newstring hjc vjc x 32.7 - y 7.496 + y 7.492 lcolor 1 0.827450931 0.125490189 : 1 @@ -110,23 +110,23 @@ copystring : 2 copystring - y 6.696 + y 6.692 : 3 copystring - y 6.296 + y 6.292 : 4 copystring - y 5.896 + y 5.892 : 5 copystring - y 5.496 + y 5.492 : 6 copystring - y 5.096 + y 5.092 : 7 newcurve @@ -141,7 +141,7 @@ newcurve marktype box marksize 3.4 .32 gray 0 - cfill 1 0 1 + cfill 1 .5 1 pts 32.7 7.1 @@ -187,7 +187,7 @@ newcurve newcurve marktype circle - marksize 1.2 + marksize 1.3 gray 0 fill 0 pts @@ -262,7 +262,7 @@ marktype xbar newcurve marktype xbar marksize 3.4 - cfill 1 0 1 + cfill 1 .5 1 pts 32 1.76 36 2.07 @@ -428,8 +428,8 @@ newcurve marktype none rarrow asize .6 .1 - (* color 1 0 1 - acfill 1 0 1 *) + (* color 1 .5 1 + acfill 1 .5 1 *) pts 68 5.15 66 5.48 diff --git a/doc/papers/2010/ppopp/correlation-triangle.fig b/doc/papers/2010/ppopp/correlation-triangle.fig index 577c8313ebc2c2f8b2a47ff56c29654db9599554..cf1aad1a923e6000794d3ee94e0c5b1ba5130a52 100644 --- a/doc/papers/2010/ppopp/correlation-triangle.fig +++ b/doc/papers/2010/ppopp/correlation-triangle.fig @@ -9,54 +9,9 @@ Single 1200 2 0 32 #ffd21f 0 33 #ffffc0 -0 34 #ff410d +0 34 #569d1b 0 35 #c0c0c0 0 36 #f0c0c0 -6 7125 1425 8475 2775 -2 3 0 5 0 34 50 -1 20 0.000 0 0 -1 0 0 7 - 7200 2700 7200 2100 7800 2100 7800 1500 8400 1500 8400 2700 - 7200 2700 -2 1 0 2 0 7 40 -1 -1 0.000 0 0 -1 0 0 3 - 7800 2700 7800 2100 8400 2100 -4 1 0 40 -1 18 30 0.0000 4 360 360 7500 2550 A\001 -4 1 0 40 -1 18 30 0.0000 4 360 360 8100 1950 A\001 --6 -6 5925 2625 7275 3975 -2 3 0 5 0 34 50 -1 20 0.000 0 0 -1 0 0 7 - 6000 3900 6000 3300 6600 3300 6600 2700 7200 2700 7200 3900 - 6000 3900 -2 1 0 2 0 7 40 -1 -1 0.000 0 0 -1 0 0 3 - 6600 3900 6600 3300 7200 3300 -4 1 0 40 -1 18 30 0.0000 4 360 360 6300 3750 A\001 -4 1 0 40 -1 18 30 0.0000 4 360 360 6900 3150 A\001 --6 -6 4725 3825 6075 5175 -2 3 0 5 0 34 50 -1 20 0.000 0 0 -1 0 0 7 - 4800 5100 4800 4500 5400 4500 5400 3900 6000 3900 6000 5100 - 4800 5100 -2 1 0 2 0 7 40 -1 -1 0.000 0 0 -1 0 0 3 - 5400 5100 5400 4500 6000 4500 -4 1 0 40 -1 18 30 0.0000 4 360 360 5100 4950 A\001 -4 1 0 40 -1 18 30 0.0000 4 360 360 5700 4350 A\001 --6 -6 3525 5025 4875 6375 -2 3 0 5 0 34 50 -1 20 0.000 0 0 -1 0 0 7 - 3600 6300 3600 5700 4200 5700 4200 5100 4800 5100 4800 6300 - 3600 6300 -2 1 0 2 0 7 40 -1 -1 0.000 0 0 -1 0 0 3 - 4200 6300 4200 5700 4800 5700 -4 1 0 40 -1 18 30 0.0000 4 360 360 3900 6150 A\001 -4 1 0 40 -1 18 30 0.0000 4 360 360 4500 5550 A\001 --6 -6 2325 6225 3675 7575 -2 3 0 5 0 34 50 -1 20 0.000 0 0 -1 0 0 7 - 2400 7500 2400 6900 3000 6900 3000 6300 3600 6300 3600 7500 - 2400 7500 -2 1 0 2 0 7 40 -1 -1 0.000 0 0 -1 0 0 3 - 3000 7500 3000 6900 3600 6900 -4 1 0 40 -1 18 30 0.0000 4 360 360 2700 7350 A\001 -4 1 0 40 -1 18 30 0.0000 4 360 360 3300 6750 A\001 --6 6 7125 2625 8475 3975 2 1 0 2 0 7 40 -1 -1 0.000 0 0 -1 0 0 2 7200 3300 8400 3300 @@ -143,6 +98,31 @@ Single 2 1 0 3 0 35 40 -1 -1 0.000 0 0 -1 1 0 2 2 1 3.00 150.00 150.00 6600 8550 7275 8550 +2 3 0 5 0 34 50 -1 20 0.000 0 0 -1 0 0 7 + 2400 7500 2400 6900 3000 6900 3000 6300 3600 6300 3600 7500 + 2400 7500 +2 1 0 2 0 7 40 -1 -1 0.000 0 0 -1 0 0 3 + 3000 7500 3000 6900 3600 6900 +2 3 0 5 0 34 50 -1 20 0.000 0 0 -1 0 0 7 + 3600 6300 3600 5700 4200 5700 4200 5100 4800 5100 4800 6300 + 3600 6300 +2 1 0 2 0 7 40 -1 -1 0.000 0 0 -1 0 0 3 + 4200 6300 4200 5700 4800 5700 +2 3 0 5 0 34 50 -1 20 0.000 0 0 -1 0 0 7 + 4800 5100 4800 4500 5400 4500 5400 3900 6000 3900 6000 5100 + 4800 5100 +2 1 0 2 0 7 40 -1 -1 0.000 0 0 -1 0 0 3 + 5400 5100 5400 4500 6000 4500 +2 3 0 5 0 34 50 -1 20 0.000 0 0 -1 0 0 7 + 7200 2700 7200 2100 7800 2100 7800 1500 8400 1500 8400 2700 + 7200 2700 +2 1 0 2 0 7 40 -1 -1 0.000 0 0 -1 0 0 3 + 7800 2700 7800 2100 8400 2100 +2 3 0 5 0 34 50 -1 20 0.000 0 0 -1 0 0 7 + 6000 3900 6000 3300 6600 3300 6600 2700 7200 2700 7200 3900 + 6000 3900 +2 1 0 2 0 7 40 -1 -1 0.000 0 0 -1 0 0 3 + 6600 3900 6600 3300 7200 3300 4 1 0 40 -1 18 30 1.5708 4 360 1605 9450 5175 station\001 4 1 0 40 -1 18 30 0.0000 4 360 1605 5625 8700 station\001 4 1 0 40 -1 16 30 0.0000 4 345 270 2700 8100 0\001 @@ -165,3 +145,13 @@ Single 4 0 0 40 -1 16 30 0.0000 4 345 270 8625 1950 9\001 4 0 0 40 -1 16 30 0.0000 4 345 270 8625 4950 4\001 4 0 0 40 -1 16 30 0.0000 4 345 270 8625 6150 2\001 +4 1 32 40 -1 18 30 0.0000 4 360 360 2700 7350 A\001 +4 1 32 40 -1 18 30 0.0000 4 360 360 3300 6750 A\001 +4 1 32 40 -1 18 30 0.0000 4 360 360 3900 6150 A\001 +4 1 32 40 -1 18 30 0.0000 4 360 360 4500 5550 A\001 +4 1 32 40 -1 18 30 0.0000 4 360 360 5100 4950 A\001 +4 1 32 40 -1 18 30 0.0000 4 360 360 5700 4350 A\001 +4 1 32 40 -1 18 30 0.0000 4 360 360 6300 3750 A\001 +4 1 32 40 -1 18 30 0.0000 4 360 360 6900 3150 A\001 +4 1 32 40 -1 18 30 0.0000 4 360 360 7500 2550 A\001 +4 1 32 40 -1 18 30 0.0000 4 360 360 8100 1950 A\001 diff --git a/doc/papers/2010/ppopp/lofar.tex b/doc/papers/2010/ppopp/lofar.tex index d72c54e41b58fb138d4dc5497e9e605974a1f7df..b3be6edb86b262d2eae71214e18ad682445ab220 100644 --- a/doc/papers/2010/ppopp/lofar.tex +++ b/doc/papers/2010/ppopp/lofar.tex @@ -49,19 +49,22 @@ combines the signals from many thousands of simple antennas. Its revolutionary design allows observations in a frequency range that has hardly been studied before. -This paper focuses on another novel feature: where traditional telescopes -combine data using customized hardware, we process the data in \emph{software}. +Another novel feature of LOFAR is the elaborate use of \emph{software\/} to +process data, where traditional telescopes use customized hardware. This dramatically increases flexibility and substantially reduces costs, but the high processing and bandwidth requirements compel the use of a supercomputer. The antenna signals are centrally combined, filtered, optionally beam-formed, and correlated by an IBM Blue Gene/P. +This paper describes the implementation of the so-called correlator. To meet the real-time requirements, the application is highly optimized, and reaches exceptionally high computational and I/O efficiencies. -This allows us to use only half the planned amount of resources, \emph{and\/} -process 50\% more telescope data, significantly improving the effectiveness -of the entire telescope. +Additionally, we study the scalability of the system, and show that it scales +well beyond the requirements. +The optimizations allows us to use only half the planned amount of resources, +\emph{and\/} process 50\% more telescope data, significantly improving the +effectiveness of the entire telescope. \end{abstract} @@ -73,17 +76,19 @@ range. It is the first of a new generation of radio telescopes, that breaks with the concepts of traditional telescopes in several ways. Rather than using large, expensive dishes, LOFAR uses many thousands of -simple antennas that have no movable parts~\cite{Butcher:04,deVos:09}, see Figure~\ref{fig:lba-field}. +simple antennas that have no movable parts~\cite{Butcher:04,deVos:09} (see +Figure~\ref{fig:lba-field}). Essentially, it is a distributed sensor network that monitors the sky and combines all signals centrally. This concept requires much more signal processing, but the additional costs of silicon are easily offset by cost savings in steel that would be needed for dishes. Moreover, LOFAR can observe the sky in many directions simultaneously and switch directions instantaneously. -In several ways, LOFAR will be the largest telescope of the world, -and will enable groundbreaking research in several areas of astronomy and particle -physics~\cite{Bruyn:02}. The different goals and observation types require -several different processing pipelines, however. +In several ways, LOFAR will be the largest telescope of the world, and will +enable groundbreaking research in several areas of astronomy and particle +physics~\cite{Bruyn:02}. +The different goals and observation types require several different processing +pipelines, however. \begin{figure}[t] \vspace{-4mm} @@ -103,8 +108,8 @@ However, the desire for a flexible and reconfigurable instrument with different processing pipelines for different observation types demands a software solution. The availability of sufficiently powerful supercomputers allows this. -We have to perform all processing in real time, -since the data streams simply are too large to store. +%We have to perform all processing in real time, +%since the data streams simply are too large to store. \begin{figure*} \begin{minipage}[b]{11cm} @@ -122,13 +127,7 @@ since the data streams simply are too large to store. The most common mode of operation for LOFAR is the \emph{standard imaging pipeline}, which is used to generate sky images. -This mode filters and correlates the data sent by the stations. -This paper describes the implementation and -performance characteristics of the real-time part of this pipeline on an IBM Blue Gene/P (BG/P) -supercomputer. -We present a highly-optimized implementation that achieves very high -computational performance: the correlator sustains 96\% of the theoretical -floating-point peak performance during the computational phase. +This mode filters and correlates the data sent by the stations in the field. %The \emph{Epoch of Reionization (EoR)\/} pipeline should detect the faint, %very first sky objects. %It is a similar pipeline, but with even higher computational demands, due to @@ -136,9 +135,26 @@ floating-point peak performance during the computational phase. Several \emph{pulsar pipelines\/} are being developed as well, that either search large sky regions to find unknown pulsars or, once found, sensitively observe their characteristics. +We also started development of a \emph{transient pipeline}, that observes the +sky for transient events. The pipelines share common components, shortening their development time. -The software also supports multiple simultaneous observations, even of different -types. +The software also supports multiple simultaneous observations, even of +different types. +The first part of each pipeline runs in real time, since the receivers produce +too much data to store on disk. +Only after substantial reduction of the data volume, intermediate products +are written to disk. + +In this paper, we focus on the real-time part of the standard imaging pipeline, +commonly called ``the correlator''. +We first present an integral overview of the correlator and a discussion of +the optimizations that we implemented. +The main contribution of this paper is an in-depth study of all performance +aspects, real-time behavior, and scalability characteristics of the correlator +as a whole (previous papers only focused on single aspects and did not +comprise a scalability study). +We also enumerate the conditions that are necessary to obtain good, +real-time performance, and assert that none of these conditions can be ignored. The receivers produce hundreds of gigabits per second. To handle the high data rate, @@ -151,24 +167,33 @@ Additionally, we developed a low-overhead network protocol~\cite{Romein:09a} for communication between I/O~nodes and compute nodes, since we were not able to achieve the required internal input and output data rates with the standard network software. - -% @@@ TODO, change back to ``we'' -In earlier work, Romein et al.~\cite{Romein:06}, presented an initial prototype -implementation of the LOFAR processing on our previous platform, the -Blue Gene/L. The work presented in this paper differs in several -important ways. First, we have now implemented the complete I/O path: -from the LOFAR stations in the field to the Blue Gene/P, -from the I/O~nodes to the compute nodes, the internal -data transpose, and finally the transfer -to the storage cluster. In contrast, in~\cite{Romein:06} -we focused only on computing, and not on I/O. -Second, we now have a fully functional \emph{real time} processing pipeline -for the most important standard imaging mode, and prototypes for other -pipelines, showing the flexibility of our design. Finally, for the -first time, we perform a scalability analysis, proving that we can -support the requirements of the full LOFAR instrument, both in -terms of computing and I/O, in real time. In fact, we even surpass the -requirements. +The correlator achieves very high computational performance: it sustains +96\% of the theoretical floating-point peak performance during the +computational phase~\cite{Romein:06}. +The application scales to data rates well beyond the requirements; +in fact, the performance is so good that it led to the decision to change +the LOFAR specifications: the instrument can observe over 50\% more sources +or frequencies simultaneously than in its original design specifications, +greatly enhancing the efficiency of the entire instrument. +%This paper describes the implementation of the LOFAR correlator and shows +%how the optimized system scales to sizes beyond what is required +%In earlier work, we presented an initial prototype +%implementation of the LOFAR processing on our previous platform, the +%Blue Gene/L~\cite{Romein:06}. +%The work presented in this paper differs in several +%important ways. First, we have now implemented the complete I/O path: +%from the LOFAR stations in the field to the Blue Gene/P, +%from the I/O~nodes to the compute nodes, the internal +%data transpose, and finally the transfer +%to the storage cluster. In contrast, in~\cite{Romein:06} +%we focused only on computing, and not on I/O. +%Second, we now have a fully functional \emph{real time} processing pipeline +%for the most important standard imaging mode, and prototypes for other +%%pipelines, showing the flexibility of our design. Finally, for the +%first time, we perform a scalability analysis, proving that we can +%support the requirements of the full LOFAR instrument, both in +%terms of computing and I/O, in real time. In fact, we even surpass the +%requirements. This paper is structured as follows. We mention related work in Section~\ref{sec:related-work}. @@ -266,7 +291,8 @@ studying the origin of high-energy Neither the source, nor the physical process that accelerates such particles is known. Third, LOFAR's ability to continuously monitor a large fraction of the sky -makes it uniquely suited to find new \emph{pulsars} and to study \emph{transient sources}. +makes it uniquely suited to find new \emph{pulsars\/} and to study +\emph{transient sources}. Since LOFAR has no moving parts, it can instantaneously switch focus to some galactic event. Fourth, \emph{Deep Extragalactic Surveys\/} will be carried out to find the @@ -313,10 +339,9 @@ used for all observations. The roll-out of the stations is currently in progress. Each station is equipped with 48--96 LBAs and 48--96 HBA tiles. -A station also features a cabinet where initial processing is done. -Typical operations that are performed here include analog-to-digital -conversion, filtering, frequency selection, and combination of the signals -from the different receivers. +A station also features a cabinet where initial processing is done, like +analog-to-digital conversion, filtering, frequency selection, and combination +of the signals from the different receivers. One of the distinctive properties of LOFAR is that the receivers are omni-directional, and that multiple, simultaneous observation directions are supported. @@ -715,12 +740,11 @@ Such a chunk is the unit of data that is sent to the compute node for further processing. Since processing a chunk typically takes much longer than one second, the chunks are distributed round robin over a group of processor cores, as illustrated by -Figure~\ref{fig:round-robin}. Subsequent chunks are processed by -different processor cores. A core must finish its work before it is -time to process the next chunk. A core first receives data from the -I/O~node (green in the figure, the left side of the bars), processes -them (yellow, middle part of the bars), sends back the results (red, -right side of the bars), and idles until the I/O~node sends new data. +Figure~\ref{fig:round-robin}. +Subsequent chunks are processed by different processor cores. +A core first receives data from the I/O~node, processes them, sends back the +results, and idles until the I/O~node sends new data. +A core must finish its work before it is time to process the next chunk. For simplicity, Figure~\ref{fig:round-robin} shows the processing of three subbands on six cores.