diff --git a/.gitattributes b/.gitattributes
index 89330dd5b6e63912cbcc9ef323baf33868265511..bfcf6ba07f908a33267e43d3d33f3b111a023fef 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -2609,6 +2609,7 @@ doc/papers/2011/europar/delay.fig -text
 doc/papers/2011/europar/dispersed-signal-data-2.sh -text
 doc/papers/2011/europar/dispersed-signal.jgr -text
 doc/papers/2011/europar/execution_times.jgr -text
+doc/papers/2011/europar/ionperf.jgr -text
 doc/papers/2011/europar/listings.sty -text
 doc/papers/2011/europar/llncs.cls -text
 doc/papers/2011/europar/llncs2e.zip -text
diff --git a/doc/papers/2011/europar/Makefile b/doc/papers/2011/europar/Makefile
index 8a470c6b58bf72fbf8f8f5aef54337085d6e612a..25905910dbc16135921a6de6825d3fc3d8c2db56 100644
--- a/doc/papers/2011/europar/Makefile
+++ b/doc/papers/2011/europar/Makefile
@@ -4,7 +4,7 @@ BIB_SOURCES =	lofar.bib
 
 FIG_SOURCES =	lofar-stations.fig processing.fig
 
-JGR_SOURCES =	stations-beams.jgr execution_times.jgr coherent-dedispersion.jgr dispersed-signal.jgr
+JGR_SOURCES =	stations-beams.jgr execution_times.jgr coherent-dedispersion.jgr dispersed-signal.jgr ionperf.jgr
 
 JPG_SOURCES =	
 
diff --git a/doc/papers/2011/europar/execution_times.jgr b/doc/papers/2011/europar/execution_times.jgr
index a380a7b332b1194e3c1a9888f9bc23fb5f26494d..a18fac2193b6e7738facfea16c3153835a7203bc 100644
--- a/doc/papers/2011/europar/execution_times.jgr
+++ b/doc/papers/2011/europar/execution_times.jgr
@@ -22,12 +22,12 @@ awk -v N=1 '/->/ {a+=$3;b+=$5;c+=$7;d+=$9;e+=$11;f+=$13;n++; if(n==N) print "1 "
 *)
 newgraph
         X 3.8
-        Y 3.1
+        Y 2.6
 	yaxis
 		min 0
 		max 100
-		size 2.5
-		label : BG/P occupation (% CPU time)
+		size 2
+		label : System load (%)
 	xaxis
 		min 0
 		max 7
@@ -103,7 +103,7 @@ Case (see Table 1)
 	label : 1st all-to-all exchange & \
 input handling
 
-  	legend top defaults hjl linelength 75 x 4 y 97
+  	legend top defaults hjl linelength 75 x 4 y 110
 
 (* circles for cases *)
 newstring
diff --git a/doc/papers/2011/europar/ionperf.jgr b/doc/papers/2011/europar/ionperf.jgr
new file mode 100644
index 0000000000000000000000000000000000000000..6ddfca3eb11f9f547071d72d32802eb87cb386aa
--- /dev/null
+++ b/doc/papers/2011/europar/ionperf.jgr
@@ -0,0 +1,240 @@
+
+(*
+	1 = I, 16i, 4s, 543b
+	2 = I, 8i, 24s, 327b
+	3 = I, 8i, 64s, 155b
+	4 = IQUV, 24s, 13b
+	5 = IQUV, 64s, 10b
+        6 = I, 4i, 64s, 42b
+
+1: (io node without output)
+
+FCNP             4 %
+inputSection    19 %
+inputThread     35 %
+outputThread     0 %
+outputSection    0 %
+delayComp       15 %
+interrupts       8 %
+idle            19 %
+
+ION<=>CN        23 % (=FCNP+outputSection+inputSection)
+ION=>Storage     0 % (=outputThread)
+Station=>ION    35 % (=inputThread)
+comp.           15 %
+interrupts       8 %
+
+2: (io node without output)
+
+FCNP             3 %
+inputSection    16 %
+inputThread     32 %
+outputThread     0 %
+outputSection    0 %
+delayComp        9 %
+interrupts      13 %
+idle            26 %
+
+ION<=>CN        19 % (=FCNP+outputSection+inputSection)
+ION=>Storage     0 % (=outputThread)
+Station=>ION    32 % (=inputThread)
+comp.            9 %
+interrupts      13 %
+
+3:
+
+FCNP             6 %
+inputSection    14 %
+inputThread     34 %
+outputThread     6 %
+outputSection    2 %
+delayComp        5 %
+interrupts      10 %
+idle            23 %
+
+ION<=>CN        22 % (=FCNP+outputSection+inputSection)
+ION=>Storage     6 % (=outputThread)
+Station=>ION    34 % (=inputThread)
+comp.            5 %
+interrupts      10 %
+
+4:
+
+FCNP            11 %
+inputSection    13 %
+inputThread     36 %
+outputThread    12 %
+outputSection    7 %
+delayComp        1 %
+interrupts      11 %
+idle             9 %
+
+ION<=>CN        31 % (=FCNP+outputSection+inputSection)
+ION=>Storage    12 % (=outputThread)
+Station=>ION    36 % (=inputThread)
+comp.            1 %
+interrupts      11 %
+
+5:
+
+FCNP             8 %
+inputSection    13 %
+inputThread     34 %
+outputThread    12 %
+outputSection    4 %
+delayComp        1 %
+interrupts       9 %
+idle            19 %
+
+ION<=>CN        25 % (=FCNP+outputSection+inputSection)
+ION=>Storage    12 % (=outputThread)
+Station=>ION    34 % (=inputThread)
+comp.            1 %
+interrupts       9 %
+
+6:
+
+FCNP             7 %
+inputSection    13 %
+inputThread     35 %
+outputThread    11 %
+outputSection    4 %
+delayComp        2 %
+interrupts       9 %
+idle            19 %
+
+ION<=>CN        24 % (=FCNP+outputSection+inputSection)
+ION=>Storage    11 % (=outputThread)
+Station=>ION    35 % (=inputThread)
+comp.            2 %
+interrupts       9 %
+
+interrupts        ->     8 %      13 %      10 %       11 %       9 %       9 %
+ion_storage       ->     0 %       0 %       6 %       12 %      12 %      11 %
+ion_cn            ->    23 %      19 %      22 %       31 %      25 %      24 %
+delaycomp         ->    15 %       9 %       5 %        1 %       1 %       2 %
+station_ion       ->    35 %      32 %      34 %       36 %      35 %      35 %
+
+The above data is read from this file and accumulated using the following command, where N is the line number we need:
+
+awk -v N=1 '/->/ {a+=$3;b+=$5;c+=$7;d+=$9;e+=$11;f+=$13;n++; if(n==N) print "1 " a " 2 " b " 3 " c " 4 " d " 5 " e " 6 " f;}' <ionperf.jgr
+
+*)
+newgraph
+        X 3.8
+        Y 2.6
+	yaxis
+		min 0
+		max 100
+		size 2
+		label : System load (%)
+	xaxis
+		min 0
+		max 7
+		hash 0
+		size 2.5
+		label : \
+Case (see Table 1)
+        (*
+	hash_labels rotate 22 vjt hjr
+	hash_label at 1 : I, 16i, 4s, 543b
+	hash_label at 2 : I, 8i, 24s, 327b
+	hash_label at 3 : I, 8i, 64s, 155b
+	hash_label at 4 : IQUV, 24s, 13b
+	hash_label at 5 : IQUV, 64s, 10b
+        *)
+        (*
+	hash_label at 1 : A
+	hash_label at 2 : B
+	hash_label at 3 : C
+	hash_label at 4 : D
+	hash_label at 5 : E
+	hash_label at 6 : F
+        *)
+
+	newcurve
+	(* station->ion *)
+	marktype xbar marksize 0.8 fill 0.5 pattern stripe -45 cfill 0 0.7 0
+	pts
+        shell : awk -v N=5 '/->/ {a+=$3;b+=$5;c+=$7;d+=$9;e+=$11;f+=$13;n++; if(n==N) print "1 " a " 2 " b " 3 " c " 4 " d " 5 " e " 6 " f;}' <ionperf.jgr
+	label hjl x 1 y 105 : Input from station
+
+	newcurve
+	(* delaycomp *)
+	marktype xbar marksize 0.8 fill 0.5 pattern solid cfill 0 0 1
+	pts
+        shell : awk -v N=4 '/->/ {a+=$3;b+=$5;c+=$7;d+=$9;e+=$11;f+=$13;n++; if(n==N) print "1 " a " 2 " b " 3 " c " 4 " d " 5 " e " 6 " f;}' <ionperf.jgr
+	label hjl x 1 y 100 : Weight computations
+
+	newcurve
+	(* ion<->cn *)
+	marktype xbar marksize 0.8 pattern stripe 45 cfill 0 1 1
+	pts
+        shell : awk -v N=3 '/->/ {a+=$3;b+=$5;c+=$7;d+=$9;e+=$11;f+=$13;n++; if(n==N) print "1 " a " 2 " b " 3 " c " 4 " d " 5 " e " 6 " f;}' <ionperf.jgr
+	label hjl x 1 y  95 : I/O with compute cores
+
+	newcurve
+	(* ion->storage *)
+	marktype xbar marksize 0.8 fill 0.2 pattern solid cfill 1 0 1
+	pts
+        shell : awk -v N=2 '/->/ {a+=$3;b+=$5;c+=$7;d+=$9;e+=$11;f+=$13;n++; if(n==N) print "1 " a " 2 " b " 3 " c " 4 " d " 5 " e " 6 " f;}' <ionperf.jgr
+	label hjl x 5 y 105 : Output to storage
+
+	newcurve
+	(* interrupts *)
+	marktype xbar marksize 0.8 fill 1 pattern solid cfill 0 0.7 0
+	pts
+        shell : awk -v N=1 '/->/ {a+=$3;b+=$5;c+=$7;d+=$9;e+=$11;f+=$13;n++; if(n==N) print "1 " a " 2 " b " 3 " c " 4 " d " 5 " e " 6 " f;}' <ionperf.jgr
+	label hjl x 5 y 100 : IRQ handling
+
+  	legend custom
+
+(* circles for cases *)
+newstring
+  font Helvetica-Bold
+  fontsize 8
+  hjc
+  vjc
+  lcolor 1 0.827450931 0.125490189
+  x 1
+  y -7
+  : A
+
+copystring
+  x 2
+  y -7
+  : B
+
+copystring
+  x 3
+  y -7
+  : C
+
+copystring
+  x 4
+  y -7
+  : D
+
+copystring
+  x 5
+  y -7
+  : E
+
+copystring
+  x 6
+  y -7
+  : F
+
+newcurve
+  marktype circle
+  marksize 0.5
+  gray 0
+  fill 0
+  pts
+     1 -7
+     2 -7
+     3 -7
+     4 -7
+     5 -7
+     6 -7
+
diff --git a/doc/papers/2011/europar/lofar.pdf b/doc/papers/2011/europar/lofar.pdf
index cda5a0f47e56de5235de003ea45a072f39953b7a..e00393c58c6fcce445c72279337d57effe0af01c 100644
Binary files a/doc/papers/2011/europar/lofar.pdf and b/doc/papers/2011/europar/lofar.pdf differ
diff --git a/doc/papers/2011/europar/lofar.tex b/doc/papers/2011/europar/lofar.tex
index fd874ef100076b0dfe3951a3b70dd59feafcf508..0652d9643a7241c0543bbebfa1295c481f18b8e3 100644
--- a/doc/papers/2011/europar/lofar.tex
+++ b/doc/papers/2011/europar/lofar.tex
@@ -1,5 +1,5 @@
 \documentclass{llncs}
-\usepackage{graphicx, subfigure, amsmath, xspace, txfonts, float}
+\usepackage{graphicx, subfigure, amsmath, xspace, txfonts, float, wrapfig}
 \usepackage{listings,lstpseudo}
 \usepackage[usenames]{color}
 \usepackage{mathptmx}
@@ -86,7 +86,7 @@ We customised the I/O node software stack~\cite{Yoshii:10} and run a multi-threa
 \end{minipage}
 \hfill
 \begin{minipage}[b]{40mm}
-  \includegraphics[width=\textwidth]{pencilbeams.pdf}
+  \includegraphics[width=0.8\textwidth]{pencilbeams.pdf}
   \caption{Tied-array beams (hexagons) formed within two station beams (ellipse).}
   \label{fig:pencilbeams}
 \end{minipage}
@@ -128,11 +128,19 @@ The all-to-all exchange is asynchronous. Once a compute core receives a complete
 
 \subsection{Beam Forming}
 
-The beam former combines the chunks from all stations, producing a chunk for each tied-array beam. Each beam is formed using different complex weights for the frequency of the channel, the locations of the stations, and the beam coordinates. The positional weights are precomputed by the I/O nodes and sent along with the data to avoid a duplicated effort by the compute nodes. The delays are applied to the station data through complex multiplications and additions, covering both the X and the Y polarisation samples.
+The beam former combines the chunks from all stations, producing a chunk for each tied-array beam. Each beam is formed using different complex weights for the frequency of the channel, the locations of the stations, and the beam coordinates. The positional weights are precomputed by the I/O nodes and sent along with the data to avoid a duplicated effort by the compute nodes. The delays are applied to the station data through complex multiplications and additions.
 
 %The delays are applied to the station data through complex multiplications and additions, programmed in assembly. In order to take full advantage of the L1 cache and the available registers, data is processed in sets of 6 stations, producing 3 beams, in portions of 128 samples, or a subset thereof to cover the remainders. While the exact ideal set size in which the data is to be processed is platform specific, we have shown in previous work that similar tradeoffs exist for similar problems across different architectures~\cite{Nieuwpoort:09}.
 
+\comment{
 \begin{listing}
+\caption{Pseudo code for the processing loops around the beam former assembly.}
+\label{lst:beam-forming}
+\end{listing}        
+}
+
+All time-consuming pipeline components are written in assembly, to achieve maximum performance.  The assembly code minimises the number of memory accesses, minimises load delays, minimises FPU pipeline stalls, and maximises instruction-level parallelism.  We learnt that optimal performance is often achieved by combining multiple iterations of a multi-dimensional loop:
+
 \lstset{language=pseudo}
 \begin{lstlisting}{}
 FOR Channel IN 1 .. NrChannels DO
@@ -141,11 +149,8 @@ FOR Channel IN 1 .. NrChannels DO
       FOR Beam IN 1 .. NrBeams STEP 3 DO
         BeamForm6StationsAnd128TimesTo3BeamsAssembly(...)
 \end{lstlisting}
-\caption{Pseudo code for the processing loops around the beam former assembly.}
-\label{lst:beam-forming}
-\end{listing}        
 
-All time-consuming pipeline components are written in assembly, to achieve maximum performance.  The assembly code minimises the number of memory accesses, minimises load delays, minimises FPU pipeline stalls, and maximises instruction-level parallelism.  We learnt that optimal performance is often achieved by combining multiple iterations of a multi-dimensional loops like shown in Listing~\ref{lst:beam-forming}. This is much more efficient than to create all beams one at a time, due to better reuse of data loaded from main memory.  Finding the most efficient way to group work is a combination of careful analysis and, unfortunately, trial-and-error. The coherent beam former achieves 85\% of the FPU peak performance, not as high as the 96\% of the correlator~\cite{Romein:10a}, but still 16 times more than the C++ reference implementation. 
+This is much more efficient than to create all beams one at a time, due to better reuse of data loaded from main memory.  Finding the most efficient way to group work is a combination of careful analysis and, unfortunately, trial-and-error. The coherent beam former achieves 85\% of the FPU peak performance, not as high as the 96\% of the correlator~\cite{Romein:10a}, but still 16 times more than the C++ reference implementation. 
 %Because each beam is an accumulation of the data from all stations, the bandwidth of each beam is equal to the bandwidth of data from a single station, which is 6.2~Gb/s now that the samples are 32-bit floats. Once the beams are formed, they are kept as complex voltages or transformed into the Stokes IQUV or the Stokes I parameters. In the latter case, the beams can also be integrated temporally to reduce the resulting data rate. Finally, an incoherent beam can be created in parallel, and converted into either Stokes I or Stokes IQUV parameters.
 
 %Our beam former supports several pipelines: \emph{complex voltages}, \emph{Stokes IQUV}, and \emph{Stokes I}. The complex voltages pipeline outputs the raw tied-array beams, which consist of two 3.1~Gb/s streams of 32-bit complex floating points numbers (floats), one stream for each polarisation. The Stokes IQUV pipeline applies a domain transformation to each sample of the raw tied-array beams, which is useful for polarisation-related studies. The four Stokes parameters, calculated through $I = X\overline{X} + Y\overline{Y}$, $Q = X\overline{X} - Y\overline{Y}$, $U = 2\mathrm{Re}(X\overline{Y})$, $V = 2\mathrm{Im}(X\overline{Y})$, with each parameter being a 32-bit float, resulting in four 1.5~Gb/s streams. The Stokes I pipeline calculates only the first Stokes parameter, which represents the signal strength in both polarisations. The Stokes I pipeline supports temporal integration to trade time resolution for a reduced bandwidth per beam, allowing more beams to be created.
@@ -192,7 +197,7 @@ Due to memory constrains on the compute cores, the cores that performed the beam
 
 The output cores again receive the chunks asynchronously, which we overlap with computations. For each chunk, the dimensions of the data are reordered into their final ordering. Reordering is necessary, because the data order that will be written to disk is not the same order that can be produced by our computations without taking heavy cache penalties. Once all of the chunks are received and reordered, they are forwarded to the I/O node.
 
-For the distribution of the workload over the available output cores, three factors are considered. First, all of the data belonging to the same beam has to be processed by output cores in the same pset, to ensure that one I/O node can concatenate all of the 0.25 second chunks that belong to the beam. Second, the maximum output rate per I/O node has to be respected. Finally, the presence of the first all-to-all exchange, which uses the same network at up to 198~Gb/s. The second exchange uses up to 80~Gb/s. Even though each link sustains 3.4~Gb/s, it has to process the traffic from four cores, as well as traffic routed through it between other nodes. The network links in the BG/P become overloaded unless the output cores are scattered sufficiently.
+For the distribution of the workload over the available output cores, three factors are considered. First, all of the data belonging to the same beam has to be processed by output cores in the same pset, to ensure that one I/O node can concatenate all of the 0.25 second chunks that belong to the beam. Second, the maximum output rate per I/O node has to be respected. Finally, the presence of the first all-to-all exchange, which uses the same network at up to 198~Gb/s. The second exchange uses up to 81~Gb/s. Even though each link sustains 3.4~Gb/s, it has to process the traffic from four cores, as well as traffic routed through it between other nodes. The network links in the BG/P become overloaded unless the output cores are scattered sufficiently.
 
 \subsection{Transport to Disks}
 Once an output core has received and reordered all of its data, the data are sent to the core's I/O node. The I/O node forwards the data over TCP/IP to the storage cluster. To avoid any stalling in our pipeline due to network congestion or disk issues, the I/O node uses a best-effort buffer which drops data in the unusual case that it cannot be sent.
@@ -204,11 +209,61 @@ We will focus our performance analysis on the most challenging cases that are of
 
 \subsection{Overall Performance}
 
+\begin{wrapfigure}{r}{0.5\textwidth}
+\vspace{-1.65cm}
+\includegraphics[width=0.5\textwidth]{stations-beams.pdf}
+\caption{The maximum number of beams that can be created in various configurations.}
+\label{fig:stations-beams}
+\vspace{-1cm}
+\end{wrapfigure}
+
 % TODO: getallen kloppen niet.. 13 beams is 80.6 Gb/s, en met 70 Gb/s zouden we 11 beams aan moeten kunnen
-Figure \ref{fig:stations-beams} shows the maximum number of beams that can be created when using a various number of stations, in each of the three pipelines: complex voltages, Stokes IQUV, and Stokes I. Both the complex voltages and the Stokes IQUV pipelines are I/O bound. Each beam is 6.2~Gb/s wide. We can make at most 12 beams without exceeding the available 80~Gb/s to our storage cluster. If 64 stations are used, the available bandwidth is 70~Gb/s due to the fact that an I/O node can only output 1.1~Gb/s if it also has to process station data. The granularity with which the output can be distributed over the I/O nodes, as well as scheduling details, determine the actual number of beams that can be created, but in all cases, the beam former can create at least 10 beams at LOFAR's full observational bandwidth.
+Figure \ref{fig:stations-beams} shows the maximum number of beams that can be created when using various numbers of stations, in each of the three pipelines: complex voltages, Stokes IQUV, and Stokes I. Both the complex voltages and the Stokes IQUV pipelines are I/O bound. Each beam is 6.2~Gb/s wide. We can make at most 13 beams without exceeding the available 81~Gb/s to our storage cluster. If 64 stations are used, the available bandwidth is 70~Gb/s due to the fact that an I/O node can only output 1.1~Gb/s if it also has to process station data. The granularity with which the output can be distributed over the I/O nodes, as well as scheduling details, determine the actual number of beams that can be created, but in all cases, the beam former can create at least 10 beams at LOFAR's full observational bandwidth.
 
 In the Stokes I pipeline, we applied several integration factors (1, 2, 4, 8, and 16) in order to show the trade-off between beam quality and the number of beams. Integration factors higher than 16 does not allow significantly more beams to be created, but could be used in order to further reduce the total output rate. For low integration factors, the beam former is again limited by the available output bandwidth. Once the Stokes I streams are integrated sufficiently, the system becomes bounded by the compute nodes: if only signals from a few stations have to be combined, the beam former is limited by the amount of available memory required to store the beams. If more input has to be combined, the beam former becomes limited by the CPU power available in the compute cores. For observations for which a high integration factor is acceptable, the beam former is able to create 155 up to 543 tied-array beams, depending on the number of stations used. For observations that need a high time resolution and thus a low integration factor, the beam former is still able to create at least 42 tied-array beams.
 
+\comment{
+\begin{table}[t]
+\begin{minipage}[t]{0.5\textwidth}
+\center
+\begin{tabular}{l|l|r|r|r|r|r|l}
+Case & Mode           & Sta- & Beams  & Input & Output & Bound \\
+     & (int. factor)  & tions  &        & rate  & rate   &       \\
+\hline
+\hline
+\circlenumber{A} & Stokes I (16x)   &  4 & 543 &  12 Gb/s & 70 Gb/s & Memory \\
+\circlenumber{B} & Stokes I (8x)   & 24 & 327 &  74 Gb/s & 63 Gb/s & CPU   \\
+\circlenumber{C} & Stokes I (8x)   & 64 & 155 & 198 Gb/s & 30 Gb/s & CPU   \\  
+\circlenumber{D} & Stokes IQUV & 24 &  13 &  74 Gb/s & 81 Gb/s & I/O   \\
+\circlenumber{E} & Stokes IQUV & 64 &  10 & 198 Gb/s & 62 Gb/s & I/O   \\
+\circlenumber{F} & Stokes I    & 64 &  42 & 198 Gb/s & 65 Gb/s & CPU
+\end{tabular}
+\caption{Several highlighted cases.}
+\label{table:cases}
+\end{minipage}
+\hfill
+\begin{minipage}[t]{0.5\textwidth}
+\includegraphics[width=\textwidth]{ionperf.pdf}
+\caption{The load of the busiest I/O nodes.}
+\label{fig:ionperf}
+\end{minipage}
+\end{table}
+}
+
+\begin{figure}[t]
+\begin{minipage}[t]{0.55\textwidth}
+\includegraphics[width=\textwidth]{execution_times.pdf}
+\caption{The load of the compute cores.}
+\label{fig:execution-times}
+\end{minipage}
+\hspace{-1cm}
+\begin{minipage}[t]{0.55\textwidth}
+\includegraphics[width=\textwidth]{ionperf.pdf}
+\caption{The load of the busiest I/O nodes.}
+\label{fig:ionperf}
+\end{minipage}
+\end{figure}
+
 \begin{table}[t]
 \center
 \begin{tabular}{l|l|r|r|r|r|r|r|l|l}
@@ -219,39 +274,29 @@ Case & Mode & Channel & Int. & Stations & Beams  & Input & Output & Bound & Used
 \circlenumber{A} & Stokes I    & N & 16 &  4 & 543 &  12 Gb/s & 70 Gb/s & Memory & Surveys \\
 \circlenumber{B} & Stokes I    & N &  8 & 24 & 327 &  74 Gb/s & 63 Gb/s & CPU   & Surveys \\
 \circlenumber{C} & Stokes I    & N &  8 & 64 & 155 & 198 Gb/s & 30 Gb/s & CPU   & Surveys \\  
-\circlenumber{D} & Stokes IQUV & Y & - & 24 &  13 &  74 Gb/s & 80 Gb/s & I/O   & Known sources \\
+\circlenumber{D} & Stokes IQUV & Y & - & 24 &  13 &  74 Gb/s & 81 Gb/s & I/O   & Known sources \\
 \circlenumber{E} & Stokes IQUV & Y & - & 64 &  10 & 198 Gb/s & 62 Gb/s & I/O   & Known sources \\
 \circlenumber{F} & Stokes I    & Y & 1 & 64 &  42 & 198 Gb/s & 65 Gb/s & CPU   & Known sources 
 \end{tabular}
 \caption{Several highlighted cases.}
 \label{table:cases}
+\vspace{-0.7cm}
 \end{table}
 
-\begin{figure}[t]
-\begin{minipage}[t]{0.52\textwidth}
-\includegraphics[width=\textwidth]{stations-beams.pdf}
-\caption{The maximum number of beams that can be created in various configurations.}
-\label{fig:stations-beams}
-\end{minipage}
-\hfill
-\begin{minipage}[t]{0.5\textwidth}
-\includegraphics[width=\textwidth]{execution_times.pdf}
-\caption{The time spent in the processing steps.}
-\label{fig:execution-times}
-\end{minipage}
-\end{figure}
 
 \subsection{System Load}
 
 We further analyse the workload of the compute cores by highlighting a set of cases, summarised in Table \ref{table:cases}. We will focus on a memory-bound case (\circlenumber{A}), which also creates the highest number of beams, on CPU-bound cases interesting for performing surveys, with either 24 stations (\circlenumber{B}) or 64 stations (\circlenumber{C}) as input. Cases \circlenumber{D} and \circlenumber{E} focus on high-resolution observations of known sources, and are I/O bound configurations with 24 and 64 stations, respectively. Case \circlenumber{F} focusses on the observations of known sources as well, using Stokes I output, which allows more beams to be created. Channel-level dedispersion is applied for all cases that observe known sources.
 
-The workload of the compute cores for each case is shown in Figure \ref{fig:execution-times}, which shows the average workload per core. For the CPU-bound cases \circlenumber{B} and \circlenumber{C}, the average load has to be lower than 100\% to recover from small delays in the processing, that can occur since the BG/P is not a real-time system. These fluctuations typically occur due to clashes within the BG/P torus network which is used for both all-to-all-exchanges, and cannot be avoided in all cases.
+The average workload of the compute cores for each case is shown in Figure \ref{fig:execution-times}. For the CPU-bound cases \circlenumber{B} and \circlenumber{C}, the average load has to be lower than 100\% to recover from small delays in the processing, that can occur since the BG/P is not a real-time system. These fluctuations typically occur due to clashes within the BG/P torus network which is used for both all-to-all-exchanges, and cannot be avoided in all cases.
 
 The cases where we create many beams (\circlenumber{A}\circlenumber{B}\circlenumber{C}) spend most of the cycles performing beam forming and calculation the Stokes I parameters. The beam forming scales with both the number of stations and the number of beams, while the Stokes I calculation costs depends solely on the number of beams. Case \circlenumber{A} has to beam form only four stations, and thus requires most of its time calculating the Stokes I parameters. Case \circlenumber{B} and \circlenumber{C} use more stations, and thus need more time to beam form.
 
 The costs for both the first and the second all-to-all exchange are mostly hidden due to overlaps with computation. The remaining cost for the second exchange is proportional to the output bandwidth required in each case.
 
-For the I/O-bound cases \circlenumber{D}\circlenumber{E}\circlenumber{F}, only a few tied-array beams are formed and transformed into Stokes I(QUV) parameters, which produces a lot of data but requires little CPU time. Enough CPU time is therefore available to include channel-level dedispersion, which scales with the number of beams and, as Figure \ref{fig:execution-times} shows, is an expensive operation.
+For the I/O-bound cases \circlenumber{D}\circlenumber{E}\circlenumber{F}, only a few tied-array beams are formed and transformed into Stokes I(QUV) parameters, which produces a lot of data but requires little CPU time. Enough CPU time is therefore available to include channel-level dedispersion, which scales with the number of beams and is an expensive operation.
+
+Figure \ref{fig:ionperf} shows the workload for the busiest I/O nodes in each case, including the time spent in the kernel to handle IRQs. Processing the station data and communicating with the compute cores cause most of the load. For cases \circlenumber{A} and \circlenumber{B}, the output is handled by other I/O nodes. Both cases form many beams, and thus require time to compute beam former weights. Cases \circlenumber{C}\circlenumber{D}\circlenumber{E}\circlenumber{F} do handle output, and show nearly identical behaviour.
 
 \section{Related Work}
 \label{Sec:related-work}
@@ -266,11 +311,11 @@ The LOFAR beam former is the only beam former capable of producing hundreds of t
 \section{Conclusions}
 \label{Sec:conclusions}
 
-We have shown the capabilities of our beam former pipelines, running in software on an IBM BlueGene/P supercomputer. Our system is capable of producing 13 tied-array beams at LOFAR's full observational bandwidth before our output limit of 80~Gb/s is met. Alternatively, it can form hundreds of beams at a reduced resolution, the exact number depending on the number of stations and the pipeline used. Finally, an incoherent beam can be created, which retains the wide field-of-view offered by our stations. None of these feats are possible with any other telescope.
+We have shown the capabilities of our beam former pipelines, running in software on an IBM BlueGene/P supercomputer. Our system is capable of producing 13 tied-array beams at LOFAR's full observational bandwidth before our output limit of 81~Gb/s is met. Alternatively, it can form hundreds of beams at a reduced resolution, the exact number depending on the number of stations and the pipeline used. Finally, an incoherent beam can be created, which retains the wide field-of-view offered by our stations. None of these feats are possible with any other telescope.
 
 The use of a software solution on powerful interconnected hardware is a key aspect in the development and deployment of our pipeline. Because we use software, rapid prototyping is cheap, allowing novel features to be tested to aid the exploration of the design space of a new instrument. The resulting pipelines retain the flexibility that software allows. The control flow and bookkeeping has become complex while remaining manageable through software abstraction. We are able to run the same station data through multiple pipelines in parallel, and even multiple independent observations in parallel, as long as there are enough available resources. The science which drives LOFAR, and which is driven by it, is greatly accelerated through the use of an easily reconfigurable instrument.
 
-The BG/P supercomputer provides us with enough computing power and powerful networks to be able to implement the signal processing and all-to-all-exchanges that we require, without having to resort to a dedicated system which inevitably curbs the design freedom that the supercomputer provides. As with any system, platform-specific parameters nevertheless become important when maximal performance is desired. Although a C reference implementation allowed us to quickly develop and test features, we needed handcrafted assembly to keep the double FPUs of each compute core as busy as possible.
+The BG/P supercomputer provides us with enough computing power and powerful networks to be able to implement the signal processing and all-to-all-exchanges that we require, without having to resort to a dedicated system which inevitably curbs the design freedom that the supercomputer provides. As with any system, platform-specific parameters nevertheless become important when maximal performance is desired. Although a C reference implementation allowed us to quickly develop and test features, we needed handcrafted assembly to keep the double FPUs of each compute core busy.
 
 The architecture of the BG/P makes some tasks more difficult as well. The fact that an I/O node can only communicate with its own compute cores prevents us from freely scheduling the workload. Instead, we had to manually route the data using two all-to-all exchanges in order to stream the data from and to the right I/O nodes. To achieve maximum performance, we tuned the distribution of the workload over the cores to avoid network collisions.
 
diff --git a/doc/papers/references.bib b/doc/papers/references.bib
index 3e7aa5fd836cd754cdb26cb876e5646250fcfbed..e2e336cc9f27687d83ffa9d3556205b5b4f0afbb 100644
--- a/doc/papers/references.bib
+++ b/doc/papers/references.bib
@@ -808,7 +808,7 @@
 		  Bruyn:02,
     title	= {{Exploring the Universe with the Low Frequency Array, A Scientific Case}},
     author	= {A.G. de Bruyn and others},
-    note	= {\newline http://www.lofar.org/PDF/NL-CASE-1.0.pdf},
+    note	= {http://www.lofar.org/PDF/NL-CASE-1.0.pdf},
     month	= {September},
     year	= {2002}
 }
@@ -1879,7 +1879,7 @@
     volume	= {49},
     number	= {2/3},
     pages	= {437--446},
-    month	= {March},
+    _month	= {March},
     year	= {2005}
 }
 
@@ -2399,8 +2399,8 @@
     pages       = {305--314},
     volume      = {I},
     organization= {CSREA},
-    address     = {Las Vegas, NV},
-    month       = {July},
+    _address    = {Las Vegas, NV},
+    _month      = {July},
     year        = {1997}
 }
 
@@ -2550,6 +2550,7 @@
 		  Romein:09a,
     title	= {{FCNP: Fast I/O on the Blue Gene/P}},
     author	= {J.W. Romein},
+    pages	= {225--231},
     booktitle	= PDPTA # { (PDPTA'09)},
     address	= {Las Vegas, NV},
     month	= {July},
@@ -2562,8 +2563,8 @@
     title	= {{The LOFAR Correlator: Implementation and Performance Analysis}},
     author	= {J.W. Romein and P.C. Broekema and J.D. Mol and R.V. van Nieuwpoort},
     booktitle	= PPOPP # { (PPoPP'10)},
-    address	= {Bangalore, India},
-    note	= {To appear},
+    _address	= {Bangalore, India},
+    pages	= {169--178},
     month	= {January},
     year	= {2010}
 }
@@ -3022,7 +3023,7 @@
 {
   		  Yoshii:10,
   title		= {{Performance and Scalability Evaluation of ``Big Memory'' on Blue Gene Linux}},
-  author	= {K. Yoshii and K. Iskra and H. Naik and P. Beckman, P.C. Broekema},
+  author	= {K. Yoshii and K. Iskra and H. Naik and P. Beckman and P.C. Broekema},
   journal	= IJHPC,
   note		= {To appear}
 }