diff --git a/doc/papers/2011/europar/TODO b/doc/papers/2011/europar/TODO
index 6df2fa74a970d5f22d6e268bf92f1293f190f5ad..8d2d22d7b7a5c2df48e42081d2ed8cc5e1ad8bf1 100644
--- a/doc/papers/2011/europar/TODO
+++ b/doc/papers/2011/europar/TODO
@@ -11,4 +11,4 @@ V 12 -> 16 int factor in grafieken en tekst
 
 V c vs asm beamformer speedup = 16
 
-- ionproc stats & graph
+V ionproc stats & graph
diff --git a/doc/papers/2011/europar/execution_times.jgr b/doc/papers/2011/europar/execution_times.jgr
index a18fac2193b6e7738facfea16c3153835a7203bc..31602fcee8b4ce2a78f457aaa77d2d80a3f7c6d4 100644
--- a/doc/papers/2011/europar/execution_times.jgr
+++ b/doc/papers/2011/europar/execution_times.jgr
@@ -7,13 +7,13 @@
 	5 = IQUV, 64s, 10b
         6 = I, 4i, 64s, 42b
 
-pre_bf_dsp_&_I/O  ->  3.44 %    4.69 %    7.70 %     4.07 %    6.72 %    7.04 %
-beam_forming	  -> 14.36 %   49.19 %   67.27 %     2.19 %    4.56 %   17.50 %
-coh_dedispersion  ->     0 %       0 %       0 %     2.76 %    2.12 %    9.18 %
-stokes		  -> 45.21 %   26.98 %   13.32 %     0.27 %    0.20 %    0.66 %
-2nd_xpose	  ->  5.87 %    2.73 %    0.85 %     2.88 %    3.40 %    3.37 %
-stokes_reorder    ->  4.99 %    4.44 %    2.11 %     5.77 %    4.45 %    4.68 %
 output_I/O        ->  1.45 %    0.48 %    0.20 %     0.80 %    0.36 %    0.38 %
+stokes_reorder    ->  4.99 %    4.44 %    2.11 %     5.77 %    4.45 %    4.68 %
+2nd_xpose	  ->  5.87 %    2.73 %    0.85 %     2.88 %    3.40 %    3.37 %
+stokes		  -> 45.21 %   26.98 %   13.32 %     0.27 %    0.20 %    0.66 %
+coh_dedispersion  ->     0 %       0 %       0 %     2.76 %    2.12 %    9.18 %
+beam_forming	  -> 14.36 %   49.19 %   67.27 %     2.19 %    4.56 %   17.50 %
+pre_bf_dsp_&_I/O  ->  3.44 %    4.69 %    7.70 %     4.07 %    6.72 %    7.04 %
 
 The above data is read from this file and accumulated using the following command, where N is the line number we need:
 
@@ -53,55 +53,61 @@ Case (see Table 1)
         *)
 
 	newcurve
-	(* output core pure I/O *)
-	marktype xbar marksize 0.8 fill 0 pattern solid cfill 1 0 0
+	(* input core I/O + prebf dsp *)
+	marktype xbar marksize 0.8 fill 0 pattern solid cfill 0.494117618 0 0.129411757
+
 	pts
         shell : awk -v N=7 '/->/ {a+=$3;b+=$5;c+=$7;d+=$9;e+=$11;f+=$13;n++; if(n==N) print "1 " a " 2 " b " 3 " c " 4 " d " 5 " e " 6 " f;}' <execution_times.jgr
-	label : Output to I/O node
+	label : 1st all-to-all exchange & \
+input handling
 
 	newcurve
-	(* output reorder *)
-	marktype xbar marksize 0.8 fill 0.8 pattern solid cfill 1 1 0
+	(* beam forming *)
+	marktype xbar marksize 0.8 fill 0.8 pattern solid cfill 1 .5 1
 	pts
         shell : awk -v N=6 '/->/ {a+=$3;b+=$5;c+=$7;d+=$9;e+=$11;f+=$13;n++; if(n==N) print "1 " a " 2 " b " 3 " c " 4 " d " 5 " e " 6 " f;}' <execution_times.jgr
-	label : Output reordering
+	label : Beam forming
 
 	newcurve
-	(* 2nd xpose *)
-	marktype xbar marksize 0.8 fill 1 pattern solid cfill 0 0.7 0
+	(* coh dd *)
+	marktype xbar marksize 0.8 fill 1 pattern solid cfill 0 0.270588219 0.525490165
+
 	pts
         shell : awk -v N=5 '/->/ {a+=$3;b+=$5;c+=$7;d+=$9;e+=$11;f+=$13;n++; if(n==N) print "1 " a " 2 " b " 3 " c " 4 " d " 5 " e " 6 " f;}' <execution_times.jgr
-	label : 2nd all-to-all exchange
+	label : Channel dedispersion \
+(cases D-F)
 
 	newcurve
 	(* stokes calculation *)
-	marktype xbar marksize 0.8 pattern stripe 45 cfill 0 1 1
+	marktype xbar marksize 0.8 pattern solid cfill 1 0.258823514 0.0549019575
+
 	pts
         shell : awk -v N=4 '/->/ {a+=$3;b+=$5;c+=$7;d+=$9;e+=$11;f+=$13;n++; if(n==N) print "1 " a " 2 " b " 3 " c " 4 " d " 5 " e " 6 " f;}' <execution_times.jgr
 	label : Stokes calculations
 
 	newcurve
-	(* coh dd *)
-	marktype xbar marksize 0.8 fill 0.2 pattern solid cfill 1 0 1
+	(* 2nd xpose *)
+	marktype xbar marksize 0.8 fill 0.2 pattern solid cfill 1 0.827450931 0.125490189
+
 	pts
         shell : awk -v N=3 '/->/ {a+=$3;b+=$5;c+=$7;d+=$9;e+=$11;f+=$13;n++; if(n==N) print "1 " a " 2 " b " 3 " c " 4 " d " 5 " e " 6 " f;}' <execution_times.jgr
-	label : Channel dedispersion \
-(cases D-F)
+	label : 2nd all-to-all exchange
 
 	newcurve
-	(* beam forming *)
-	marktype xbar marksize 0.8 fill 0.5 pattern solid cfill 0 0 1
+	(* output reorder *)
+	marktype xbar marksize 0.8 fill 0.5 pattern solid cfill 0.34117645 0.615686238 0.109803915
+
 	pts
         shell : awk -v N=2 '/->/ {a+=$3;b+=$5;c+=$7;d+=$9;e+=$11;f+=$13;n++; if(n==N) print "1 " a " 2 " b " 3 " c " 4 " d " 5 " e " 6 " f;}' <execution_times.jgr
-	label : Beam forming
+	label : Output reordering
 
 	newcurve
-	(* input core I/O + prebf dsp *)
-	marktype xbar marksize 0.8 fill 0.5 pattern stripe -45 cfill 0 0.7 0
+	(* output core pure I/O *)
+	marktype xbar marksize 0.8 fill 0.5 pattern solid cfill 0.51372546 0.792156816 1
+
 	pts
         shell : awk -v N=1 '/->/ {a+=$3;b+=$5;c+=$7;d+=$9;e+=$11;f+=$13;n++; if(n==N) print "1 " a " 2 " b " 3 " c " 4 " d " 5 " e " 6 " f;}' <execution_times.jgr
-	label : 1st all-to-all exchange & \
-input handling
+	label : Output to I/O node
 
   	legend top defaults hjl linelength 75 x 4 y 110
 
diff --git a/doc/papers/2011/europar/ionperf.jgr b/doc/papers/2011/europar/ionperf.jgr
index 6ddfca3eb11f9f547071d72d32802eb87cb386aa..7e567d22c3a261f89ac8fff8d5979c6c655a9642 100644
--- a/doc/papers/2011/europar/ionperf.jgr
+++ b/doc/papers/2011/europar/ionperf.jgr
@@ -154,35 +154,40 @@ Case (see Table 1)
 
 	newcurve
 	(* station->ion *)
-	marktype xbar marksize 0.8 fill 0.5 pattern stripe -45 cfill 0 0.7 0
+	marktype xbar marksize 0.8 fill 0.5 pattern solid cfill 0 0.270588219 0.525490165
+
 	pts
         shell : awk -v N=5 '/->/ {a+=$3;b+=$5;c+=$7;d+=$9;e+=$11;f+=$13;n++; if(n==N) print "1 " a " 2 " b " 3 " c " 4 " d " 5 " e " 6 " f;}' <ionperf.jgr
 	label hjl x 1 y 105 : Input from station
 
 	newcurve
 	(* delaycomp *)
-	marktype xbar marksize 0.8 fill 0.5 pattern solid cfill 0 0 1
+	marktype xbar marksize 0.8 fill 0.5 pattern solid cfill 1 0.258823514 0.0549019575
+
 	pts
         shell : awk -v N=4 '/->/ {a+=$3;b+=$5;c+=$7;d+=$9;e+=$11;f+=$13;n++; if(n==N) print "1 " a " 2 " b " 3 " c " 4 " d " 5 " e " 6 " f;}' <ionperf.jgr
 	label hjl x 1 y 100 : Weight computations
 
 	newcurve
 	(* ion<->cn *)
-	marktype xbar marksize 0.8 pattern stripe 45 cfill 0 1 1
+	marktype xbar marksize 0.8 pattern solid cfill 1 0.827450931 0.125490189
+        
 	pts
         shell : awk -v N=3 '/->/ {a+=$3;b+=$5;c+=$7;d+=$9;e+=$11;f+=$13;n++; if(n==N) print "1 " a " 2 " b " 3 " c " 4 " d " 5 " e " 6 " f;}' <ionperf.jgr
 	label hjl x 1 y  95 : I/O with compute cores
 
 	newcurve
 	(* ion->storage *)
-	marktype xbar marksize 0.8 fill 0.2 pattern solid cfill 1 0 1
+	marktype xbar marksize 0.8 fill 0.2 pattern solid cfill 0.34117645 0.615686238 0.109803915
+ 
 	pts
         shell : awk -v N=2 '/->/ {a+=$3;b+=$5;c+=$7;d+=$9;e+=$11;f+=$13;n++; if(n==N) print "1 " a " 2 " b " 3 " c " 4 " d " 5 " e " 6 " f;}' <ionperf.jgr
 	label hjl x 5 y 105 : Output to storage
 
 	newcurve
 	(* interrupts *)
-	marktype xbar marksize 0.8 fill 1 pattern solid cfill 0 0.7 0
+	marktype xbar marksize 0.8 fill 1 pattern solid cfill 0.51372546 0.792156816 1
+
 	pts
         shell : awk -v N=1 '/->/ {a+=$3;b+=$5;c+=$7;d+=$9;e+=$11;f+=$13;n++; if(n==N) print "1 " a " 2 " b " 3 " c " 4 " d " 5 " e " 6 " f;}' <ionperf.jgr
 	label hjl x 5 y 100 : IRQ handling
diff --git a/doc/papers/2011/europar/lofar.pdf b/doc/papers/2011/europar/lofar.pdf
index e00393c58c6fcce445c72279337d57effe0af01c..d61299958ce2e164968d4e914e9c1976fd65af26 100644
Binary files a/doc/papers/2011/europar/lofar.pdf and b/doc/papers/2011/europar/lofar.pdf differ
diff --git a/doc/papers/2011/europar/lofar.tex b/doc/papers/2011/europar/lofar.tex
index 0652d9643a7241c0543bbebfa1295c481f18b8e3..1636385b5519387f42cbdfae57a005b9b9b212ae 100644
--- a/doc/papers/2011/europar/lofar.tex
+++ b/doc/papers/2011/europar/lofar.tex
@@ -132,15 +132,7 @@ The beam former combines the chunks from all stations, producing a chunk for eac
 
 %The delays are applied to the station data through complex multiplications and additions, programmed in assembly. In order to take full advantage of the L1 cache and the available registers, data is processed in sets of 6 stations, producing 3 beams, in portions of 128 samples, or a subset thereof to cover the remainders. While the exact ideal set size in which the data is to be processed is platform specific, we have shown in previous work that similar tradeoffs exist for similar problems across different architectures~\cite{Nieuwpoort:09}.
 
-\comment{
 \begin{listing}
-\caption{Pseudo code for the processing loops around the beam former assembly.}
-\label{lst:beam-forming}
-\end{listing}        
-}
-
-All time-consuming pipeline components are written in assembly, to achieve maximum performance.  The assembly code minimises the number of memory accesses, minimises load delays, minimises FPU pipeline stalls, and maximises instruction-level parallelism.  We learnt that optimal performance is often achieved by combining multiple iterations of a multi-dimensional loops:
-
 \lstset{language=pseudo}
 \begin{lstlisting}{}
 FOR Channel IN 1 .. NrChannels DO
@@ -149,8 +141,11 @@ FOR Channel IN 1 .. NrChannels DO
       FOR Beam IN 1 .. NrBeams STEP 3 DO
         BeamForm6StationsAnd128TimesTo3BeamsAssembly(...)
 \end{lstlisting}
+\caption{Pseudo code for the processing loops around the beam former assembly.}
+\label{lst:beam-forming}
+\end{listing}        
 
-This is much more efficient than to create all beams one at a time, due to better reuse of data loaded from main memory.  Finding the most efficient way to group work is a combination of careful analysis and, unfortunately, trial-and-error. The coherent beam former achieves 85\% of the FPU peak performance, not as high as the 96\% of the correlator~\cite{Romein:10a}, but still 16 times more than the C++ reference implementation. 
+All time-consuming pipeline components are written in assembly, to achieve maximum performance.  The assembly code minimises the number of memory accesses, minimises load delays, minimises FPU pipeline stalls, and maximises instruction-level parallelism.  We learnt that optimal performance is often achieved by combining multiple iterations of a multi-dimensional loop, as shown in Listing~\ref{lst:beam-forming}. This is much more efficient than creating all beams one at a time, due to better reuse of data loaded from main memory.  Finding the most efficient way to group work is a combination of careful analysis and, unfortunately, trial-and-error. The coherent beam former achieves 86\% of the FPU peak performance, not as high as the 96\% of the correlator~\cite{Romein:10a}, but still 16 times more than the C++ reference implementation. 
 %Because each beam is an accumulation of the data from all stations, the bandwidth of each beam is equal to the bandwidth of data from a single station, which is 6.2~Gb/s now that the samples are 32-bit floats. Once the beams are formed, they are kept as complex voltages or transformed into the Stokes IQUV or the Stokes I parameters. In the latter case, the beams can also be integrated temporally to reduce the resulting data rate. Finally, an incoherent beam can be created in parallel, and converted into either Stokes I or Stokes IQUV parameters.
 
 %Our beam former supports several pipelines: \emph{complex voltages}, \emph{Stokes IQUV}, and \emph{Stokes I}. The complex voltages pipeline outputs the raw tied-array beams, which consist of two 3.1~Gb/s streams of 32-bit complex floating points numbers (floats), one stream for each polarisation. The Stokes IQUV pipeline applies a domain transformation to each sample of the raw tied-array beams, which is useful for polarisation-related studies. The four Stokes parameters, calculated through $I = X\overline{X} + Y\overline{Y}$, $Q = X\overline{X} - Y\overline{Y}$, $U = 2\mathrm{Re}(X\overline{Y})$, $V = 2\mathrm{Im}(X\overline{Y})$, with each parameter being a 32-bit float, resulting in four 1.5~Gb/s streams. The Stokes I pipeline calculates only the first Stokes parameter, which represents the signal strength in both polarisations. The Stokes I pipeline supports temporal integration to trade time resolution for a reduced bandwidth per beam, allowing more beams to be created.
@@ -217,7 +212,6 @@ We will focus our performance analysis on the most challenging cases that are of
 \vspace{-1cm}
 \end{wrapfigure}
 
-% TODO: getallen kloppen niet.. 13 beams is 80.6 Gb/s, en met 70 Gb/s zouden we 11 beams aan moeten kunnen
 Figure \ref{fig:stations-beams} shows the maximum number of beams that can be created when using a various number of stations, in each of the three pipelines: complex voltages, Stokes IQUV, and Stokes I. Both the complex voltages and the Stokes IQUV pipelines are I/O bound. Each beam is 6.2~Gb/s wide. We can make at most 13 beams without exceeding the available 81~Gb/s to our storage cluster. If 64 stations are used, the available bandwidth is 70~Gb/s due to the fact that an I/O node can only output 1.1~Gb/s if it also has to process station data. The granularity with which the output can be distributed over the I/O nodes, as well as scheduling details, determine the actual number of beams that can be created, but in all cases, the beam former can create at least 10 beams at LOFAR's full observational bandwidth.
 
 In the Stokes I pipeline, we applied several integration factors (1, 2, 4, 8, and 16) in order to show the trade-off between beam quality and the number of beams. Integration factors higher than 16 does not allow significantly more beams to be created, but could be used in order to further reduce the total output rate. For low integration factors, the beam former is again limited by the available output bandwidth. Once the Stokes I streams are integrated sufficiently, the system becomes bounded by the compute nodes: if only signals from a few stations have to be combined, the beam former is limited by the amount of available memory required to store the beams. If more input has to be combined, the beam former becomes limited by the CPU power available in the compute cores. For observations for which a high integration factor is acceptable, the beam former is able to create 155 up to 543 tied-array beams, depending on the number of stations used. For observations that need a high time resolution and thus a low integration factor, the beam former is still able to create at least 42 tied-array beams.
@@ -276,7 +270,7 @@ Case & Mode & Channel & Int. & Stations & Beams  & Input & Output & Bound & Used
 \circlenumber{C} & Stokes I    & N &  8 & 64 & 155 & 198 Gb/s & 30 Gb/s & CPU   & Surveys \\  
 \circlenumber{D} & Stokes IQUV & Y & - & 24 &  13 &  74 Gb/s & 81 Gb/s & I/O   & Known sources \\
 \circlenumber{E} & Stokes IQUV & Y & - & 64 &  10 & 198 Gb/s & 62 Gb/s & I/O   & Known sources \\
-\circlenumber{F} & Stokes I    & Y & 1 & 64 &  42 & 198 Gb/s & 65 Gb/s & CPU   & Known sources 
+\circlenumber{F} & Stokes I    & Y & 1 & 64 &  42 & 198 Gb/s & 65 Gb/s & I/O   & Known sources 
 \end{tabular}
 \caption{Several highlighted cases.}
 \label{table:cases}
@@ -296,7 +290,7 @@ The costs for both the first and the second all-to-all exchange are mostly hidde
 
 For the I/O-bound cases \circlenumber{D}\circlenumber{E}\circlenumber{F}, only a few tied-array beams are formed and transformed into Stokes I(QUV) parameters, which produces a lot of data but requires little CPU time. Enough CPU time is therefore available to include channel-level dedispersion, which scales with the number of beams and is an expensive operation.
 
-Figure \ref{fig:ionperf} shows the workload for the busiest I/O nodes in each case, including the time spent in the kernel to handle IRQs. Processing the station data and communicating with the compute cores cause most of the load. For cases \circlenumber{A} and \circlenumber{B}, the output is handled by other I/O nodes. Both cases form many beams, and thus require time to compute beam former weights. Cases \circlenumber{C}\circlenumber{D}\circlenumber{E}\circlenumber{F} do handle output, and show nearly identical behaviour.
+Figure \ref{fig:ionperf} shows the workload for the busiest I/O nodes in each case, including the time spent in the kernel to handle IRQs. Processing the station data and communicating with the compute cores cause most of the load. For cases \circlenumber{A} and \circlenumber{B}, the output is handled by other I/O nodes. Both cases form many beams, and thus require time to compute the positional weights that are required by the beam former. Cases \circlenumber{C}\circlenumber{D}\circlenumber{E}\circlenumber{F} do handle output, and show nearly identical behaviour.
 
 \section{Related Work}
 \label{Sec:related-work}
@@ -317,7 +311,7 @@ The use of a software solution on powerful interconnected hardware is a key aspe
 
 The BG/P supercomputer provides us with enough computing power and powerful networks to be able to implement the signal processing and all-to-all-exchanges that we require, without having to resort to a dedicated system which inevitably curbs the design freedom that the supercomputer provides. As with any system, platform-specific parameters nevertheless become important when maximal performance is desired. Although a C reference implementation allowed us to quickly develop and test features, we needed handcrafted assembly to keep the double FPUs of each compute core busy.
 
-The architecture of the BG/P makes some tasks more difficult as well. The fact that an I/O node can only communicate with its own compute cores prevents us from freely scheduling the workload. Instead, we had to manually route the data using two all-to-all exchanges in order to stream the data from and to the right I/O nodes. To achieve maximum performance, we tuned the distribution of the workload over the cores to avoid network collisions.
+The architecture of the BG/P makes some tasks more difficult as well. The fact that an I/O node can only communicate with its own compute cores prevents us from freely scheduling the workload. Instead, we have to manually route the data using two all-to-all exchanges in order to stream the data from and to the right I/O nodes. To achieve maximum performance, we tuned the distribution of the workload over the cores to avoid network collisions.
 
 \bibliographystyle{plain}
 \bibliography{lofar}
diff --git a/doc/papers/2011/europar/stations-beams.jgr b/doc/papers/2011/europar/stations-beams.jgr
index 2ec46bb4d487d4241b2575988dad6d5f61ffd38f..c7a2bfa598bcdbaa67f6c32997ddf23030de8ce8 100644
--- a/doc/papers/2011/europar/stations-beams.jgr
+++ b/doc/papers/2011/europar/stations-beams.jgr
@@ -26,7 +26,6 @@ newgraph
     hash_label at 17 : 289
     hash_label at 19 : 361
     hash_label at 21 : 441
-    hash_label at 23 : 529
 (*
     hash_label at 1 : 1
     hash_label at 2 : 4
@@ -54,7 +53,7 @@ newgraph
     hash_label at 24 : 576
 *)
     min 0
-    max 24
+    max 22
 
 newline
   linetype dotted
@@ -188,6 +187,7 @@ newstring : Stokes I, \
   x 2 y 18.2
   hjl vjc
 
+(*
 newline
   linetype dotted
   linethickness 2.0
@@ -197,15 +197,19 @@ newline
       8 20.15 (* 406 *)
       12 19.60 (* 384 *)
       16 19.60 (* 384 *)
+*)
 
 newline
   linetype solid
   linethickness 2.0
   color 1 0 1
   pts
+      24 17.03 (* 290 *)
+  (*
       16 19.60 (* 384 *)
       20 18.97 (* 360 *)
       24 18.08 (* 327 *)
+  *)    
       28 17.26 (* 298 *)
       32 16.43 (* 270 *)
       36 15.84 (* 251 *)
@@ -221,6 +225,7 @@ newstring : Stokes I, 16x integration
   x 15 y 22
   hjl vjc
 
+  (*
 newline
   linetype dotted
   linethickness 2.0
@@ -228,16 +233,23 @@ newline
   pts
       4 23.26 (* 541 *)
       8 23.17 (* 537 *)
+  *)    
 
 newline  
   linetype solid
   linethickness 2.0
   color 0 1 1
   pts
+      4 21.21 (* 450 *)
+      8 21.21 (* 450 *)
+      12 20.62 (* 425 *)
+      16 19.75 (* 390 *)
+  (*
       8 23.17 (* 537 *)
       12 21.84 (* 477 *)
       16 20.37 (* 415 *)
       20 18.97 (* 360 *)
+  *)    
       24 18.08 (* 327 *)
       28 17.26 (* 298 *)
       32 16.43 (* 270 *)