diff --git a/doc/papers/2011/europar/coherent-dedispersion.jgr b/doc/papers/2011/europar/coherent-dedispersion.jgr
index 926f5383799adc26a475fa7748a8ab966c9a276e..d41ffbd5e744150e2c49e989341ade2b42ddd246 100644
--- a/doc/papers/2011/europar/coherent-dedispersion.jgr
+++ b/doc/papers/2011/europar/coherent-dedispersion.jgr
@@ -2,6 +2,7 @@ newgraph
   clip
   xaxis
     min 0 max 2
+    size 2.5
 
     no_auto_hash_labels
     (*
@@ -26,7 +27,7 @@ newgraph
     size 1.2
 
   legend
-    x 0.5 y 1.1
+    x 0 y 1.1
 
 newline
   label : No channel-level dedispersion
diff --git a/doc/papers/2011/europar/dispersed-signal.jgr b/doc/papers/2011/europar/dispersed-signal.jgr
index e43738454ffe427cef25575831b5d62863e36cbe..2ce50f2708aa61a71e5d1814897fc6f0c712a9bf 100644
--- a/doc/papers/2011/europar/dispersed-signal.jgr
+++ b/doc/papers/2011/europar/dispersed-signal.jgr
@@ -4,7 +4,7 @@ newgraph
     label : Time (ms)
     min 0
     max 4
-    size 2
+    size 1.5
     mhash 5
     no_auto_hash_labels
     shell : seq 0 2 | awk '{ printf "hash_label at %d : %.2f\n",2*$1,$1 * 1.88; }'
@@ -12,7 +12,7 @@ newgraph
     label : Frequency (MHz)
     hash 0
     min -0.2
-    size 2
+    size 1.5
     shell : seq 0 3 | awk '{ f = (512 + 200 + $1 * 1/16/3)*200/1024; printf "hash_label at %d : %.3f\n",$1,f; }'
 
 newline
@@ -43,14 +43,14 @@ copycurve
     shell : ./dispersed-signal-data-2.sh 3
 
 newgraph
-  x_translate 2.5
+  x_translate 2
 
   clip
   xaxis
     label : Time (ms)
     min 0
     max 4
-    size 2
+    size 1.5
     mhash 5
     no_auto_hash_labels
     shell : seq 0 2 | awk '{ printf "hash_label at %d : %.2f\n",2*$1,$1 * 1.88; }'
@@ -58,7 +58,7 @@ newgraph
     label : Frequency (MHz)
     hash 0
     min -0.2
-    size 2
+    size 1.5
     shell : seq 0 3 | awk '{ f = (512 + 200 + $1 * 1/16/3)*200/1024; printf "hash_label at %d : %.3f\n",$1,f; }'
     nodraw
 
diff --git a/doc/papers/2011/europar/lofar.pdf b/doc/papers/2011/europar/lofar.pdf
index 784d90fd7671bb8d56ae5b06263c27f1e46a2488..cf15506ee97f911ece867d53e672a839320e00ed 100644
Binary files a/doc/papers/2011/europar/lofar.pdf and b/doc/papers/2011/europar/lofar.pdf differ
diff --git a/doc/papers/2011/europar/lofar.tex b/doc/papers/2011/europar/lofar.tex
index cf73924439136756b9f02b9ad336f3944ef18495..d397afcf9f0c3e7899cb056f5cb42e5fcca202fb 100644
--- a/doc/papers/2011/europar/lofar.tex
+++ b/doc/papers/2011/europar/lofar.tex
@@ -122,7 +122,7 @@ We use an IBM BlueGene/P (BG/P) supercomputer for the real-time processing of st
 
 \subsection{System Description}
 
-Our system consists of 3 racks, with 12,480 processor cores that provide 42.4 TFLOPS peak processing power. One chip contains four PowerPC~450 cores, running at a modest 850~Mhz clock speed to reduce power consumption and to increase package density. Each core has two floating-point units (FPUs) that provide support for operations on complex numbers. The chips are organised in \emph{psets}, each of which consists of 64 cores for computation (\emph{compute cores}) and one chip for communication (\emph{I/O node}). Each compute core runs a fast, simple, single-process kernel (the Compute Node Kernel, or CNK), and has access to 512 MiB of memory. The I/O nodes consist of the same hardware as the compute nodes, but additionally have a 10~Gb/s Ethernet interface connected. Also, they run Linux, which allows the I/O nodes to do full multitasking. One rack contains 64 psets, which is equal to 4096 compute cores and 64 I/O nodes.
+Our system consists of 3 racks, with 12,480 processor cores that provide 42.4 TFLOPS peak processing power. One chip contains four PowerPC~450 cores, running at a modest 850~MHz clock speed to reduce power consumption and to increase package density. Each core has two floating-point units (FPUs) that provide support for operations on complex numbers. The chips are organised in \emph{psets}, each of which consists of 64 cores for computation (\emph{compute cores}) and one chip for communication (\emph{I/O node}). Each compute core runs a fast, simple, single-process kernel, and has access to 512 MiB of memory. The I/O nodes consist of the same hardware as the compute nodes, but additionally have a 10~Gb/s Ethernet interface connected. They run Linux, which allows the I/O nodes to do full multitasking. One rack contains 64 psets, which is equal to 4096 compute cores and 64 I/O nodes.
 
 The BG/P contains several networks. A fast \emph{3-dimensional torus\/} connects all compute nodes and is used for point-to-point and all-to-all communications over 3.4~Gb/s links. The torus uses DMA to offload the CPUs and allows asynchronous communication. The \emph{collective network\/} is used for communication within a pset between an I/O node and the compute nodes, using 6.8~Gb/s links. In both networks, data is routed through compute nodes using a shortest path. Additional networks exist for fast barriers, initialization, diagnostics, and debugging.
 
@@ -276,7 +276,10 @@ In the Stokes I mode, we applied several integration factors (1, 2, 4, 8, and 12
 
 \subsection{System Load}
 
-\begin{table}
+We further analyse the workload of the compute cores by highlighting a set of cases, summarised in Table~\ref{table:cases}. We will focus on a memory-bound case (A), which also creates the highest number of beams, and on CPU-bound cases interesting for performing surveys, with either 24 stations (B) or 64 stations (C) as input. Cases D and E focus on high-resolution observations of known sources, and are I/O bound configurations with 24 and 64 stations, respectively. Case F focuses on the observations of known sources as well, using Stokes I output, which allows more beams to be created. Channel-level dedispersion is applied for all cases that observe known sources.
+
+\begin{table}[ht]
+\centering
 \begin{tabular}{l|l|r|r|r|r|r|r|l|l}
 Case & Mode & Channel & Int. & Stations & Beams  & Input & Output & Bound & Used for \\
      &      & dedisp. & factor      &          &        & rate  & rate   &       & \\
@@ -307,8 +310,6 @@ F & Stokes I    & Y & 1 & 64 &  42 & 198 Gb/s & 65 Gb/s & CPU   & Known sources
 \end{minipage}
 \end{figure}
 
-We further analyse the workload of the compute cores by highlighting a set of cases, summarised in Table \ref{table:cases}. We will focus on a memory-bound case (A), which also creates the highest number of beams, on CPU-bound cases interesting for performing surveys, with either 24 stations (B) or 64 stations (C) as input. Cases D and E focus on high-resolution observations of known sources, and are I/O bound configurations with 24 and 64 stations, respectively. Case F focusses on the observations of known sources as well, using Stokes I output, which allows more beams to be created. Channel-level dedispersion is applied for all cases that observe known sources.
-
 The workload of the compute cores for each case is shown in Figure \ref{fig:execution-times}, which shows the average workload per core. For the CPU-bound cases B and C, the average load has to be lower than 100\% in order to prevent fluctuations from slowing down our real-time system. These fluctuations typically occur due to clashes within the BG/P 3D-torus network which is used for both all-to-all-exchanges, and cannot be avoided in all cases.
 
 The cases which create many beams (A-C) spend most of the cycles performing beam forming and calculation the Stokes I parameters. The beamforming scales with both the number of stations and the number of beams, while the Stokes I calculation costs depends solely on the number of beams. Case A has to beam form only four stations, and thus requires most of its time calculating the Stokes I parameters. Case B and C use more stations, and thus need more time to beam form.