diff --git a/doc/papers/2009/PDPTA-09/FCNP-read.fig b/doc/papers/2009/PDPTA-09/FCNP-read.fig
index 044b348be5a4452cb5c22da3c47a3adc47ed4fb9..cda86c5ceda18339d0c3404ca63086f763837557 100644
--- a/doc/papers/2009/PDPTA-09/FCNP-read.fig
+++ b/doc/papers/2009/PDPTA-09/FCNP-read.fig
@@ -32,8 +32,8 @@ Single
 	 3150 3060 6750 3960
 2 1 0 1 14 7 50 -1 -1 0.000 0 0 -1 0 0 2
 	 3150 5400 3150 900
-4 1 14 50 -1 20 24 0.0000 4 285 1245 3150 5805 I/O node\001
-4 1 14 50 -1 20 24 0.0000 4 375 2070 6750 5805 compute node\001
 4 1 4 50 -1 20 24 0.2527 4 375 1815 4950 1395 read request\001
 4 1 4 50 -1 20 24 6.0305 4 285 1245 4950 2565 read ack\001
 4 1 1 50 -1 20 24 6.0305 4 285 1350 4950 3375 user data\001
+4 0 14 50 -1 20 24 0.0000 4 285 1245 2700 5805 I/O node\001
+4 2 14 50 -1 20 24 0.0000 4 375 2070 7200 5805 compute node\001
diff --git a/doc/papers/2009/PDPTA-09/FCNP-write.fig b/doc/papers/2009/PDPTA-09/FCNP-write.fig
index be17d31bedcbed15e980ec0f08e0224ab4513d76..5bd12f4a18093569f9858a012d0db3340690a215 100644
--- a/doc/papers/2009/PDPTA-09/FCNP-write.fig
+++ b/doc/papers/2009/PDPTA-09/FCNP-write.fig
@@ -34,5 +34,5 @@ Single
 	 6750 5400 6750 900
 4 1 4 50 -1 20 24 0.2527 4 375 1860 4950 1395 write request\001
 4 1 4 50 -1 20 24 6.0305 4 285 1290 4950 2565 write ack\001
-4 1 14 50 -1 20 24 0.0000 4 285 1245 3150 5805 I/O node\001
-4 1 14 50 -1 20 24 0.0000 4 375 2070 6750 5805 compute node\001
+4 2 14 50 -1 20 24 0.0000 4 375 2070 7245 5805 compute node\001
+4 0 14 50 -1 20 24 0.0000 4 285 1245 2700 5805 I/O node\001
diff --git a/doc/papers/2009/PDPTA-09/Makefile b/doc/papers/2009/PDPTA-09/Makefile
index 0f5f9998d969a84dafb032ddeb99160ea5b9709f..f562cb0ae0a178e56485d1af906c19cc2c2ba9fb 100644
--- a/doc/papers/2009/PDPTA-09/Makefile
+++ b/doc/papers/2009/PDPTA-09/Makefile
@@ -16,7 +16,8 @@ SVG_SOURCES =	ionode-load.svg
 STY_SOURCES =	
 
 AUX_FILES =	$(TEX_SOURCES:%.tex=%.aux)
-FIGURES =	$(FIG_SOURCES:%.fig=%.pdf) $(JGR_SOURCES:%.jgr=%.pdf) $(SVG_SOURCES:%.svg=%.pdf)
+FIGURES =	$(FIG_SOURCES:%.fig=%.pdf) $(JGR_SOURCES:%.jgr=%.pdf)\
+		$(JPG_SOURCES) $(SVG_SOURCES:%.svg=%.pdf)
 
 GEN_EXT =	bbl blg dvi idx ilg ind lof log lot ps toc ps_pages
 GEN_FILES =	$(AUX_FILES) $(FIGURES) $(GEN_EXT:%=fcnp.%) mfput.log\
diff --git a/doc/papers/2009/PDPTA-09/cn-to-ion-performance.jgr b/doc/papers/2009/PDPTA-09/cn-to-ion-performance.jgr
index fcf0baf2ada4f4f13919718f0f099f8ef6d3d633..15ae5ed917df5a034cdfa50961272fd087ad93a1 100644
--- a/doc/papers/2009/PDPTA-09/cn-to-ion-performance.jgr
+++ b/doc/papers/2009/PDPTA-09/cn-to-ion-performance.jgr
@@ -41,7 +41,7 @@ newgraph
       : message size (bytes)
 
   yaxis
-    size 2.5
+    size 2.0
     hash_labels
       font Helvetica
     label
diff --git a/doc/papers/2009/PDPTA-09/fcnp.tex b/doc/papers/2009/PDPTA-09/fcnp.tex
index debf69e6515b46834f657f1fbce5ce2699871aef..3722f1e270c7e62fe01ca9d629dc67750bf199de 100644
--- a/doc/papers/2009/PDPTA-09/fcnp.tex
+++ b/doc/papers/2009/PDPTA-09/fcnp.tex
@@ -1,8 +1,20 @@
-\documentclass[journal]{IEEEtran}
+%\documentclass[conference]{IEEEtran}
+\documentclass[conference]{worldcomp}
+
+\usepackage[hmargin=15mm,vmargin=1in]{geometry}
+\usepackage[american]{babel}
+\usepackage[T1]{fontenc}
+\usepackage{times}
+\usepackage{caption}
+\usepackage{textcomp}
+%\usepackage{epsfig,graphicx}
+\usepackage{amsfonts,amsmath,amssymb}
+%\usepackage{fixltx2e} % Fixing numbering problem when using figure/table* 
+\usepackage{booktabs}
 
 \usepackage{cite, graphicx, subfigure, listings, xspace, txfonts} 
 
-\begin{document}
+\columnsep 6mm
 
 \title{FCNP: Fast I/O on the Blue Gene/P}
 
@@ -14,6 +26,8 @@ Stichting ASTRON (Netherlands Institute for Radio Astronomy), Dwingeloo, The Net
 \newcommand{\us}{\,$\muup$s\xspace}
 \newcommand{\ns}{\,ns\xspace}
 
+\begin{document}
+
 \maketitle
 
 \begin{abstract}
@@ -37,8 +51,8 @@ so that the telescope can observe proportionally more sources or frequencies
 and becomes a much more efficient instrument.
 \end{abstract}
 
-\vspace{4mm}\noindent
-\textbf{Keywords:} low-overhead network protocol, IBM Blue Gene/P, LOFAR radio telescope
+\vspace{1em}\noindent
+\textbf{Keywords:} {\small low-overhead network protocol, IBM Blue Gene/P, LOFAR radio telescope}
 
 
 \section{Introduction}
@@ -66,6 +80,7 @@ By default, the I/O~nodes work transparently.
 However, the performance of I/O-intensive applications can improve
 significantly if a select part of the application or communication library
 (like PVFS) runs on I/O~nodes rather than compute nodes~\cite{Iskra:08}.
+
 We use the Blue Gene to process real-time radio-telescope data.
 Part of the application runs on the I/O~nodes, while the compute-intensive
 processing is done on compute nodes.
@@ -107,21 +122,28 @@ transport user data.
 We will show performance results and characterize the performance in terms
 of bandwidth, latency, and overhead.
 
+\addtocounter{figure}{1}
+\begin{figure*}[t]
+\includegraphics[width=\textwidth]{processing.pdf}
+\caption{LOFAR real-time signal processing.}
+\label{fig:processing}
+\end{figure*}
+
 FCNP is heavily used to process LOFAR telescope data.
 LOFAR~\cite{Butcher:04,deVos:09} is the first of a new generation of
 telescopes, that
-combines the signals of many thousands of simple, cheap, omni-directional
+combines the signals of tens of thousands of simple, cheap, omni-directional
 antennas rather than using expensive dishes.
 In several ways, it will be the largest telescope of the world.
 Another novel feature is that the data are processed in \emph{software\/} on a
-supercomputer~\cite{Romein:06}, where traditionally custom-built hardware is
+supercomputer~\cite{Romein:06,Romein:09b}, where traditionally custom-built hardware is
 used.
 The data are streamed at high bandwidths into the system and processed in real
 time.
 I/O~nodes receive the data and internally forward them to compute nodes.
 Standard system software does not provide sufficient bandwidth to handle
 2.1~Gb/s input and 0.58~Gb/s output per I/O~node, needed to meet the LOFAR
-specifications\footnote{\texttt{http://www.lofar.org/p/astronomy\_spec.htm}}.
+specifications~\cite{LOFAR_SPECS}.
 In contrast, FCNP achieves 3.1~Gb/s input and 1.2~Gb/s output bandwidth per
 I/O~node.
 The improved input data rate matches the absolute maximum that the telescope
@@ -136,13 +158,6 @@ precision), increasing the flexibility of the instrument.
 We developed FCNP to support streaming LOFAR data,
 but the ideas of this protocol are more widely applicable.
 
-\addtocounter{figure}{1}
-\begin{figure*}[t]
-\includegraphics[width=\textwidth]{processing.pdf}
-\caption{LOFAR real-time signal processing.}
-\label{fig:processing}
-\end{figure*}
-
 This paper is structured as follows.
 In Section~\ref{sec:LOFAR}, we describe the relevant parts of the LOFAR
 processing pipeline.
@@ -158,42 +173,26 @@ Section~\ref{sec:conclusions} concludes.
 \section{LOFAR processing}
 \label{sec:LOFAR}
 
-LOFAR is a new type of radio telescope, that combines the signals of many
+LOFAR is a new type of radio telescope that combines the signals of tens of
 thousands of antennas and processes the data centrally on a BG/P supercomputer.
 We briefly explain how LOFAR data are processed, other papers provide more
-details\cite{Romein:06,Iskra:08}.
-%\emph{Note to the reviewers: a survey paper that describes the entire real-time
-%processing system (including recent additions, focusing on performance and
-%scalability) is in preparation.
-%We feel, however, that FCNP should be published separately.}
-
-%\begin{figure}[h]
-%\epsfxsize=\columnwidth
-%\epsffile{station.eps}
-%\caption{A LOFAR station.}
-%\label{fig:station}
-%\end{figure}
+details~\cite[Sec.~2]{Romein:06},~\cite[Sec.~4--6]{Romein:09b}.
 
 \addtocounter{figure}{-2}
 \begin{figure}[h]
-%\begin{minipage}[b]{.3\textwidth}
 \begin{center}
 \includegraphics[width=.7\columnwidth]{station.jpg}
 \end{center}
-\caption{Part of a LOFAR station, showing some low-band antennas.}
+\caption{The low-band antennas of a LOFAR station.}
 \label{fig:station}
-%\end{minipage}
-%\hfill
-%\end{minipage}
 \end{figure}
 \addtocounter{figure}{1}
 
 Co-located groups of 48 or 96 dual-polarized low-band antennas (see
 Figure~\ref{fig:station}) and high-band receivers form a \emph{station},
 i.e.\ a virtual telescope.
-Construction of 36--54 Dutch stations and 8--20~European stations has started.
-\emph{Note to the reviewers: we are currently in a transition phase
-where pre-production test stations are replaced by final stations.}
+Construction of 36--54 Dutch stations and 8--20~European stations is well
+underway.
 Each station digitizes the antenna voltages and pre-processes the data using
 FPGAs~\cite{Gunst:07}.
 The FPGAs send UDP packets with station data over dedicated Wide-Area
@@ -206,9 +205,9 @@ The application that runs on the BG/P is called the \emph{correlator},
 although it does much more processing than correlating only.
 Figure~\ref{fig:processing} shows a simplified scheme of one of the processing
 pipelines: the standard imaging pipeline that creates sky images.
-An I/O~node receives 48,828 UDP packets per second of up to 8\,KB from one
+An I/O~node receives 48,828 UDP packets per second of up to 8\,KiB from one
 station.
-It copies the samples into a large, circular buffer that holds the most recent
+It copies the samples into a circular buffer that holds the most recent
 three seconds of data.
 The buffer is used to synchronize the stations (the travel times over the
 WAN are higher for remote stations than for nearby stations) and to prevent
@@ -219,29 +218,24 @@ the fact that a celestial wave hits stations at different
 times~\cite[Sec.~2.1]{Romein:06}.
 
 The buffered data are sent to the compute nodes for further processing.
-These include the following tasks:
-\begin{itemize}
-\item	An all-to-all data exchange over the 3-D~torus: each input contains all
-	subbands (frequencies) from one station, while each output contains
-	one subband from all stations;
-\item	Filtering and a Fast Fourier Transform to split each subband into
-	narrow frequency channels;
-\item	A phase correction to fine-tune the observation direction;
-\item	A per-channel bandpass correction to flatten a ripple introduced by
-	a station filter;
-\item	Optionally beamform a group of stations (by weighted addition of their
-	samples) to form a more sensitive, virtual ``super station'';
-\item	Correlate the data (by multiplying the samples of all station pairs)
-	to filter out noise.
-\end{itemize}
-The signal-processing tasks are described in more detail
-elsewhere~\cite[Sec.~2]{Romein:06}.
+There, the data are first exchanged over the torus network, to collect pieces
+of data that can be processed independently.
+Then, the data are filtered and Fourier transformed to split each subband into
+narrow frequency channels.
+Next, a phase correction fine-tunes the observation direction.
+Then, a per-channel bandpass correction flattens a ripple introduced by
+a station filter.
+Optionally, a group of stations can be beam formed (by weighted addition of
+their samples) to form a more sensitive, virtual ``super station''.
+Finally, the data are correlated (by multiplying the samples of all station
+pairs) to filter out noise and integrated to reduce the amount of output.
 
 The compute node sends the correlated data back to the I/O~node.
 The I/O node optionally adds data from other compute nodes, and sends the
 result (asynchronously) to external systems that temporarily store the data
 on disk.
-After an observation, bad data (due to interference) are removed, and the
+After an observation has finished (not shown in Figure~\ref{fig:processing}),
+bad data (due to interference) are removed, and the
 resulting data are calibrated~\cite{Nijboer:07} and imaged.
 
 Two tasks are of particular interest to this paper: the transport of buffered
@@ -251,8 +245,8 @@ Moreover, each I/O~node also needs to receive 3.1~Gb/s of UDP data from the
 stations and send 1.2~Gb/s of TCP data to external systems, and are therefore
 already quite busy processing the data through the IP protocol stack.
 The two available mechanisms to communicate between the
-I/O~nodes and compute nodes (TCP and Unix domain sockets) do not even come
-close to these bandwidths (see Section~\ref{sec:performance}), and consume
+I/O~nodes and compute nodes (TCP and Unix domain sockets) do not provide
+sufficient bandwidth (see Section~\ref{sec:performance}) and consume
 too many CPU resources,
 necessitating the need for a light-weight protocol for internal communication.
 
@@ -319,14 +313,12 @@ Our system has 160~psets in total.
 
 The original BG/L design did not support the idea of running user applications
 on the I/O~nodes, but in earlier work~\cite{Iskra:08}, we showed that doing so 
-significantly improved performance and flexibility.
-We also saved on costs for additional hardware by doing part of the LOFAR
-processing on the I/O~nodes.
+significantly improved performance, flexibility, and costs.
 Unfortunately, this required major changes to the BG/L system
 software~\cite{Iskra:08,Boonstoppel:08}.
-In contrast, the BG/P fully supports user applications on I/O~nodes, but
+In contrast, the BG/P supports user applications on I/O~nodes, but
 the existing communication protocols between the compute nodes and the
-I/O~nodes did not provide sufficient bandwidth for the LOFAR application.
+I/O~nodes provided insufficient bandwidth for the LOFAR correlator.
 
 The tree uses bi-directional links at 6.8~Gb/s per direction.
 The network has a tree topology with a complex physical structure.
@@ -336,7 +328,7 @@ The processor cores of in-between nodes are not interrupted by the routing
 process.
 The hardware provides two separate virtual channels.
 One is used by CIOD;
-the other is typically only used on the compute nodes, by the MPI library for
+the other is typically only used on the compute nodes: by the MPI library for
 some of the collective MPI operations.
 However, the runtime environment can be changed so that MPI uses the
 3-D~torus for these collectives instead of the tree, leaving one of the virtual
@@ -350,10 +342,10 @@ instead of the tree.
 On the BG/L, the collective network was specifically designed to
 support collective operations.
 However, the BG/P is very well capable of doing collective operations
-using the torus, since the BG/P added DMA support for the torus
+using the torus, since the BG/P has DMA support for the torus
 (but not for the tree).
-Therefore, the free virtual channel can be used for FCNP, without
-significant performance penalty for collective operations.
+Therefore, FCNP can use the free virtual channel, without significantly
+slowing down collective operations.
 
 A processor can send and receive fixed-size packets over a virtual channel.
 A packet consists of a 4-byte header (used for routing) and 256 bytes payload.
@@ -387,12 +379,12 @@ and external systems.
 It is also possible to use TCP or Unix domain sockets internally,
 between an application on the compute nodes and an application on the I/O~node.
 However, the obtained bandwidth (see Section~\ref{sec:performance}) is
-insufficient for LOFAR.
+insufficient for the LOFAR correlator.
 Therefore, we developed a new protocol, \emph{Fast Collective Network
 Protocol\/} (FCNP), that uses the free virtual channel of the tree.
 
 Since one compute core is barely able to keep up with the link speed,
-any protocol overhead would decrease the obtained bandwidth.
+any heavy-weight protocol overhead would decrease the obtained bandwidth.
 FCNP reduces the protocol overhead to a single bit test in the normal case.
 FCNP distinguishes control packets (requests and acknowledgments) from data
 packets by setting the Irq (interrupt-on-receipt) bit in the header.
@@ -401,15 +393,15 @@ that do not contain any metadata in the payload part.
 
 \begin{figure}[h]
 \subfigure[write.]{
-  \includegraphics[width=.46\columnwidth]{FCNP-write.pdf}
+  \includegraphics[width=.46\columnwidth,height=35mm]{FCNP-write.pdf}
   \label{fig:FCNP-write}
 }
 \hfill
 \subfigure[read.]{
-  \includegraphics[width=.46\columnwidth]{FCNP-read.pdf}
+  \includegraphics[width=.46\columnwidth,height=35mm]{FCNP-read.pdf}
   \label{fig:FCNP-read}
 }
-\caption{The protocol.}
+\caption{The FCNP protocol.}
 \label{fig:FCNP-protocol}
 \end{figure}
 
@@ -448,7 +440,6 @@ struct RequestReplyPacket {
 \label{fig:control-packet-format}
 \end{figure}
 
-
 Figure~\ref{fig:control-packet-format} shows the format of request and reply
 packets.
 A request is a \emph{read}, \emph{write}, or a \emph{reset\/} packet
@@ -494,8 +485,7 @@ makes sure that only one read can be active at a given time.
 %Transferring ownership of the receive FIFO is complicated and prone to
 %race conditions.
 
-On the compute cores, we use extremely fast hardware mutexes to synchronize
-the cores.
+On the compute cores, we use fast hardware mutexes to synchronize the cores.
 On the I/O~nodes, the same hardware mutexes are physically present, but not
 exposed by the Linux kernel, so we use atomic instructions to implement spin
 locks.
@@ -504,7 +494,7 @@ the implementation writes up to eight consecutive packets before releasing
 and re-obtaining a lock.
 This way, the amortized locking overhead is negligible.
 
-FCNP strongly encourages 16-byte aligned data and the message sizes, but
+FCNP strongly encourages 16-byte aligned data and message sizes, but
 does not enforce this.
 Unaligned transfers are supported at the expense of a copy to an
 intermediate buffer.
@@ -659,6 +649,20 @@ DMA hardware would limit the use of asynchronous~I/O anyway.
 %required data rates.
 
 
+\begin{figure*}[t]
+\subfigure[I/O~node to compute node.]{
+  \includegraphics[width=.44\textwidth]{ion-to-cn-performance.pdf}
+  \label{fig:ion-to-cn-performance}
+}
+\hfill
+\subfigure[Compute node to I/O~node.]{
+  \includegraphics[width=.44\textwidth]{cn-to-ion-performance.pdf}
+  \label{fig:cn-to-ion-performance}
+}
+\caption{Measured bandwidths, as function of message size.}
+\label{fig:performance}
+\end{figure*}
+
 \section{Related Work}
 \label{sec:related-work}
 
@@ -681,7 +685,7 @@ ZOID is extensible: a small daemon on the I/O~node provides a basic function
 forwarding facility from the compute nodes to the I/O~nodes.
 On top of this, plug-ins (in the form of shared objects) that implement some
 functionality are loaded by the daemon and perform the real work.
-A standard plug-in is the \emph{Unix\/} plug-in that implements the standard
+A standard plug-in is the \emph{Unix\/} plug-in that implements the
 Unix I/O related system calls.
 On the compute nodes, the glibc library was adapted to replace the Unix
 system calls by stubs that forwards calls like \emph{socket()\/} and
@@ -710,9 +714,9 @@ application code on the I/O~nodes~\cite{Iskra:08}.
 
 ZOID was ported to the BG/P, but runs only with ZeptoOS kernels on the
 compute nodes, not with the CNK.
-Unfortunately, ZeptoOS did not yet support the 3-D~torus, which is of critical
+ZeptoOS did not yet support the 3-D~torus, which is of critical
 importance to the LOFAR application.
-Once ZeptoOS fully supports the torus, we may use it instead of CNK.
+Once ZeptoOS fully supports the torus, we might use it instead of CNK.
 
 A fundamental difference between ZOID and FCNP is that ZOID integrates system
 control and application I/O into the same process.
@@ -749,30 +753,16 @@ FCNP can separate control I/O from application I/O, without performance penalty.
 \section{Performance}
 \label{sec:performance}
 
-\begin{figure*}[t]
-\subfigure[I/O~node to compute node.]{
-  \includegraphics[width=.44\textwidth]{ion-to-cn-performance.pdf}
-  \label{fig:ion-to-cn-performance}
-}
-\hfill
-\subfigure[Compute node to I/O~node.]{
-  \includegraphics[width=.44\textwidth]{cn-to-ion-performance.pdf}
-  \label{fig:cn-to-ion-performance}
-}
-\caption{Measured bandwidths, as function of message size.}
-\label{fig:performance}
-\end{figure*}
-
 Figure~\ref{fig:performance} shows the measured bandwidths for FCNP (with
 interrupts enabled or disabled) and for TCP and Unix domain socket
 communication using CIOD.
 The benchmark communicates data between the I/O node and one of the compute
 nodes as fast as possible, using messages of various sizes.
-For large messages, FCNP approaches the link speed of the tree.
+For large messages, FCNP approaches the link speed.
 
-CIOD peaks at a bandwidth that is slightly higher than 2~Gb/s.
+CIOD peaks at a bandwidth that is slightly over 2~Gb/s.
 In theory, this equals the required LOFAR data rate, but we found that the
-bandwidth is not stable over long times, and definitely provides too little
+bandwidth is not stable over long times, and provides too little
 headroom for real-time processing.
 
 FCNP is significantly faster than CIOD, because its overhead is much lower.
@@ -785,7 +775,7 @@ The discontinuity in the curve for interrupt-driven FCNP in
 Figure~\ref{fig:ion-to-cn-performance} is caused by the fact that the polling
 thread polls the tree for 50\us after receiving a request packet, before
 suspending itself (see Section~\ref{sec:interrupts}).
-Messages of 64~KB or more take over 50\us to send.
+Messages of 64~KiB or more take over 50\us to send.
 Therefore, a request for a large message causes an interrupt that increases the
 latency.
 In contrast, requests for smaller messages will be received through polling.
@@ -825,11 +815,11 @@ core~0: Ethernet interrupts are handled there as well).
 
 In both directions, FCNP obtains a bandwidth of 6.54~Gb/s for large messages,
 hence the time per byte is 1.22\ns.
-This is as fast as a protocol-less benchmark achieves that simply sends packets
+This is as fast as a protocol-less benchmark that simply sends packets
 on one side and receives them on the other side, and thus the maximum that
 the hardware can practically achieve.
 
-Since reading and writing is done in separate threads on separate hardware,
+Since reading and writing is done in separate threads,
 this bandwidth can be achieved in both directions simultaneously.
 A separate benchmark confirmed this claim.
 
@@ -840,7 +830,7 @@ A separate benchmark confirmed this claim.
 
 \begin{figure}[h]
 \includegraphics[width=\columnwidth]{ionode-load.pdf}
-\caption{Performance breakdown for the LOFAR Application on the I/O~node.}
+\caption{Performance breakdown on the I/O~node.}
 \label{fig:ionode-load}
 \end{figure}
 
@@ -943,13 +933,14 @@ it was ever designed for.
 Chris Broekema, Jan David Mol, and Rob van Nieuwpoort made useful comments to
 a draft version of this paper.
 We thank Kamil Iskra and Kazutomo Yoshii from Argonne National Labs and 
-Todd Inglett, Tom Liebsch, and Andrew Tauferner from IBM for their support.
+Bruce Elmegreen, Todd Inglett, Tom Liebsch, and Andrew Tauferner from IBM for
+their support.
 
 LOFAR is funded by the Dutch government through the
 BSIK program for interdisciplinary research and
 improvement of the knowledge infrastructure. Additional
 funding is provided through the European Regional
-Development Fund (EFRO) and the innovation program
+Development Fund and the innovation program
 EZ/KOMPAS of the Collaboration of the Northern
 Provinces (SNN). ASTRON is part of the Netherlands
 Organization for Scientific Research, NWO.
diff --git a/doc/papers/2009/PDPTA-09/ion-to-cn-performance.jgr b/doc/papers/2009/PDPTA-09/ion-to-cn-performance.jgr
index 7cc2ecb1079e0a9ea65e567336d973210400c2c2..c84b25656c25e1e6a8dcf88b62cd4655fb67b6d2 100644
--- a/doc/papers/2009/PDPTA-09/ion-to-cn-performance.jgr
+++ b/doc/papers/2009/PDPTA-09/ion-to-cn-performance.jgr
@@ -41,7 +41,7 @@ newgraph
       : message size (bytes)
 
   yaxis
-    size 2.5
+    size 2.0
     hash_labels
       font Helvetica
     label
diff --git a/doc/papers/2009/PDPTA-09/pset.fig b/doc/papers/2009/PDPTA-09/pset.fig
index f9cc2cd4d1be10ab6ccd7deef60d102b95e87630..6fbb556f1e0ecf89cd542881eb9f9202e3d2ccc4 100644
--- a/doc/papers/2009/PDPTA-09/pset.fig
+++ b/doc/papers/2009/PDPTA-09/pset.fig
@@ -436,7 +436,7 @@ Single
 2 1 0 4 4 7 80 -1 -1 0.000 0 0 -1 0 0 2
 	 11565 4860 13860 4860
 2 1 0 4 26 7 80 -1 -1 0.000 0 0 -1 0 0 2
-	 5310 4275 2745 4275
+	 5310 4275 3690 4275
 2 1 0 3 0 7 80 -1 -1 0.000 0 0 -1 1 0 2
 	2 1 1.00 120.00 120.00
 	 6975 6030 7650 6030
@@ -450,7 +450,8 @@ Single
 	 13140 2340 13140 1800
 2 1 0 4 4 7 80 -1 -1 0.000 0 0 -1 0 0 2
 	 8415 2340 7650 2340
-4 2 0 80 -1 19 20 0.0000 4 315 3495 7650 6390 compute nodes (CNK)\001
-4 1 4 80 -1 17 14 0.2094 4 165 1905 6840 4185 collective network\001
-4 1 0 80 -1 19 20 0.0000 4 315 2415 5310 3780 I/O node (linux)\001
-4 1 26 80 -1 17 14 0.0000 4 165 1800 3645 4545 10 Gb/s Ethernet\001
+4 0 26 80 -1 17 16 0.0000 4 210 930 3690 4185 10 GbE\001
+4 1 4 80 -1 17 16 0.2094 4 210 1080 6750 3825 collective\001
+4 1 4 80 -1 17 16 0.2094 4 210 960 6840 4185 network\001
+4 1 0 80 -1 19 18 0.0000 4 270 2010 5310 4950 I/O node (linux)\001
+4 2 0 80 -1 19 18 0.0000 4 270 2895 7650 6390 compute nodes (CNK)\001
diff --git a/doc/papers/2009/PDPTA-09/station.jpg b/doc/papers/2009/PDPTA-09/station.jpg
index 80c1382a0b383847f3b3466cbc2645eb667a1c74..40354ca9db08c19b5e2c74a92c8cdc4ad549df32 100644
Binary files a/doc/papers/2009/PDPTA-09/station.jpg and b/doc/papers/2009/PDPTA-09/station.jpg differ