diff --git a/.gitattributes b/.gitattributes
index bab6d3e21ec83c6377cea1d5b274a44a5a2905bf..d9de3f84f0d0cd59544ee0ed3d8a4a2b7683f986 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1425,7 +1425,6 @@ doc/papers/2009/SC-09/sig-alternate.cls -text
 doc/papers/2009/SC-09/transpose.fig -text
 doc/papers/2010/SPM/IEEEbib.bst -text
 doc/papers/2010/SPM/Makefile -text
-doc/papers/2010/SPM/bio-nieuwpoort.txt -text
 doc/papers/2010/SPM/comments-for-reviewers.odt -text
 doc/papers/2010/SPM/copyright[!!-~]form[!!-~]IEEE[!!-~]SPM[!!-~]Nieuwpoort.pdf -text
 doc/papers/2010/SPM/cover-letter.odt -text
@@ -1439,6 +1438,20 @@ doc/papers/2010/SPM/figures/map.jpg -text
 doc/papers/2010/SPM/figures/pastedpic_11252008_163347.png -text
 doc/papers/2010/SPM/figures/performance-graph-v2.pdf -text
 doc/papers/2010/SPM/figures/performance-graph-v2.svg -text
+doc/papers/2010/SPM/final/IEEEbib.bst -text
+doc/papers/2010/SPM/final/Makefile -text
+doc/papers/2010/SPM/final/bio-nieuwpoort.txt -text
+doc/papers/2010/SPM/final/bio-romein.txt -text
+doc/papers/2010/SPM/final/copyright[!!-~]form[!!-~]IEEE[!!-~]SPM[!!-~]Nieuwpoort.pdf -text
+doc/papers/2010/SPM/final/figures/fig1.jpg -text
+doc/papers/2010/SPM/final/figures/fig1.pdf -text
+doc/papers/2010/SPM/final/figures/fig2.pdf -text
+doc/papers/2010/SPM/final/figures/fig3.pdf -text
+doc/papers/2010/SPM/final/figures/fig4.pdf -text
+doc/papers/2010/SPM/final/figures/fig5.pdf -text
+doc/papers/2010/SPM/final/spconf.sty -text
+doc/papers/2010/SPM/final/spm.bib -text
+doc/papers/2010/SPM/final/spm.tex -text
 doc/papers/2010/SPM/initial-submission/nieuwpoort-double-spaced.pdf -text
 doc/papers/2010/SPM/initial-submission/nieuwpoort-normal-spaced.pdf -text svneol=unset#unset
 doc/papers/2010/SPM/initial-submission/nieuwpoort.zip -text
diff --git a/doc/papers/2010/SPM/final/IEEEbib.bst b/doc/papers/2010/SPM/final/IEEEbib.bst
new file mode 100644
index 0000000000000000000000000000000000000000..f9bf3cca18df5226892678df3d5c5d01cdb23bc8
--- /dev/null
+++ b/doc/papers/2010/SPM/final/IEEEbib.bst
@@ -0,0 +1,1021 @@
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%  IEEE.bst  %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+% Bibliography Style file for articles according to IEEE instructions
+% balemi@aut.ee.ethz.ch     <22-JUN-93>
+% modified from unsrt.bib. Contributions by Richard H. Roy
+
+ENTRY
+  { address
+    author
+    booktitle
+    chapter
+    edition
+    editor
+    howpublished
+    institution
+    journal
+    key
+    month
+    note
+    number
+    organization
+    pages
+    publisher
+    school
+    series
+    title
+    type
+    volume
+    year
+  }
+  {}
+  { label }
+
+INTEGERS { output.state before.all mid.sentence after.sentence after.block }
+
+FUNCTION {init.state.consts}
+{ #0 'before.all :=
+  #1 'mid.sentence :=
+  #2 'after.sentence :=
+  #3 'after.block :=
+}
+
+STRINGS { s t }
+
+FUNCTION {output.nonnull}
+{ 's :=
+  output.state mid.sentence =
+    { ", " * write$ }
+    { output.state after.block =
+% next line commented out by rhr and changed to write comma
+%	{ add.period$ write$
+	{ ", " * write$ 
+	  newline$
+	  "\newblock " write$
+	}
+	{ output.state before.all =
+	    'write$
+	    { add.period$ " " * write$ }
+	  if$
+	}
+      if$
+      mid.sentence 'output.state :=
+    }
+  if$
+  s
+}
+
+FUNCTION {output}
+{ duplicate$ empty$
+    'pop$
+    'output.nonnull
+  if$
+}
+
+FUNCTION {output.check}
+{ 't :=
+  duplicate$ empty$
+    { pop$ "empty " t * " in " * cite$ * warning$ }
+    'output.nonnull
+  if$
+}
+
+FUNCTION {output.bibitem}
+{ newline$
+  "\bibitem{" write$
+  cite$ write$
+  "}" write$
+  newline$
+  ""
+  before.all 'output.state :=
+}
+
+FUNCTION {fin.entry}
+{ add.period$
+  write$
+  newline$
+}
+
+% 5/24/89 rhr
+%  modified fin.entry function - prints note field after body of entry  
+%FUNCTION {fin.entry}
+%{ add.period$
+%  note empty$
+%    'write$
+%    { "\par\bgroup\parindent=0em  " * annote * "\par\egroup " * write$
+%    }
+%  if$
+%  newline$
+%}
+
+FUNCTION {new.block}
+{ output.state before.all =
+    'skip$
+    { after.block 'output.state := }
+  if$
+}
+
+% new block without terminating last block with a comma
+FUNCTION {new.ncblock}
+{
+  write$ 
+  newline$
+  "\newblock "
+  before.all 'output.state :=
+}
+
+FUNCTION {new.nccont}
+{
+  write$ 
+  " "
+  before.all 'output.state :=
+}
+
+FUNCTION {new.sentence}
+{ output.state after.block =
+    'skip$
+    { output.state before.all =
+	'skip$
+	{ after.sentence 'output.state := }
+      if$
+    }
+  if$
+}
+
+FUNCTION {not}
+{   { #0 }
+    { #1 }
+  if$
+}
+
+FUNCTION {and}
+{   'skip$
+    { pop$ #0 }
+  if$
+}
+
+FUNCTION {or}
+{   { pop$ #1 }
+    'skip$
+  if$
+}
+
+FUNCTION {new.block.checka}
+{ empty$
+    'skip$
+    'new.block
+  if$
+}
+
+FUNCTION {new.block.checkb}
+{ empty$
+  swap$ empty$
+  and
+    'skip$
+    'new.block
+  if$
+}
+
+FUNCTION {new.sentence.checka}
+{ empty$
+    'skip$
+    'new.sentence
+  if$
+}
+
+FUNCTION {new.sentence.checkb}
+{ empty$
+  swap$ empty$
+  and
+    'skip$
+    'new.sentence
+  if$
+}
+
+FUNCTION {field.or.null}
+{ duplicate$ empty$
+    { pop$ "" }
+    'skip$
+  if$
+}
+
+FUNCTION {emphasize}
+{ duplicate$ empty$
+    { pop$ "" }
+    { "{\em " swap$ * "}" * }
+  if$
+}
+
+FUNCTION {boldface}
+{ duplicate$ empty$
+    { pop$ "" }
+    { "{\bf " swap$ * "}" * }
+  if$
+}
+
+%FUNCTION {boldface}
+%{ 's swap$ :=
+%  s "" =
+%    { "" }
+%    { "{\bf " s * "}" * }
+%  if$
+%}
+%
+INTEGERS { nameptr namesleft numnames }
+
+FUNCTION {format.names}
+{ 's :=
+  #1 'nameptr :=
+  s num.names$ 'numnames :=
+  numnames 'namesleft :=
+    { namesleft #0 > }
+    { s nameptr "{ff~}{vv~}{ll}{, jj}" format.name$ 't :=
+      nameptr #1 >
+	{ namesleft #1 >
+	    { ", " * t * }
+	    { numnames #2 >
+		{ "," * }
+		'skip$
+	      if$
+	      t "others" =
+		{ " et~al." * }
+		{ " and " * t * }
+	      if$
+	    }
+	  if$
+	}
+	't
+      if$
+      nameptr #1 + 'nameptr :=
+      namesleft #1 - 'namesleft :=
+    }
+  while$
+}
+
+FUNCTION {format.authors}
+{ author empty$
+    { "" }
+    { author format.names }
+  if$
+}
+
+FUNCTION {format.editors}
+{ editor empty$
+    { "" }
+    { editor format.names
+      editor num.names$ #1 >
+	{ ", Eds." * }
+	{ ", Ed." * }
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.title}
+{ title empty$
+    { "" }
+    { "``" title "t" change.case$ * }
+  if$
+}
+
+FUNCTION {n.dashify}
+{ 't :=
+  ""
+    { t empty$ not }
+    { t #1 #1 substring$ "-" =
+	{ t #1 #2 substring$ "--" = not
+	    { "--" *
+	      t #2 global.max$ substring$ 't :=
+	    }
+	    {   { t #1 #1 substring$ "-" = }
+		{ "-" *
+		  t #2 global.max$ substring$ 't :=
+		}
+	      while$
+	    }
+	  if$
+	}
+	{ t #1 #1 substring$ *
+	  t #2 global.max$ substring$ 't :=
+	}
+      if$
+    }
+  while$
+}
+
+FUNCTION {format.date}
+{ year empty$
+    { month empty$
+	{ "" }
+	{ "there's a month but no year in " cite$ * warning$
+	  month
+	}
+      if$
+    }
+    { month empty$
+	'year
+	{ month " " * year * }
+      if$
+    }
+  if$
+}
+
+% FUNCTION {format.date}
+% { year empty$
+% 	'year 
+% 	{ " "  year * }
+%   if$
+% }
+
+FUNCTION {format.btitle}
+{ title emphasize
+}
+
+FUNCTION {tie.or.space.connect}
+{ duplicate$ text.length$ #3 <
+    { "~" }
+    { " " }
+  if$
+  swap$ * *
+}
+
+FUNCTION {either.or.check}
+{ empty$
+    'pop$
+    { "can't use both " swap$ * " fields in " * cite$ * warning$ }
+  if$
+}
+
+FUNCTION {format.bvolume}
+{ volume empty$
+    { "" }
+    { "vol." volume tie.or.space.connect
+      series empty$
+	'skip$
+	{ " of " * series emphasize * }
+      if$
+      "volume and number" number either.or.check
+    }
+  if$
+}
+
+FUNCTION {format.number.series}
+{ volume empty$
+    { number empty$
+	{ series field.or.null }
+	{ output.state mid.sentence =
+	    { "number" }
+	    { "Number" }
+	  if$
+	  number tie.or.space.connect
+	  series empty$
+	    { "there's a number but no series in " cite$ * warning$ }
+	    { " in " * series * }
+	  if$
+	}
+      if$
+    }
+    { "" }
+  if$
+}
+
+FUNCTION {format.edition}
+{ edition empty$
+    { "" }
+    { output.state mid.sentence =
+	{ edition "l" change.case$ " edition" * }
+	{ edition "t" change.case$ " edition" * }
+      if$
+    }
+  if$
+}
+
+INTEGERS { multiresult }
+
+FUNCTION {multi.page.check}
+{ 't :=
+  #0 'multiresult :=
+    { multiresult not
+      t empty$ not
+      and
+    }
+    { t #1 #1 substring$
+      duplicate$ "-" =
+      swap$ duplicate$ "," =
+      swap$ "+" =
+      or or
+	{ #1 'multiresult := }
+	{ t #2 global.max$ substring$ 't := }
+      if$
+    }
+  while$
+  multiresult
+}
+
+FUNCTION {format.pages}
+{ pages empty$
+    { "" }
+    { pages multi.page.check
+	{ "pp." pages n.dashify tie.or.space.connect }
+	{ "p." pages tie.or.space.connect }
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.vol.num.pages}
+{ 
+volume empty$
+   {"" }
+   {"vol. " volume *}
+if$
+number empty$
+   'skip$
+   {", no. " number * *}
+if$
+pages empty$
+   'skip$
+    { duplicate$ empty$
+	{ pop$ format.pages }
+	{ ", pp. " * pages n.dashify * }
+      if$
+    }
+if$
+}
+
+%FUNCTION {format.vol.num.pages}
+%%boldface added 3/17/87 rhr
+%{ volume field.or.null boldface
+%  number empty$
+%    'skip$
+%    { "(" number * ")" * *
+%      volume empty$
+%	{ "there's a number but no volume in " cite$ * warning$ }
+%	'skip$
+%      if$
+%    }
+%  if$
+%  pages empty$
+%    'skip$
+%    { duplicate$ empty$
+%	{ pop$ format.pages }
+%	{ ":" * pages n.dashify * }
+%      if$
+%    }
+%  if$
+%}
+
+FUNCTION {format.chapter.pages}
+{ chapter empty$
+    'format.pages
+    { type empty$
+	{ "chapter" }
+	{ type "l" change.case$ }
+      if$
+      chapter tie.or.space.connect
+      pages empty$
+	'skip$
+	{ ", " * format.pages * }
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.in.ed.booktitle}
+{ booktitle empty$
+    { "" }
+    { editor empty$
+	{ "in " booktitle emphasize * }
+	{ "in "  booktitle emphasize *  ", " * format.editors * }
+      if$
+    }
+  if$
+}
+
+FUNCTION {empty.misc.check}
+{ author empty$ title empty$ howpublished empty$
+  month empty$ year empty$ note empty$
+  and and and and and
+    { "all relevant fields are empty in " cite$ * warning$ }
+    'skip$
+  if$
+}
+
+FUNCTION {format.thesis.type}
+{ type empty$
+    'skip$
+    { pop$
+      type "t" change.case$
+    }
+  if$
+}
+
+FUNCTION {format.tr.number}
+{ type empty$
+    { "Tech. {R}ep." }
+    'type
+  if$
+  number empty$
+    { "t" change.case$ }
+    { number tie.or.space.connect }
+  if$
+}
+
+FUNCTION {format.article.crossref}
+{ key empty$
+    { journal empty$
+	{ "need key or journal for " cite$ * " to crossref " * crossref *
+	  warning$
+	  ""
+	}
+	{ "In {\em " journal * "\/}" * }
+      if$
+    }
+    { "In " key * }
+  if$
+  " \cite{" * crossref * "}" *
+}
+
+FUNCTION {format.crossref.editor}
+{ editor #1 "{vv~}{ll}" format.name$
+  editor num.names$ duplicate$
+  #2 >
+    { pop$ " et~al." * }
+    { #2 <
+	'skip$
+	{ editor #2 "{ff }{vv }{ll}{ jj}" format.name$ "others" =
+	    { " et~al." * }
+	    { " and " * editor #2 "{vv~}{ll}" format.name$ * }
+	  if$
+	}
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.book.crossref}
+{ volume empty$
+    { "empty volume in " cite$ * "'s crossref of " * crossref * warning$
+      "In "
+    }
+    { "vol." volume tie.or.space.connect
+      " of " *
+    }
+  if$
+  editor empty$
+  editor field.or.null author field.or.null =
+  or
+    { key empty$
+	{ series empty$
+	    { "need editor, key, or series for " cite$ * " to crossref " *
+	      crossref * warning$
+	      "" *
+	    }
+	    { "{\em " * series * "\/}" * }
+	  if$
+	}
+	{ key * }
+      if$
+    }
+    { format.crossref.editor * }
+  if$
+  " \cite{" * crossref * "}" *
+}
+
+FUNCTION {format.incoll.inproc.crossref}
+{ editor empty$
+  editor field.or.null author field.or.null =
+  or
+    { key empty$
+	{ booktitle empty$
+	    { "need editor, key, or booktitle for " cite$ * " to crossref " *
+	      crossref * warning$
+	      ""
+	    }
+	    { "In {\em " booktitle * "\/}" * }
+	  if$
+	}
+	{ "In " key * }
+      if$
+    }
+    { "In " format.crossref.editor * }
+  if$
+  " \cite{" * crossref * "}" *
+}
+
+FUNCTION {article}
+{ output.bibitem
+  format.authors "author" output.check
+  new.block
+  format.title ",''" * "title" output.check
+  new.ncblock
+  crossref missing$
+    { journal emphasize "journal" output.check
+      format.vol.num.pages output
+      format.date "year" output.check
+    }
+    { format.article.crossref output.nonnull
+      format.pages output
+    }
+  if$
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {book}
+{ output.bibitem
+  author empty$
+    { format.editors "author and editor" output.check }
+    { format.authors output.nonnull
+      crossref missing$
+	{ "author and editor" editor either.or.check }
+	'skip$
+      if$
+    }
+  if$
+  new.block
+  format.btitle "title" output.check
+  crossref missing$
+    { format.bvolume output
+      new.block
+      format.number.series output
+      new.sentence
+      publisher "publisher" output.check
+      address output
+    }
+    { new.block
+      format.book.crossref output.nonnull
+    }
+  if$
+  format.edition output
+  format.date "year" output.check
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {booklet}
+{ output.bibitem
+  format.authors output
+  new.block
+  format.title ",''" * "title" output.check
+  new.nccont
+  howpublished address new.block.checkb
+  howpublished output
+  address output
+  format.date output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {inbook}
+{ output.bibitem
+  author empty$
+    { format.editors "author and editor" output.check }
+    { format.authors output.nonnull
+      crossref missing$
+	{ "author and editor" editor either.or.check }
+	'skip$
+      if$
+    }
+  if$
+  new.block
+  format.btitle "title" output.check
+  crossref missing$
+    { format.bvolume output
+      format.chapter.pages "chapter and pages" output.check
+      new.block
+      format.number.series output
+      new.sentence
+      publisher "publisher" output.check
+      address output
+    }
+    { format.chapter.pages "chapter and pages" output.check
+      new.block
+      format.book.crossref output.nonnull
+    }
+  if$
+  format.edition output
+  format.date "year" output.check
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {incollection}
+{ output.bibitem
+  format.authors "author" output.check
+  new.block
+  format.title ",''" * "title" output.check
+  new.ncblock
+  crossref missing$
+    { format.in.ed.booktitle "booktitle" output.check
+      format.bvolume output
+      format.number.series output
+      format.chapter.pages output
+      new.sentence
+      publisher "publisher" output.check
+      address output
+      format.edition output
+      format.date "year" output.check
+    }
+    { format.incoll.inproc.crossref output.nonnull
+      format.chapter.pages output
+    }
+  if$
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {inproceedings}
+{ output.bibitem
+  format.authors "author" output.check
+  new.block
+  format.title ",''" * "title" output.check
+  new.ncblock
+  crossref missing$
+    { format.in.ed.booktitle "booktitle" output.check
+      address empty$
+	{ organization publisher new.sentence.checkb
+	  organization output
+	  format.date "year" output.check
+	}
+	{ address output.nonnull
+	  format.date "year" output.check
+	  organization output
+	}
+      if$
+      format.bvolume output
+      format.number.series output
+      format.pages output
+      publisher output
+    }
+    { format.incoll.inproc.crossref output.nonnull
+      format.pages output
+    }
+  if$
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {conference} { inproceedings }
+
+FUNCTION {manual}
+{ output.bibitem
+  author empty$
+    { organization empty$
+	'skip$
+	{ organization output.nonnull
+	  address output
+	}
+      if$
+    }
+    { format.authors output.nonnull }
+  if$
+  new.block
+  format.btitle "title" output.check
+  author empty$
+    { organization empty$
+	{ address new.block.checka
+	  address output
+	}
+	'skip$
+      if$
+    }
+    { organization address new.block.checkb
+      organization output
+      address output
+    }
+  if$
+  format.edition output
+  format.date output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {mastersthesis}
+{ output.bibitem
+  format.authors "author" output.check
+  new.block
+  format.title ",''" * "title" output.check
+  new.ncblock
+  "M.S. thesis" format.thesis.type output.nonnull
+  school "school" output.check
+  address output
+  format.date "year" output.check
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {misc}
+{ output.bibitem
+  format.authors output
+  title howpublished new.block.checkb
+  format.title ",''" * output
+  new.nccont
+  howpublished new.block.checka
+  howpublished output
+  format.date output
+  new.block
+  note output
+  fin.entry
+  empty.misc.check
+}
+
+FUNCTION {phdthesis}
+{ output.bibitem
+  format.authors "author" output.check
+  new.block
+  format.btitle "title" output.check
+  new.block
+  "Ph.D. thesis" format.thesis.type output.nonnull
+  school "school" output.check
+  address output
+  format.date "year" output.check
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {proceedings}
+{ output.bibitem
+  editor empty$
+    { organization output }
+    { format.editors output.nonnull }
+  if$
+  new.block
+  format.btitle "title" output.check
+  format.bvolume output
+  format.number.series output
+  address empty$
+    { editor empty$
+	{ publisher new.sentence.checka }
+	{ organization publisher new.sentence.checkb
+	  organization output
+	}
+      if$
+      publisher output
+      format.date "year" output.check
+    }
+    { address output.nonnull
+      format.date "year" output.check
+      new.sentence
+      editor empty$
+	'skip$
+	{ organization output }
+      if$
+      publisher output
+    }
+  if$
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {techreport}
+{ output.bibitem
+  format.authors "author" output.check
+  new.block
+  format.title ",''" * "title" output.check
+  new.ncblock
+  format.tr.number output.nonnull
+  institution "institution" output.check
+  address output
+  format.date "year" output.check
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {unpublished}
+{ output.bibitem
+  format.authors "author" output.check
+  new.block
+  format.title ",''" * "title" output.check
+  new.ncblock
+  note "note" output.check
+  format.date output
+  fin.entry
+}
+
+FUNCTION {default.type} { misc }
+
+MACRO {jan} {"Jan."}
+
+MACRO {feb} {"Feb."}
+
+MACRO {mar} {"Mar."}
+
+MACRO {apr} {"Apr."}
+
+MACRO {may} {"May"}
+
+MACRO {jun} {"June"}
+
+MACRO {jul} {"July"}
+
+MACRO {aug} {"Aug."}
+
+MACRO {sep} {"Sept."}
+
+MACRO {oct} {"Oct."}
+
+MACRO {nov} {"Nov."}
+
+MACRO {dec} {"Dec."}
+
+MACRO {acmcs} {"ACM Computing Surveys"}
+
+MACRO {acta} {"Acta Informatica"}
+
+MACRO {cacm} {"Communications of the ACM"}
+
+MACRO {ibmjrd} {"IBM Journal of Research and Development"}
+
+MACRO {ibmsj} {"IBM Systems Journal"}
+
+MACRO {ieeese} {"IEEE Transactions on Software Engineering"}
+
+MACRO {ieeetc} {"IEEE Transactions on Computers"}
+
+MACRO {ieeetcad}
+ {"IEEE Transactions on Computer-Aided Design of Integrated Circuits"}
+
+MACRO {ipl} {"Information Processing Letters"}
+
+MACRO {jacm} {"Journal of the ACM"}
+
+MACRO {jcss} {"Journal of Computer and System Sciences"}
+
+MACRO {scp} {"Science of Computer Programming"}
+
+MACRO {sicomp} {"SIAM Journal on Computing"}
+
+MACRO {tocs} {"ACM Transactions on Computer Systems"}
+
+MACRO {tods} {"ACM Transactions on Database Systems"}
+
+MACRO {tog} {"ACM Transactions on Graphics"}
+
+MACRO {toms} {"ACM Transactions on Mathematical Software"}
+
+MACRO {toois} {"ACM Transactions on Office Information Systems"}
+
+MACRO {toplas} {"ACM Transactions on Programming Languages and Systems"}
+
+MACRO {tcs} {"Theoretical Computer Science"}
+
+READ
+
+STRINGS { longest.label }
+
+INTEGERS { number.label longest.label.width }
+
+FUNCTION {initialize.longest.label}
+{ "" 'longest.label :=
+  #1 'number.label :=
+  #0 'longest.label.width :=
+}
+
+FUNCTION {longest.label.pass}
+{ number.label int.to.str$ 'label :=
+  number.label #1 + 'number.label :=
+  label width$ longest.label.width >
+    { label 'longest.label :=
+      label width$ 'longest.label.width :=
+    }
+    'skip$
+  if$
+}
+
+EXECUTE {initialize.longest.label}
+
+ITERATE {longest.label.pass}
+
+FUNCTION {begin.bib}
+{ preamble$ empty$
+    'skip$
+    { preamble$ write$ newline$ }
+  if$
+  "\begin{thebibliography}{"  longest.label  * "}" * write$ newline$
+}
+
+EXECUTE {begin.bib}
+
+EXECUTE {init.state.consts}
+
+ITERATE {call.type$}
+
+FUNCTION {end.bib}
+{ newline$
+  "\end{thebibliography}" write$ newline$
+}
+
+EXECUTE {end.bib}
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%% End of IEEE.bst %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
diff --git a/doc/papers/2010/SPM/final/Makefile b/doc/papers/2010/SPM/final/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..eba742c283bdfd3814cd1a1a939a0f3838ff7e26
--- /dev/null
+++ b/doc/papers/2010/SPM/final/Makefile
@@ -0,0 +1,51 @@
+TEX_SOURCES =	spm.tex
+
+BIB_SOURCES =	spm.bib
+
+FIG_SOURCES =	
+
+JGR_SOURCES =	
+
+JPG_SOURCES =	
+
+PNG_SOURCES =	
+
+STY_SOURCES =	
+
+AUX_FILES =	$(TEX_SOURCES:%.tex=%.aux)
+FIGURES =	$(FIG_SOURCES:%.fig=%.pdf) $(JGR_SOURCES:%.jgr=%.pdf)
+
+GEN_EXT =	bbl blg dvi idx ilg ind lof log lot ps toc ps_pages
+GEN_FILES =	$(AUX_FILES) $(FIGURES) spm.pdf $(GEN_EXT:%=spm.%)\
+		mfput.log missfont.log texput.log
+
+TEXINPUTS =	inputs:.:
+TEXFONTS =	:
+
+%.pdf:		%.jgr
+		jgraph $< | epstopdf --filter > $@
+
+%.pdf:		%.fig
+		fig2dev -L pdf $< $@
+
+spm.pdf:	$(TEX_SOURCES) $(STY_SOURCES) $(BIB_SOURCES) $(FIGURES)
+		TEXINPUTS=$(TEXINPUTS) TEXFONTS=$(TEXFONTS) pdflatex spm
+		bibtex spm
+		TEXINPUTS=$(TEXINPUTS) TEXFONTS=$(TEXFONTS) pdflatex spm
+		TEXINPUTS=$(TEXINPUTS) TEXFONTS=$(TEXFONTS) pdflatex spm
+
+spm.ps:		spm.pdf
+		#pdftops -paper letter spm.pdf
+		pdftops -paper A4 spm.pdf
+
+evince::	spm.pdf
+		evince $<
+
+xpdf::		spm.pdf
+		xpdf -g 900x1200 $<
+
+lpr::		spm.ps
+		lpr $<
+
+clean::
+		rm -f $(GEN_FILES)
diff --git a/doc/papers/2010/SPM/bio-nieuwpoort.txt b/doc/papers/2010/SPM/final/bio-nieuwpoort.txt
similarity index 100%
rename from doc/papers/2010/SPM/bio-nieuwpoort.txt
rename to doc/papers/2010/SPM/final/bio-nieuwpoort.txt
diff --git a/doc/papers/2010/SPM/final/bio-romein.txt b/doc/papers/2010/SPM/final/bio-romein.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f5896a279abffb59ed94c0daafa13416314da3c7
--- /dev/null
+++ b/doc/papers/2010/SPM/final/bio-romein.txt
@@ -0,0 +1,9 @@
+John W. Romein is a senior system researcher in high-performance
+computing at ASTRON, where he is responsible for the central,
+real-time data processing of LOFAR.  He obtained his Ph.D. on
+distributed search algorithms for board-game playing at the Vrije
+Universiteit, Amsterdam.  As a postdoc, he solved the game of Awari
+using a large computer cluster, and did research on parallel
+algorithms for bio-informatics.  His research interests include
+high-performance computing, parallel algorithms, networks, programming
+languages, and compiler construction.
diff --git a/doc/papers/2010/SPM/final/copyright form IEEE SPM Nieuwpoort.pdf b/doc/papers/2010/SPM/final/copyright form IEEE SPM Nieuwpoort.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..3f51a9435035f550ed704215110de1cfdb7ef6ae
Binary files /dev/null and b/doc/papers/2010/SPM/final/copyright form IEEE SPM Nieuwpoort.pdf differ
diff --git a/doc/papers/2010/SPM/final/figures/fig1.jpg b/doc/papers/2010/SPM/final/figures/fig1.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..40354ca9db08c19b5e2c74a92c8cdc4ad549df32
Binary files /dev/null and b/doc/papers/2010/SPM/final/figures/fig1.jpg differ
diff --git a/doc/papers/2010/SPM/final/figures/fig1.pdf b/doc/papers/2010/SPM/final/figures/fig1.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..21178983fb1b0e29ae72780cdd4191f94fafe91d
Binary files /dev/null and b/doc/papers/2010/SPM/final/figures/fig1.pdf differ
diff --git a/doc/papers/2010/SPM/final/figures/fig2.pdf b/doc/papers/2010/SPM/final/figures/fig2.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..1216bb63ecb001dad62f1172f7dcf4dc54eae6cd
Binary files /dev/null and b/doc/papers/2010/SPM/final/figures/fig2.pdf differ
diff --git a/doc/papers/2010/SPM/final/figures/fig3.pdf b/doc/papers/2010/SPM/final/figures/fig3.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..9d77949fe3c7ad8aae50cea18749e81065affc05
Binary files /dev/null and b/doc/papers/2010/SPM/final/figures/fig3.pdf differ
diff --git a/doc/papers/2010/SPM/final/figures/fig4.pdf b/doc/papers/2010/SPM/final/figures/fig4.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..72721e6fd0f263fa4515e0c9fee7fc035c9ce1f8
Binary files /dev/null and b/doc/papers/2010/SPM/final/figures/fig4.pdf differ
diff --git a/doc/papers/2010/SPM/final/figures/fig5.pdf b/doc/papers/2010/SPM/final/figures/fig5.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..88b61b18db58727c0b80a43587a01043219fff8a
Binary files /dev/null and b/doc/papers/2010/SPM/final/figures/fig5.pdf differ
diff --git a/doc/papers/2010/SPM/final/spconf.sty b/doc/papers/2010/SPM/final/spconf.sty
new file mode 100644
index 0000000000000000000000000000000000000000..02322c3162335e1c44b85b2d363ad4c938aab044
--- /dev/null
+++ b/doc/papers/2010/SPM/final/spconf.sty
@@ -0,0 +1,252 @@
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%
+% File:     spconf.sty          (LaTeX Document style option "spconf")
+%
+% Usage:    \documentclass{article}
+%           \usepackage{spconf}
+%
+%           Or for LaTeX 2.09:
+% Usage:    \documentstyle[...,spconf,...]{article}
+%
+% Purpose:
+%
+% Style file for Signal Processing Society Conferences (ICASSP, ICIP).
+% Features:
+%    - correct page size (175mm x 226mm)
+%    - twocolumn format
+%    - boldfaced, numbered, and centered section headings
+%    - correct subsection and subsubsection headings
+%    - use \title{xx} for title, will be typeset all uppercase
+%    - use \name{xx} for author name(s) only, will be typeset in italics
+%    - use \address{xx} for one address of all authors
+%    - use \twoauthors{author1}{address1}{author2}{address2}
+%         for two (or more) authors with two separate addresses
+%    - note: no need for \author nor \date
+%    - optional: can use \thanks{xx} within \name or \twoauthors,
+%         asterisk is not printed after name nor in footnote
+%    - optional: can use \sthanks{xx} after each name within \name or
+%         \twoauthors if different thanks for each author,
+%         footnote symbol will appear for each name and footnote
+%    - optional: use \ninept to typeset text in 9 pt; default is 10pt.
+%
+% Example of use for one or more authors at a common address and
+%    common support. For distinct support acknowledgments,
+%    use \sthanks{xx} after each name.
+%
+%                 \documentclass{article}
+%                 \usepackage{spconf}
+%                 \title{Title of the paper}
+%                 \name{George P. Burdell and John Q. Professor
+%                       \thanks{This work was supported by...}}
+%                 \address{Common address, department \\
+%                          City, etc \\
+%                          optional e-mail address}
+%
+%                 \begin{document}
+%  OPTIONAL -->   \ninept            <-- OPTIONAL, for nine pt only
+%                 \maketitle
+%                 \begin{abstract}
+%                 This is the abstract for my paper.
+%                 \end{abstract}
+%                         .
+%                 Insert text of paper
+%                         .
+%                 \end{document}
+%
+% Example of use for two authors at two distinct addresses with only
+%    one support acknowledgment. For distinct support acknowledgments,
+%    use \sthanks{xx} after each name.
+%
+%                 \documentclass{article}
+%                 \usepackage{spconf}
+%                 \title{Title of the paper}
+%                 \twoauthors{John Doe
+%                       \thanks{This work was supported by...}}
+%                            {Doe's address, department \\
+%                             City, etc \\
+%                             optional e-mail address}
+%                            {Judy Smith}
+%                            {Smith's address, department \\
+%                             City, etc \\
+%                             optional e-mail address}
+%
+%                 \begin{document}
+%  OPTIONAL -->   \ninept            <-- OPTIONAL, for nine pt only
+%                 \maketitle
+%                 \begin{abstract}
+%                 This is the abstract for my paper.
+%                 \end{abstract}
+%                         .
+%                 Insert text of paper
+%                         .
+%                 \end{document}
+%
+% Preprint Option (Only for preprints, not for submissions!):
+%    - can create a preprint titlepage footer by using the
+%         "preprint" option with the \usepackage{spconf} command
+%    - use \copyrightnotice{\copyright xx} for copyright information
+%    - use \toappear{To appear in xx} for publication name
+% Example of preprint use:
+%
+%                 \documentclass{article}
+%                 \usepackage[preprint]{spconf}
+%                         .
+%                 \copyrightnotice{\copyright\ IEEE 2000}
+%                 \toappear{To appear in {\it Proc.\ ICASSP2000,
+%                    June 05-09, 2000, Istanbul, Turkey}}
+%
+%
+% PLEASE REPORT ANY BUGS
+%
+% Author:  Stephen Martucci  -- stephen.martucci@ieee.org
+%
+% Date:    3 May 2000
+%
+% Updated: Lance Cotton, Ulf-Dietrich Braumann, 11 May 2006
+% Change:  Added keywords/Index Terms section
+% Change:  Added \emergencystretch=11pt, Lance Cotton, 26-Sept-2007
+%
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+% These commands change default text fonts to the scalable PostScript
+% fonts Times, Helvetica, and Courier. However, they do not change
+% the default math fonts. After conversion to PDF, text will look good
+% at any scale but math symbols and equations may not.
+% If instead you use the PostScript Type 1 implementation of the
+% Computer Modern fonts from the American Mathematical Society, which
+% will make all fonts (text and math) scalable, comment out the
+% following three lines. Those fonts use the same metrics as the Knuth
+% Computer Modern fonts and therefore no font redefinition is needed.
+\renewcommand{\sfdefault}{phv}
+\renewcommand{\rmdefault}{ptm}
+\renewcommand{\ttdefault}{pcr}
+
+%\oddsidemargin  -0.31in
+%\evensidemargin -0.31in
+\oddsidemargin  -6.2truemm
+\evensidemargin -6.2truemm
+
+\topmargin 0truept
+\headheight 0truept
+\headsep 0truept
+%\footheight 0truept
+%\footskip 0truept
+\textheight 229truemm
+\textwidth 178truemm
+
+\twocolumn
+\columnsep 6truemm
+\pagestyle{empty}
+
+\emergencystretch=11pt
+
+\def\ninept{\def\baselinestretch{.95}\let\normalsize\small\normalsize}
+
+\def\maketitle{\par
+ \begingroup
+ \def\thefootnote{}
+ \def\@makefnmark{\hbox
+ {$^{\@thefnmark}$\hss}}
+ \if@twocolumn
+ \twocolumn[\@maketitle]
+ \else \newpage
+ \global\@topnum\z@ \@maketitle \fi\@thanks
+ \endgroup
+ \setcounter{footnote}{0}
+ \let\maketitle\relax
+ \let\@maketitle\relax
+ \gdef\thefootnote{\arabic{footnote}}\gdef\@@savethanks{}%
+ \gdef\@thanks{}\gdef\@author{}\gdef\@title{}\let\thanks\relax}
+
+\def\@maketitle{\newpage
+ \null
+ \vskip 2em \begin{center}
+ {\large \bf \@title \par} \vskip 1.5em {\large \lineskip .5em
+\begin{tabular}[t]{c}\@name \\ \@address
+ \end{tabular}\par} \end{center}
+ \par
+ \vskip 1.5em}
+
+\def\title#1{\gdef\@title{\uppercase{#1}}}
+\def\name#1{\gdef\@name{{\em #1}\\}}
+\def\address#1{\gdef\@address{#1}}
+\gdef\@title{\uppercase{title of paper}}
+\gdef\@name{{\em Name of author}\\}
+\gdef\@address{Address - Line 1 \\
+               Address - Line 2 \\
+               Address - Line 3}
+
+\let\@@savethanks\thanks
+\def\thanks#1{\gdef\thefootnote{}\@@savethanks{#1}}
+\def\sthanks#1{\gdef\thefootnote{\fnsymbol{footnote}}\@@savethanks{#1}}
+
+\def\twoauthors#1#2#3#4{\gdef\@address{}
+   \gdef\@name{\begin{tabular}{@{}c@{}}
+        {\em #1} \\ \\
+        #2\relax
+   \end{tabular}\hskip 1in\begin{tabular}{@{}c@{}}
+        {\em #3} \\ \\
+        #4\relax
+\end{tabular}}}
+
+\def\@sect#1#2#3#4#5#6[#7]#8{
+   \refstepcounter{#1}\edef\@svsec{\csname the#1\endcsname.\hskip 0.6em}
+       \begingroup \ifnum #2=1\bf\centering
+          {\interlinepenalty \@M
+             \@svsec\uppercase{#8}\par}\else\ifnum #2=2\bf
+          \noindent{\interlinepenalty \@M \@svsec #8\par}\else\it
+          \noindent{\interlinepenalty \@M
+             \@svsec #8\par}\fi\fi\endgroup
+       \csname #1mark\endcsname{#7}\addcontentsline
+         {toc}{#1}{\protect\numberline{\csname the#1\endcsname} #7}
+     \@tempskipa #5\relax
+     \@xsect{\@tempskipa}}
+
+\def\abstract{\begin{center}
+{\bf ABSTRACT\vspace{-.5em}\vspace{0pt}}
+\end{center}}
+\def\endabstract{\par}
+
+% Keyword section, added by Lance Cotton, adapted from IEEEtrans, corrected by Ulf-Dietrich Braumann
+\def\keywords{\vspace{.5em}
+{\bfseries\textit{Index Terms}---\,\relax%
+}}
+\def\endkeywords{\par} 
+
+\def\copyrightnotice#1{\gdef\@copyrightnotice{#1}}
+\let\@copyrightnotice\relax
+\def\toappear#1{\gdef\@toappear{#1}}\let\@toappear\relax
+
+\newif\if@preprint\@preprintfalse
+\@namedef{ds@preprint}{\global\@preprinttrue}
+\@options
+\def\ps@preprint{\def\mypage{}\let\@mkboth\@gobbletwo\def\@oddhead{}
+  \def\@oddfoot{\rlap{\@toappear}\hfil\mypage\hfil
+    \llap{\@copyrightnotice}
+    \gdef\mypage{\thepage}\gdef\@toappear{}\gdef\@copyrightnotice{}}}
+
+\if@preprint\ps@preprint
+\else\ps@empty\flushbottom\fi
+
+\def\thebibliography#1{\section{References}\list
+ {[\arabic{enumi}]}{\settowidth\labelwidth{[#1]}\leftmargin\labelwidth
+ \advance\leftmargin\labelsep
+ \usecounter{enumi}}
+ \def\newblock{\hskip .11em plus .33em minus .07em}
+ \sloppy\clubpenalty4000\widowpenalty4000
+ \sfcode`\.=1000\relax}
+\let\endthebibliography=\endlist
+
+\long\def\@makecaption#1#2{
+ \vskip 10pt
+ \setbox\@tempboxa\hbox{#1. #2}
+ \ifdim \wd\@tempboxa >\hsize #1. #2\par \else \hbox
+to\hsize{\hfil\box\@tempboxa\hfil}
+ \fi}
+
+\def\fnum@figure{{\bf Fig.\ \thefigure}}
+\def\fnum@table{{\bf Table \thetable}}
+
+\flushbottom
+
+%%%% EOF
diff --git a/doc/papers/2010/SPM/final/spm.bib b/doc/papers/2010/SPM/final/spm.bib
new file mode 100644
index 0000000000000000000000000000000000000000..846f18044df8df9082f3ac57b143250d8a809ac2
--- /dev/null
+++ b/doc/papers/2010/SPM/final/spm.bib
@@ -0,0 +1,3444 @@
+@string
+{
+    AAAI	= {AAAI National Conference}
+}
+
+@string
+{
+    ACM		= {ACM Annual Conference}
+}
+
+@string
+{
+    ACMCS	= {ACM Computing Surveys}
+}
+
+@string
+{
+    ADASS	= {Astronomical Data Analysis Software and Systems}
+}
+
+@string
+{
+    AI		= {Artificial Intelligence}
+}
+
+@string
+{
+    AIM		= {AI Magazine}
+}
+
+@string
+{
+    ACC		= {Advances in Computer Chess}
+}
+
+@string
+{
+    ASP   	= {Astronomical Society of the Pacific}
+}
+
+
+@string
+{
+    ASPLOS	= {Architectural Support for Programming Languages and Operating Systems}
+}
+
+@string
+{
+    BIOINF	= {Bioinformatics}
+}
+
+@string
+{
+    CABIOS	= {Computer Applications in the Biosciences}
+}
+
+@string
+{
+    CACM	= {Communications of the ACM}
+}
+
+@string
+{
+    CCAI	= {Canadian Conference on Artificial Intelligence}
+}
+
+@string
+{
+    CCGRID	= {IEEE International Symposium on Cluster Computing and the Grid (CCGRID)}
+}
+
+@string
+{
+    CI		= {Computational Intelligence}
+}
+
+@string
+{
+    COMPUTER	= {IEEE Computer}
+}
+
+@string
+{
+    COSB	= {Current Opinion in Structural Biology}
+}
+
+@string
+{
+    DC		= {Distributed Computing}
+}
+
+@string
+{
+    HPCA	= {International Symposium On High Performance Computer Architecture}
+}
+
+@string
+{
+    HPDC	= {International Symposium on High Performance Distributed Computing}
+}
+
+@string
+{
+    ICCA	= {Journal of the International Computer Chess Association}
+}
+
+@string
+{
+    ICGA	= {Journal of the International Computer Games Association}
+}
+
+@string
+{
+    ICDCS	= {International Conference on Distributed Computing Systems}
+}
+
+@string
+{
+    ICPP	= {International Conference on Parallel Processing}
+}
+
+@string
+{
+    ICS		= {ACM International Conference on Supercomputing}
+}
+
+@string
+{
+    ICSPC	= {IEEE International Conference on Signal Processing and Communications}
+}
+
+@string
+{
+    IEEE	= {Proceedings of the IEEE}
+}
+
+@string
+{
+    IJCAI	= {International Joint Conference on Artificial Intelligence}
+}
+
+@string
+{
+    IJIS	= {International Journal of Intelligent Systems}
+}
+
+@string
+{
+    IJRD	= {IBM Journal of Research and Development}
+}
+
+@string
+{
+    IPL		= {Information Processing Letters}
+}
+
+@string
+{
+    IPPS	= {International Parallel Processing Symposium}
+}
+
+@string
+{
+    IS		= {Information Sciences}
+}
+
+@string
+{
+    ISCA	= {International Symposium on Computer Architecture}
+}
+
+@string
+{
+    JPDC	= {Journal of Parallel and Distributed Computing}
+}
+
+@string
+{
+    JMB		= {Journal of Molecular Biology}
+}
+
+@string
+{
+    LFU		= {The Low-Frequency Universe}
+}
+
+@string
+{
+    LNCS	= {Lecture Notes in Computer Science}
+}
+
+@string
+{
+    MICRO	= {IEEE Micro}
+}
+
+@string
+{
+    NAR		= {Nucleic Acids Research}
+}
+
+@string
+{
+    NRC		= {NRC Handelsblad}
+}
+
+@string
+{
+    OSDI	= {Operating System Design and Implementation}
+}
+
+@string
+{
+    PAMI	= {IEEE Transactions on Pattern Analysis and Machine Intelligence}
+}
+
+@string
+{
+    PAMIV	= {Parallel Algorithms for Machine Intelligence and Vision}
+}
+
+@string
+{
+    PDC		= {Principles of Distributed Computing}
+}
+
+@string
+{
+    PDPTA	= {Parallel and Distributed Processing Techniques and Applications}
+}
+
+@string
+{
+    PLDI	= {ACM SIGPLAN Conference on Programming Language Design and Implementation}
+}
+
+@string
+{
+    PPOPP	= {ACM SIGPLAN Symposium on Principles and Practice on Parallel Programming}
+}
+
+@string
+{
+    PSFG	= {Proteins: Structure, Function, and Genetics}
+}
+
+@string
+{
+    SC		= {Supercomputing}
+}
+
+@string
+{
+    SOSP	= {Symposium on Operating System Principles}
+}
+
+@string
+{
+    SPAA	= {ACM Symposium on Parallel Algorithms and Architectures}
+}
+
+@string
+{
+    SPE		= {Software --- Practice and Experience}
+}
+
+@string
+{
+    SPDP	= {Symposium on Parallel and Distributed Processing}
+}
+
+@string
+{
+    TIBS	= {Trends in Biochemical Sciences}
+}
+
+@string
+{
+    TOCS	= {ACM Transactions on Computer Systems}
+}
+
+@string
+{
+    TOPLAS	= {ACM Transactions on Programming Languages and Systems}
+}
+
+@string
+{
+    TPDS	= {IEEE Transactions on Parallel and Distributed Systems}
+}
+
+@string
+{
+    VK		= {De Volkskrant}
+}
+
+
+@book
+{
+		  Aho:86,
+    title	= {{Compilers: Principles, Techniques, and Tools}},
+    author	= {A.V. Aho and R. Sethi and J.D. Ullman},
+    isbn	= {0-201-10088-6},
+    publisher	= {Addison-Wesley},
+    year	= {1986}
+}
+
+@inproceedings
+{
+		  Akl:77,
+    title	= {{The Principal Continuation and the Killer Heuristic}},
+    author	= {S.G. Akl and M.M. Newborn},
+    booktitle	= ACM,
+    pages	= {466--473},
+    year	= {1977}
+}
+
+@inproceedings
+{
+		  Alexandrov:95,
+    title	= {{LogGP: Incorporating Long Messages into the LogP Model --- One Step Closer Towards a Realistic Model for Parallel Computation}},
+    author	= {A. Alexandrov and M.F. Ionescu and K.E. Schauser and C. Scheiman},
+    booktitle	= SPAA # { (SPAA'95)},
+    pages	= {95--105},
+    year	= {1995}
+}
+    
+@incollection
+{
+		  Allen:89,
+    title	= {{A Note on the Computer Solution of Connect-Four}},
+    author	= {J.D. Allen},
+    booktitle	= {Heuristic Programming in Artificial Intelligence 1: the First Computer Olympiad},
+    editor	= {D.N.L. Levy and D.F. Beal},
+    pages	= {134--135},
+    publisher	= {Ellis Horwood},
+    address	= {Chichester, England},
+    year	= {1989}
+}
+
+@incollection
+{
+		  Allis:91,
+    title	= {{Databases in Awari}},
+    author	= {L.V. Allis and M. van der Meulen and H.J. van den Herik},
+    booktitle	= {Heuristic Programming in Artificial Intelligence 2: the Second Computer Olympiad},
+    editor	= {D.N.L. Levy and D.F. Beal},
+    pages	= {73--86},
+    publisher	= {Ellis Horwood},
+    address	= {Chichester, England},
+    year	= {1991}
+}
+    
+@phdthesis
+{
+		  Allis:94,
+    title	= {{Searching for Solutions in Games and Artificial Intelligence}},
+    author	= {L.V. Allis},
+    school	= {University of Limburg},
+    address	= {Maastricht, the Netherlands},
+    month	= {September},
+    year	= {1994}
+}
+
+@article
+{
+		  Allis:94b,
+    title	= {{Proof-Number Search}},
+    author	= {L.V. Allis and {M. van der} Meulen and {H.J. van den} Herik},
+    journal	= AI,
+    volume	= {66},
+    number	= {1},
+    pages	= {91--124},
+    year	= {1994}
+}
+
+@inproceedings
+{
+		  Altmann:88,
+    title	= {{Accounting for Parallel Tree Search Overheads}},
+    author	= {E. Altmann and T.A. Marsland and T. Breitkreutz},
+    booktitle	= ICPP,
+    volume	= {III, Algorithms and Applications},
+    pages	= {198--201},
+    address	= {University Park, Penn.},
+    month	= {August},
+    year	= {1988}
+}
+
+@article
+{
+		  Aluru:03,
+    title	= {{Parallel Biological Sequence Comparison using Prefix Computations}},
+    author	= {S. Aluru and N. Futamura and K. Mehrotra},
+    journal	= JPDC,
+    volume	= {63},
+    pages	= {264--272},
+    year	= {2003}
+}
+
+@article
+{
+		  Andrews:91,
+    title	= {{Paradigms for Process Interaction in Distributed Programs}},
+    author	= {G.R. Andrews},
+    journal	= ACMCS,
+    volume	= {23},
+    number	= {1},
+    pages	= {49--90},
+    month	= {March},
+    year	= {1991}
+}
+
+@article
+{
+		  Anantharaman:90,
+    title	= {{Singular Extensions: Adding Selectivity to Brute-Force Searching}},
+    author	= {T.S. Anantharaman and M.S. Campbell and F.H. Hsu},
+    journal	= AI,
+    volume	= {43},
+    number	= {1},
+    pages	= {99--109},
+    year	= {1990}
+}
+
+@inproceedings
+{
+		  August:98,
+    title	= {{Integrated Predicated and Speculative Execution in the IMPACT EPIC Architecture}},
+    author	= {D.I. August and D.A. Connors and S.A. Mahlke and J.W. Sias and K.M. Crozier and B.-C. Cheng and P.R. Eaton and Q.B. Olaniran and W.-m.W. Hwu},
+    booktitle	= ISCA,
+    pages	= {227-237},
+    month	= {July},
+    year	= {1998},
+}
+
+@article
+{
+		  Bal:86,
+    title	= {{A Summary of Parallel Alpha-Beta Search Results}},
+    author	= {H.E. Bal and {R. van} Renesse},
+    journal	= ICCA,
+    volume	= {9},
+    number	= {3},
+    pages	= {146--149},
+    month	= {September},
+    year	= {1986}
+}
+
+@article
+{
+		  Bal:89,
+    title	= {{Programming Languages for Distributed Computing Systems}},
+    author	= {H.E. Bal and J.G. Steiner and A.S. Tanenbaum},
+    journal	= ACMCS,
+    volume	= {21},
+    number	= {3},
+    pages	= {261--322},
+    month	= {September},
+    year	= {1989}
+}
+
+@inproceedings
+{
+		  Bal:95,
+    title	= {{Parallel Retrograde Analysis on a Distributed System}},
+    author	= {H.E. Bal and L.V. Allis},
+    booktitle	= SC # { '95},
+    address	= {San Diego, CA},
+    month	= {December},
+    year	= {1995}
+}
+
+@techreport
+{
+		  Bal:96,
+    title	= {{Orca: a Portable User-Level Shared Object System}},
+    author	= {H.E. Bal and others},
+    institution	= {Dept. of Mathematics and Computer Science, Vrije Universiteit, Amsterdam},
+
+    number	= {IR-408},
+    month	= {July},
+    year	= {1996}
+}
+
+@article
+{
+		  Bal:98,
+    title	= {{Performance Evaluation of the Orca Shared Object System}},
+    author	= {H.E. Bal and R. Bhoedjang and R. Hofman and C. Jacobs and K. Langendoen and T. R\"{u}hl and M.F. Kaashoek},
+    journal	= TOCS,
+    volume	= {16},
+    number	= {1},
+    pages	= {1--40},
+    month	= {February},
+    year	= {1998}
+}
+
+@article
+{
+		  Bare:67,
+    title	= {{Interferometer Experiment with Independent Local Oscillators}},
+    author	= {C. Bare and B. Clark and K. Kellermann and M. Cohen and D. Jauncey},
+    journal	= {Science},
+    volume	= {157},
+    number	= {3785},
+    pages	= {189--191},
+    month	= {July},
+    year	= {1967}
+}
+
+@article
+{
+		  Beal:95,
+    title	= {{Multiple Probes of Transposition Tables}},
+    author	= {D.F. Beal and M.C. Smith},
+    journal	= ICCA,
+    volume	= {19},
+    number	= {4},
+    pages	= {227--233},
+    month	= {December},
+    year	= {1995}
+}
+
+@incollection
+{
+		  Beal:89,
+    title	= {{Experiments with the Null-Move}},
+    author	= {D.F. Beal},
+    booktitle	= ACC # { 5},
+    editor	= {D.F. Beal},
+    publisher	= {Elsevier Science Publishers},
+    pages	= {65--79},
+    address	= {Amsterdam},
+    year	= {1989}
+}
+
+@article
+{
+		  Beal:90,
+    title	= {{A Generalized Quiescence Search Algorithm}},
+    author	= {D.F. Beal},
+    journal	= AI,
+    volume	= {43},
+    number	= {1},
+    pages	= {85--98},
+    year	= {1990}
+}
+
+@inproceedings
+{
+		  Berliner:73,
+    title	= {{Some Necessary Conditions for a Master Chess Program}},
+    author	= {H.J. Berliner},
+    booktitle	= IJCAI,
+    address	= {Stanford, MA},
+    pages	= {77--85},
+    year	= {1973}
+}
+
+@article
+{
+		  Bhoedjang:93,
+    title	= {{Panda: A Portable Platform to Support Parallel Programming Languages}},
+    author	= {R.A.F. Bhoedjang and T. R\"{u}hl and R. Hofman and K. Langendoen and H.E. Bal and M.F. Kaashoek},
+    journal	= {Symposium on Experiences with Distributed and Multiprocessor Systems},
+    address	= {San Diego},
+    pages	= {213--226},
+    month	= {September},
+    year	= {1993}
+}
+
+@inproceedings
+{
+		  Bhoedjang:98,
+    title	= {{Optimizing Distributed Data Structures Using Application-Specific Network Interface Software}},
+    author	= {R.A.F. Bhoedjang and J.W. Romein and H.E. Bal},
+    booktitle	= ICPP,
+    address	= {Minneapolis, MN},
+    pages	= {485--492},
+    month	= {August},
+    year	= {1998}
+}
+
+@inproceedings
+{
+		  Bhoedjang:98b,
+    title	= {{Efficient Multicast On Myrinet Using Link-Level Flow Control}},
+    author	= {R.A.F. Bhoedjang and T. R\"{u}hl and H.E. Bal},
+    booktitle	= ICPP,
+    address	= {Minneapolis, MN},
+    pages	= {381--390},
+    month	= {August},
+    year	= {1998}
+}
+
+@phdthesis
+{
+		  Bhoedjang:00,
+    title	= {{Communication Architectures for Parallel-Programming Systems}},
+    author	= {R.A.F. Bhoedjang},
+    school	= {Vrije Universiteit},
+    address	= {Amsterdam, the Netherlands},
+    month	= {June},
+    year	= {2000}
+}
+
+@inproceedings
+{
+		  Bhoedjang:00b,
+    title	= {{Evaluating Design Alternatives for Reliable Communication on High-Speed Networks}},
+    author	= {R.A.F. Bhoedjang and K. Verstoep and T. Ruhl and H.E. Bal and R.F.H. Hofman},
+    booktitle	= ASPLOS,
+    address	= {Cambridge, MA},
+    month	= {November},
+    year	= {2000}
+}
+
+@inproceedings
+{
+		  Blumofe:95,
+    title	= {{Cilk: An Efficient Multithreaded Runtime System}},
+    author	= {R.D. Blumofe and C.F. Joerg and B.C. Kuszmaul and C.E. Leiserson and K.H. Randall and Y. Zhou},
+    booktitle	= PPOPP,
+    pages	= {207--216},
+    address	= {Santa Barbara, CA},
+    month	= {July},
+    year	= {1995}
+}
+
+@inproceedings
+{
+		  Blumofe:96,
+    title	= {{Dag-Consistent Distributed Shared Memory}},
+    author	= {R.D. Blumofe and M. Frigo and C.F. Joerg and C.E. Leiserson and K.H. Randall},
+    booktitle	= IPPS,
+    pages	= {132--141},
+    address	= {Honolulu, Hawaii},
+    month	= {April},
+    year	= {1996}
+}
+
+@article
+{
+		  Boden:95,
+    author	= {N.J. Boden and D. Cohen and R.E. Felderman and A.E. Kulawik and C.L. Seitz and J.N. Seizovic and W. Su},
+    title	= {{Myrinet: A Gigabit-per-second Local Area Network}},
+    journal	= {IEEE Micro},
+    volume	= {15},
+    number	= {1},
+    pages	= {29--36},
+    month	= {February},
+    year	= {1995}
+}
+
+@inproceedings
+{
+		  Bonacina:97,
+    title	= {{The Clause-Diffusion Theorem Prover Peers-mcd}},
+    author	= {M.P. Bonacina},
+    booktitle	= {Proceedings of the 14th CADE (LNAI 1249)},
+    editor	= {W. McCune},
+    publisher	= {Springer-Verlag},
+    pages	= {53--56},
+    month	= {July},
+    year	= {1997}
+}
+
+@mastersthesis
+{
+		  Boonstoppel:08,
+    title	= {{Semi-Transparent Dual-Processing on Blue Gene/L I/O~Nodes}},
+    author	= {P. Boonstoppel},
+    school	= {Dept. of Mathematics and Computer Science, Vrije Universiteit, Amsterdam},
+    month	= {May},
+    year	= {2008}
+}
+
+@incollection
+{
+		  Bratko:82,
+    title	= {{A Test for Comparison of Human and Computer Performance}},
+    author	= {I. Bratko and D. Kopec},
+    booktitle	= ACC # { 3},
+    editor	= {M.R.B. Clarke},
+    publisher	= {Pergamon Press, Oxford},
+    pages	= {31--56},
+    year	= {1982}
+}
+
+@article
+{
+		  Breuker:96,
+    title	= {{Replacement Schemes and Two-Level Tables}},
+    author	= {D.M. Breuker and J.W.H.M. Uiter\-wijk and {H.J. van den} Herik},
+    journal	= ICCA,
+    volume	= {19},
+    number	= {3},
+    pages	= {175--180},
+    month	= {September},
+    year	= {1996}
+}
+
+@inproceedings
+{
+		  Breuker:98,
+    title	= {{A Solution for the GHI Problem for Best-First Search}},
+    author	= {D.M. Breuker and {H.J. van den} Herik and J.W.H.M. Uiter\-wijk and L.V. Allis},
+    booktitle	= {Computers and Games},
+    series	= LNCS,
+    volume	= {1558},
+    editor	= {{H.J. van den} Herik and Hiroyuki Iida},
+    pages	= {25--49},
+    month	= {November},
+    year	= {1998}
+}
+
+@phdthesis
+{
+		  Breuker:98b,
+    title	= {{Memory versus Search in Games}},
+    author	= {D.M. Breuker},
+    school	= {Universiteit Maastricht},
+    address	= {the Netherlands},
+    isbn	= {90-9012006-8},
+    year	= {1998}
+}
+
+@inproceedings
+{
+		  Brockington:96,
+    title	= {{The APHID Parallel Alpha-Beta Search Algorithm}},
+    author	= {M.G. Brockington and J. Schaeffer},
+    booktitle	= SPDP,
+    address	= {New Orleans},
+    pages	= {432--436},
+    month	= {October},
+    year	= {1996}
+}
+
+@article
+{
+		  Brockington:96b,
+    title	= {{A Taxonomy Of Parallel Game-Tree Search Algorithms}},
+    author	= {M.G. Brockington},
+    journal	= ICCA,
+    volume	= {19},
+    number	= {3},
+    pages	= {162--174},
+    month	= {September},
+    year	= {1996}
+}
+
+@incollection
+{
+		  Brockington:97,
+    title	= {{APHID Game-Tree Search}},
+    author	= {M.G. Brockington and J. Schaeffer},
+    booktitle	= ACC # { 8},
+    editor	= {{H.J. van den} Herik and J. Uiter\-wijk},
+    publisher	= {Universiteit Maastricht},
+    address	= {the Netherlands},
+    pages	= {69--91},
+    year	= {1997}
+}
+
+@phdthesis
+{
+		  Brockington:98,
+    title	= {{Asynchronous Parallel Game-Tree Search}},
+    author	= {M.G. Brockington},
+    school	= {University of Alberta},
+    address	= {Edmonton, Alberta, Canada},
+    month	= {November},
+    year	= {1997}
+}
+
+@article
+{
+		  Brockington:00,
+    title	= {{Computer Chess meets Planning}},
+    author	= {M.G. Brockington},
+    journal	= ICGA,
+    volume	= {23},
+    number	= {2},
+    pages	= {85--93},
+    month	= {June},
+    year	= {2000}
+}
+
+@article{lofar,
+	author = "M.P. van Haarlem",
+	title = "LOFAR: The Low Frequency Array",
+	DOI= "10.1051/eas:2005169",
+	note = {\url{http://dx.doi.org/10.1051/eas:2005169}},
+	journal = "European Astronomical Society Publications Series",
+	year = 2005,
+	volume = 15,
+	pages = "431-444",
+}
+
+@phdthesis
+{
+		  Buro:94,
+    title	= {{Techniken f\"ur die Bewertung von Spielsituationen anhand von Beispielen}},
+    author	= {M. Buro},
+    school	= {Universit\"at--GH--Paderborn},
+    address	= {Paderborn, Germany},
+    note	= {In German.},
+    month	= {December},
+    year	= {1994}
+}
+    
+@article
+{
+		  Buro:97,
+    title	= {{The Othello Match of the Year: Takeshi Murakami vs. Logistello}},
+    author	= {M. Buro},
+    journal	= ICCA,
+    volume	= {20},
+    number	= {3},
+    pages	= {189--193},
+    month	= {September},
+    year	= {1997}
+}
+
+@article
+{
+		  Butcher:04,
+    title	= {{LOFAR: First of a New Generation of Radio Telescopes}},
+    author	= {H.R. Butcher},
+    journal	= {Proceedings of the SPIE},
+    volume	= {5489},
+    pages	= {537--544},
+    month	= {October},
+    year	= {2004}
+}
+
+@inproceedings
+{
+		  Buzzard:96,
+    title	= {{An Implementation of the Hamlyn Sender-managed Interface Architecture}},
+    author	= {G. Buzzard and D. Jacobson and M. MacKey and S. Marovich and J. Wilkes},
+    booktitle	= OSDI,
+    pages	= {245--259},
+    address	= {Seattle, WA},
+    month	= {October},
+    year	= {1996}
+}
+
+@article
+{
+		  Calmthout:02,
+    title	= {{Gigabytes in de kuiltjes}},
+    author	= {{M. van} Calmthout},
+    journal	= VK,
+    month	= {August 17,},
+    year	= {2002},
+    note	= {(in Dutch)}
+}
+
+@article
+{
+		  Ciancarini:94,
+    title	= {{Distributed Searches: A Basis for Comparison}},
+    author	= {P. Ciancarini},
+    journal	= ICCA,
+    volume	= {17},
+    number	= {4},
+    pages	= {194--206},
+    month	= {December},
+    year	= {1994}
+}
+
+@manual
+{
+		  Cilk:96,
+    title	= {{Cilk-4.1 (Beta 1) Reference Manual}},
+    organization= {Supercomputing Technologies Group, MIT Laboratory for Computer Science},
+    month	= {September},
+    year	= {1996}
+}
+
+@inproceedings
+{
+		  Cook:97,
+    title	= {{Maximizing the Benefits of Parallel Search Using Machine Learning}},
+    author	= {D. Cook and R. Varnell},
+    booktitle	= AAAI,
+    pages	= {559--564},
+    month	= {July},
+    year	= {1997}
+}
+
+@techreport
+{
+		  Culberson:94,
+    title	= {{Efficiently Searching the 15-Puzzle}},
+    author	= {J. Culberson and J. Schaeffer},
+    institution	= {Department of Computing Science, University of Alberta},
+    number	= {94-08},
+    year	= {1994}
+}
+
+@inproceedings
+{
+		  Culberson:96,
+    title	= {{Searching with Pattern Databases}},
+    author	= {J. Culberson and J. Schaeffer},
+    booktitle	= {Advances in Artificial Intelligence, CSCSI (LNAI 1081)},
+    publisher	= {Springer-Verlag},
+    pages	= {402--416},
+    month	= {May},
+    year	= {1996}
+}
+
+@article
+{
+		  Culberson:98,
+    title	= {{Pattern Databases}},
+    author	= {J. Culberson and J. Schaeffer},
+    journal	= CI,
+    volume	= {14},
+    number	= {4},
+    pages	= {318--334},
+    year	= {1998}
+}
+
+@inproceedings
+{
+		  Culler:93,
+    title	= {{Two Fundamental Limits on Dataflow Multiprocessing}},
+    author	= {D.E. Culler and K.E. Schauser and {T. von} Eicken},
+    booktitle	= {Proceedings of the IFIP WG 10.3 Working Conference on Architectures and Compilation Techniques for Fine and Medium Grain Parallelism},
+    address	= {Orlando, FL},
+    publisher	= {North-Holland},
+    month	= {January},
+    year	= {1993}
+}
+
+@inproceedings
+{
+		  Dean:88,
+    title	= {{An Analysis of Time-Dependent Planning}},
+    author	= {T.L. Dean and M. Boddy},
+    booktitle	= AAAI,
+    pages	= {49--54},
+    month	= {August},
+    year	= {1988}
+}
+
+@unpublished
+{
+		  Deller:05,
+    title	= {{Electronic Transmission and Computation of Very Long Baseline Interferometry and Its Application to Next Generation Radio Telescopes}},
+    author	= {C. Phillips and T. Tzioumis and A. Deller and S. Tingay and C. Harris and K. Haines},
+    note	= {Poster in the first IEEE International Conference on e-Science and Grid Computing},
+    month	= {December},
+    year	= {2005},
+}
+
+@article
+{
+		  Deller:07,
+    title	= {{DiFX: A Software Correlator for Very Long Baseline Interferometry Using Multiprocessor Computing Environments}},
+    author	= {A. Deller and S. Tingay and M. Bailes and C. West},
+    journal	= ASP,
+    volume	= {119},
+    pages	= {318--336},
+    year	= {2007}
+}
+
+
+@inproceedings
+{
+		  Dubnicki:97,
+    title	= {{VMMC-2: Efficient Support for Reliable, Connection-Oriented Communication}},
+    author	= {C. Dubnicki and A. Bilas and Y. Chen and S. Damianakis and K. Li},
+    booktitle	= {Hot Interconnects '97},
+    address	= {Stanford, CA},
+    month	= {April},
+    year	= {1997}
+}
+
+@phdthesis
+{
+		  Ebeling:86,
+    title	= {{All the Right Moves: A VLSI Architecture for Chess}},
+    author	= {C. Ebeling},
+    school	= {Carnegie-Mellon University},
+    address	= {Pittsburg, PA},
+    year	= {1986}
+}
+
+@inproceedings
+{
+		  Eicken:92,
+    title	= {{Active Messages: A Mechanism for Integrated Communication and Computation}},
+    author	= {{T. von} Eicken and D.E. Culler and S.C. Goldstein and K.E. Schausser},
+    booktitle	= ISCA,
+    pages	= {256--266},
+    address	= {Gold Coast, Australia},
+    month	= {May},
+    year	= {1992}
+}
+
+@inproceedings
+{
+		  Eicken:95,
+    title	= {{U-Net: A User-Level Network Interface for Parallel and Distributed Computing}},
+    author	= {{T. von} Eicken and A. Basu and V. Buch and W. Vogels},
+    booktitle	= SOSP,
+    pages	= {303--316},
+    address	= {Copper Mountain, CO},
+    month	= {December},
+    year	= {1995}
+}
+
+@book
+{
+		  Ellis:90,
+    title	= {{The Annotated C++ Reference Manual}},
+    author	= {M.A. Ellis and B. Stroustrup},
+    publisher	= {Addison-Wesley},
+    isbn	= {0-201-51459-1},
+    year	= {1990}
+}
+
+@phdthesis
+{
+		  Engelen:98,
+    title	= {{CTADEL: A Generator of Efficient Numerical Codes}},
+    author	= {{R.A. van} Engelen},
+    school	= {Rijks\-uni\-ver\-sit\-eit Leiden},
+    address	= {the Netherlands},
+    month	= {October},
+    year	= {1998}
+}
+
+@article
+{
+		  Epstein:92,
+    title	= {{Prior Knowledge Strengthens Learning to Control Search in Weak Theory Domains}},
+    author	= {S.L. Epstein},
+    journal	= IJIS,
+    volume	= {7},
+    pages	= {547--586},
+    year	= {1992}
+}
+
+@article
+{
+		  Epstein:96,
+    title	= {{Pattern-Based Learning and Spatially-Oriented Concept Formation with a Multi-Agent, Decision-Making Expert}},
+    author	= {S.L. Epstein and J. Gelfand and J. Lesniak},
+    journal	= CI,
+    volume	= {12},
+    number	= {1},
+    pages	= {199--221},
+    year	= {1996}
+}
+
+@article
+{
+		  Evett:95,
+    title	= {{PRA*: Massively Parallel Heuristic Search}},
+    author	= {M. Evett and J. Hendler and A. Mahanti and D. Nau},
+    journal	= JPDC,
+    volume	= {25},
+    pages	= {133--143},
+    year	= {1995},
+}
+
+@phdthesis
+{
+		  Feldmann:93,
+    title	= {{Game Tree Search on Massively Parallel Systems}},
+    author	= {R. Feldmann},
+    school	= {University of Paderborn},
+    month	= {August},
+    year	= {1993}
+}
+
+@inproceedings
+{
+		  Feldmann:94,
+    title	= {{Studying Overheads in Massively Parallel MIN/MAX-Tree Evaluation}},
+    author	= {R. Feldmann and P. Mysliwietz and B. Monien},
+    booktitle	= SPAA,
+    pages	= {94--103},
+    year	= {1994}
+}
+
+@incollection
+{
+		  Feldmann:94b,
+    title	= {{Game-Tree Search on a Massively Parallel System}},
+    author	= {R. Feldmann and P. Mysliwietz and B. Monien},
+    booktitle	= ACC # { 7},
+    editor	= {{H.J. van den} Herik and I.S. Herschberg and J.W.H.M. Uiter\-wijk},
+    publisher   = {Universiteit Maastricht},
+    address     = {the Netherlands},
+    pages       = {203--218},
+    year        = {1994}
+}
+
+@article
+{
+		  Finkel:82,
+    title	= {{Parallelism in Alpha-Beta Search}},
+    author	= {R.A. Finkel and J. Fishburn},
+    journal	= AI,
+    volume	= {19},
+    pages	= {89--106},
+    year	= {1982}
+}
+
+@article
+{
+		  Finkel:87,
+    title	= {{DIB --- A Distributed Implementation of Backtracking}},
+    author	= {R.A. Finkel and U. Manber},
+    journal	= TOPLAS,
+    volume	= {9},
+    number	= {2},
+    pages	= {235--256},
+    month	= {April},
+    year	= {1987},
+}
+
+@unpublished
+{
+		  Fiuczynki,
+    title	= {{SPINE: Safe Programmable Integrated Network Environment}},
+    author	= {M. Fiuczynski},
+    note	= {\texttt{http://www.cs.washington.edu/homes/mef/research/spine}}
+}
+
+@inproceedings
+{
+		  Frigo:98,
+    title	= {{The Implementation of the Cilk-5 Multithreaded Language}},
+    author	= {M. Frigo and C.E. Leiserson and K.H. Randall},
+    booktitle	= PLDI,
+    address	= {Montreal, Canada},
+    month	= {June},
+    year	= {1998}
+}
+
+@phdthesis
+{
+		  Gasser:95,
+    title	= {{Harnessing Computational Resources for Efficient Exhaustive Search}},
+    author	= {R. Gasser},
+    school	= {ETH Z\"{u}rich, Switzerland},
+    year	= {1995}
+}
+
+@article
+{
+		  Gasser:96,
+    title	= {{Solving Nine-Man's-Morris}},
+    author	= {R. Gasser},
+    journal	= CI,
+    volume	= {12},
+    number	= {1},
+    pages	= {24--41},
+    year	= {1996}
+}
+
+@article
+{
+		  George:00,
+    title	= {{The REPRO Server: Finding Protein Internal Sequence Repeats Through the Web}},
+    author	= {R.A. George and J. Heringa},
+    journal	= TIBS,
+    volume	= {25},
+    number	= {10},
+    pages	= {515--517},
+    month	= {October},
+    year	= {2000}
+}
+
+@article
+{
+		  Gschwind:06,
+    title	= {{Synergistic Processing in Cell's Multicore Architecture}},
+    author	= {M. Gschwind and H.P. Hofstee and B. K. Flachs and M. Hopkins and Y. Watanabe and T. Yamazaki},
+    journal	= MICRO,
+    volume      = {26},
+    number      = {2},
+    pages       = {10--24},
+    month       = {March--April},
+    year        = {2006}
+}
+
+@inproceedings
+{
+		  Goot:00,
+    title	= {{Awari Endgame Databases}},
+    author	= {R. van der Goot},
+    booktitle	= {Computers and Games},
+    publisher	= {Springer},
+    series	= LNCS,
+    volume	= {2063},
+    pages	= {87--95},
+    isbn	= {3-540-43080-6},
+    year	= {2002}
+}
+
+@inproceedings
+{
+		  Goux:00,
+    title	= {{An Enabling Framework for Master-Worker Applications on the Computational Grid}},
+    author	= {J.-P. Goux and S. Kulkarni and J. Linderoth and Michael Yoder},
+    booktitle	= HPDC,
+    pages	= {43--50},
+    address	= {Pittsburgh, PA},
+    month	= {August},
+    year	= {2000}
+}
+ 
+@inproceedings
+{
+		  Greenblatt:67,
+    title	= {{The Greenblatt Chess Program}},
+    author	= {R.D. Greenblatt and D.E. Eastlake III and S.D. Crocker},
+    booktitle	= {Proceedings of the Fall Joint Computing Conference},
+    address	= {San Francisco},
+    pages	= {801--810},
+    year	= {1967}
+}
+
+@inproceedings
+{
+		  Greenwald:96,
+    title	= {{The Synergy Between Non-blocking Synchronization and Operating System Structure}},
+    author	= {M.G. Greenwald and D. Cheriton},
+    booktitle	= OSDI,
+    address	= {Seattle, WA},
+    pages	= {123--136},
+    month	= {October},
+    year	= {1996}
+}
+
+@article
+{
+		  Grune:88,
+    title	= {{A Programmer-friendly LL(1) Parser Generator}},
+    author	= {D. Grune and C.J.H. Jacobs},
+    journal	= SPE,
+    volume	= {18},
+    number	= {1},
+    pages	= {29--38},
+    month	= {January},
+    year	= {1988}
+}
+
+@inproceedings
+{
+		  Gunst:07,
+    title	= {{Signal Processing Aspects of the Low Frequency Array}},
+    author	= {Andre W. Gunst and Mark J. Bentum},
+    booktitle	= ICSPC,
+    pages	= {600--603},
+    address	= {Dubai, United Arab Emirates},
+    month	= {November},
+    year	= {2007}
+}
+ 
+@article
+{
+		  Hamilton:97,
+    title	= {{Deep Blue's Hardware-Software Synergy}},
+    author	= {S. Hamilton and L. Garber},
+    journal	= {IEEE Computer},
+    volume	= {30},
+    number	= {10},
+    pages	= {29--35},
+    year	= {1997}
+}
+
+@article
+{
+		  Hansson:92,
+    title	= {{Criticizing Solutions to Relaxed Models Yields Powerful Admissible Heuristics}},
+    author	= {O. Hansson and A. Mayer and M. Yung},
+    journal	= IS,
+    volume	= {63},
+    number	= {3},
+    pages	= {207--227},
+    year	= {1992}
+}
+
+@article
+{
+		  Hartmann:02,
+    title	= {{889.063.398.406 bordposities - Amsterdamse supercomputer lost bordspel awari op}},
+    author	= {D. Hartmann},
+    journal	= NRC,
+    month	= {September 14,},
+    year	= {2002},
+    note	= {(in Dutch)}
+}
+
+@article
+{
+		  Heger:00,
+    title	= {{Rapid Automatic Detection and Alignment of Repeats in Protein Sequences}},
+    author	= {A. Heger and L. Holm},
+    journal	= PSFG,
+    volume	= {41},
+    pages	= {224--237},
+    year	= {2000}
+}
+
+@article
+{
+		  Herik:02,
+    title	= {{Games Solved: Now and in the Future}},
+    author	= {{H.J. van den} Herik and J.W.H.M. Uiter\-wijk and {J. van} Rijs\-wijck},
+    journal	= AI,
+    volume	= {134},
+    number	= {1--2},
+    pages	= {277--311},
+    month	= {January},
+    year	= {2002}
+}
+
+@article
+{
+		  Heringa:93,
+    title	= {{A Method to Recognize Distant Repeats in Protein Sequences}},
+    author	= {J. Heringa and P. Argos},
+    journal	= PSFG,
+    volume	= {17},
+    pages	= {391--411},
+    year	= {1993}
+}
+
+@article
+{
+		  Heringa:98,
+    title	= {{Detection of Internal Repeats: How Common are They?}},
+    author	= {J. Heringa},
+    journal	= COSB,
+    volume	= {8},
+    pages	= {338--345},
+    year	= {1998}
+}
+
+@article
+{
+		  Herlihy:93,
+    title	= {{A Methodology for Implementing Highly Concurrent Data Objects}},
+    author	= {M. Herlihy},
+    journal	= TOPLAS,
+    volume	= {15},
+    pages	= {745--770},
+    year	= {1993}
+}
+
+@inproceedings
+{
+		  Hessels:09,
+    title	= {{The Radio Sky on Short Timescales with LOFAR: Pulsars and Fast Transients}},
+    author	= {J. Hessels and B. Stappers and J. van Leeuwen},
+    booktitle	= {The Low-Frequency Radio Universe},
+    series	= {ASP Conference Series},
+    note	= {To appear. http://arxiv.org/pdf/0903.1447}
+}
+
+@inproceedings
+{
+		  Holte:99,
+    title	= {{A Space-Time Tradeoff for Memory-Based Heuristics}},
+    author	= {R.C. Holte and I.T. Hern\'{a}dv\"{o}lgyi},
+    booktitle	= AAAI,
+    pages	= {704--709},
+    month	= {July},
+    year	= {1999}
+}
+
+@article
+{
+		  Huang:90,
+    title	= {{A Space-Efficient Algorithm for Local Similarities}},
+    author	= {X. Huang and R.C. Hardison and W. Miller},
+    journal	= CABIOS,
+    volume	= {6},
+    pages	= {373--381},
+    year	= {1990}
+}
+
+@article
+{
+		  Hsu:99,
+    title	= {{IBM's Deep Blue Chess Grandmaster Chips}},
+    author	= {F.-h. Hsu},
+    journal	= {IEEE Micro},
+    volume	= {19},
+    number	= {2},
+    pages	= {70--81},
+    month	= {April},
+    year	= {1999}
+}
+
+@proceedings
+{
+		  IBM:05,
+    title	= {{Blue Gene}},
+    editor	= {J.J. Ritsko and I. Ames and S.I. Raider and J.H. Robinson},
+    publisher	= {IBM Corporation},
+    series	= IJRD,
+    volume	= {49, number 2/3},
+    pages	= {189--500},
+    month	= {March/May},
+    year	= {2005}
+}
+
+@article
+{
+		  IBM:08,
+    title	= {{Overview of the IBM Blue Gene/P Project}},
+    author	= {IBM Blue Gene team},
+    journal	= {{IBM Journal of R\&D}},
+    editor	= {F. Mintzer},
+    volume	= {52},
+    number	= {1/2},
+    OPTmonth	= {January\hspace{0mm}/\hspace{0mm}March},
+    year	= {2008}
+}
+
+@book
+{
+		  Intel-1:96,
+    title	= {{Pentium Pro Family Developer's Manual}},
+    author	= {Intel Corporation},
+    volume	= {1: Specifications},
+    publisher	= {McGraw--Hill},
+    isbn        = {1-55512-259-0},
+    address     = {Mount Prospect, IL},
+    year        = {1996}
+}
+
+@book
+{
+		  Intel-2:96,
+    title	= {{Pentium Pro Family Developer's Manual}},
+    author	= {Intel Corporation},
+    volume	= {2: Programmer's Reference Manual},
+    publisher	= {McGraw--Hill},
+    isbn        = {1-55512-260-4},
+    address     = {Mount Prospect, IL},
+    year        = {1996}
+}
+
+@book
+{
+		  Intel-3:96,
+    title	= {{Pentium Pro Family Developer's Manual}},
+    author	= {Intel Corporation},
+    volume	= {3: Operating System Writer's Manual},
+    publisher	= {McGraw--Hill},
+    isbn        = {1-55512-261-2},
+    address     = {Mount Prospect, IL},
+    year        = {1996}
+}
+
+@misc
+{
+		  Intel:99,
+    title	= {{IA-64 Application Developer's Architecture Guide}},
+    key		= {Intel},
+    note	= {\texttt{http://developer.intel.com/design/ia64/downloads/adag.htm}},
+    month	= {May},
+    year	= {1999}
+}
+
+@article
+{
+		  Irving:00,
+    title	= {{Solving Kalah}},
+    author	= {G. Irving and H.H.L.M. Donkers and J.W.H.M. Uiter\-wijk},
+    journal	= ICGA,
+    volume	= {23},
+    number	= {3},
+    pages	= {139--147},
+    month	= {September},
+    year	= {2000}
+}
+
+
+@inproceedings
+{
+                  Iskra:08,
+    author      = {K. Iskra and J.W. Romein and K. Yoshii and P. Beckman},
+    title       = {{ZOID: I/O-Forwarding Infrastructure for Petascale Architectures}},
+    booktitle   = PPOPP # { (PPoPP'08)},
+    pages       = {153--162},
+    address     = {Salt Lake City, UT},
+    month       = {February},
+    year        = {2008}
+}
+
+@misc
+{
+		  Jive:06,
+    key		= {ZZZ},
+    howpublished= {Private discussions with people from the Joint Institute for VLBI in Europe},
+    month	= {March},
+    year	= {2006}
+}
+
+@inproceedings
+{
+		  Joerg:94,
+    title	= {{Massively Parallel Chess}},
+    author	= {C.F. Joerg and B.C. Kuszmaul},
+    booktitle	= {Third DIMACS Parallel Implementation Challenge},
+    school	= {Rutgers University},
+    month	= {October},
+    year	= {1994} 
+}
+
+@incollection
+{
+		  Junghanns:97,
+    title	= {{Diminishing Returns for Additional Search in Chess}},
+    author	= {A. Junghanns and J. Schaeffer and M. Brockington and Y. Bjornsson and T.A. Marsland},
+    booktitle	= ACC # { 8},
+    editor	= {{H.J. van den} Herik and J. Uiter\-wijk},
+    publisher	= {Universiteit Maastricht},
+    address	= {the Netherlands},
+    pages	= {53--67},
+    year	= {1997}
+}
+
+@inproceedings
+{
+		  Junghanns:97b,
+    title	= {{Search Versus Knowledge in Game-Playing Programs Revisited}},
+    author	= {A. Junghanns and J. Schaeffer},
+    booktitle	= IJCAI,
+    pages	= {692--697},
+    year	= {1997}
+}
+
+@inproceedings
+{
+		  Junghanns:98,
+    title	= {{Single-Agent Search in the Presence of Deadlocks}},
+    author	= {A. Junghanns and J. Schaeffer},
+    booktitle	= AAAI,
+    pages	= {419--424},
+    month	= {July},
+    year	= {1998}
+}
+
+@inproceedings
+{
+		  Kaashoek:91,
+    title	= {{Group Communication in the Amoeba Distributed Operating System}},
+    author	= {M.F. Kaashoek and A.S. Tanenbaum},
+    booktitle	= ICDCS,
    pages	= {222--230},
+    address	= {Arlington, TX},
+    month	= {May},
+    year	= {1991}
+}
+
+@phdthesis
+{
+		  Kaashoek:92,
+    title	= {{Group Communication in Distributed Computer Systems}},
+    author	= {M.F. Kaashoek},
+    school	= {Vrije Universiteit},
+    address	= {Amsterdam},
+    month	= {December},
+    year	= {1992}
+}
+
+@article
+{
+		  Kierulf:90,
+    title	= {{Smart Game Board and Go Explorer: A Study in Software and Knowledge Engineering}},
+    author	= {A. Kierulf and K. Chen and J. Nievergelt},
+    journal	= CACM,
+    volume	= {33},
+    number	= {2},
+    pages	= {152--166},
+    year	= {1990}
+}
+
+@article
+{
+		  Kleinjung:04,
+    title	= {{Contact-based Sequence Alignment}},
+    author	= {J. Kleinjung and J.W. Romein and K. Lin and J. Heringa},
+    journal	= NAR,
+    volume	= {32},
+    number	= {8},
+    pages	= {2464--2473},
+    month	= {April},
+    year	= {2004}
+}
+
+@article
+{
+		  Knuth:75,
+    title	= {{An Analysis of Alpha-Beta Pruning}},
+    author	= {D.E. Knuth and R.W. Moore},
+    journal	= AI,
+    volume	= {6},
+    number	= {4},
+    year	= {1975},
+    pages	= {293--326}
+}
+
+@article
+{
+		  Korf:85,
+    title	= {{Depth-first Iterative Deepening: An Optimal Admissible Tree Search}},
+    author	= {R.E. Korf},
+    journal	= AI,
+    volume	= {27},
+    number	= {1},
+    year	= {1985},
+    pages	= {97--109}
+}
+
+@inproceedings
+{
+		  Korf:96,
+    title	= {{Finding Optimal Solutions to the Twenty-Four Puzzle}},
+    author	= {R.E. Korf and L.A. Taylor},
+    booktitle	= AAAI,
+    pages	= {1202--1207},
+    month	= {August},
+    year	= {1996}
+}
+
+@inproceedings
+{
+		  Korf:97,
+    title	= {{Finding Optimal Solutions to Rubik's Cube Using Pattern Databases}},
+    author	= {R.E. Korf},
+    booktitle	= AAAI,
+    pages	= {700--705},
+    month	= {July},
+    year	= {1997}
+}
+
+@article
+{
+		  Koufaty:03,
+    title	= {{Hyperthreading Technology in the Netburst Microarchitecture}},
+    author	= {D. Koufaty and D.T. Marr},
+    journal	= MICRO,
+    volume	= {23},
+    number	= {2},
    pages	= {56--65},
+    month	= {March},
+    year	= {2003}
+}
+
+@inproceedings
+{
+		  Kruithof:08,
+    title	= {{Real-time Software Correlation}},
+    booktitle	= {International Workshop on Distributed Cooperative Laboratories (INGRID'08)},
+    author	= {N. Kruithof and D. Marchal},
+    note	= {To appear. http://www.jive.nl/dokuwiki/doku.php/scarie:scarie},
+    month	= {April},
+    year	= {2008}
+}
+{
+    booktitle	= {Proceedings of the 3rd International Workshop on Distributed Cooperative Laboratories (INGRID '08)},
+}
+ 
+@inproceedings
+{
+		  Kumar:90,
+    title	= {{Scalable Parallel Formulations of Depth-first Search}},
+    author	= {V. Kumar and V. Rao},
+    booktitle	= PAMIV,
+    publisher	= {Springer-Verlag},
+    editor	= {V. Kumar and P. Gopalakrishnan and L. Kanal},
+    pages	= {1--42},
+    year	= {1990}
+}
+
+@phdthesis
+{
+		  Kuszmaul:94,
+    title	= {{Synchronized MIMD Computing}},
+    author	= {B.C. Kuszmaul},
+    school	= {Massachusetts Institute of Technology},
+    address	= {Cambridge, MA},
+    month	= {May},
+    year	= {1994}
+}
+
+@article
+{
+		  Kuszmaul:95,
+    title	= {{The StarTech Massively Parallel Chess Program}},
+    author	= {B.C. Kuszmaul},
+    journal	= ICCA,
+    volume	= {18},
+    number	= {1},
+    month	= {March},
+    year	= {1995}
+}
+
+@incollection
+{
+		  Lake:94,
+    title	= {{Solving Large Retrograde Analysis Problems Using a Network of Workstations}},
+    author	= {R. Lake and J. Schaeffer and P. Lu},
+    booktitle	= ACC # { 7},
+    editor	= {{H.J. van den} Herik and I.S. Herschberg and J.W.H.M. Uiter\-wijk},
+    pages	= {135--162},
+    publisher	= {Universiteit Maastricht},
+    address	= {the Netherlands},
+    year	= {1994}
+}
+
+@inproceedings
+{
+		  Langendoen:96,
+    title	= {{Integrating Polling, Interrupts, and Thread Management}},
+    author	= {K. Langendoen and J.W. Romein and R.A.F. Bhoedjang and H.E. Bal},
+    booktitle	= {Proceedings of Frontiers'96},
+    address	= {Annapolis, MD},
+    pages	= {13--22},
+    month	= {October},
+    year	= {1996}
+}
+
+@article
+{
+		  Langendoen:97,
+    title	= {{Models for Asynchronous Message Handling}},
+    author	= {K. Langendoen and R.A.F. Bhoedjang and H.E. Bal},
    journal	= {IEEE Concurrency},
+    volume	= {5},
+    number	= {2},
+    pages	= {28--38},
+    month	= {April--June},
+    year	= {1997}
+}
+
+@article
+{
+		  Lee:90,
+    title	= {{The Development of a World Class Othello Program}},
+    author	= {K.-F. Lee and S. Mahajan},
+    journal	= AI,
+    volume	= {43},
+    number	= {1},
+    pages	= {21--36},
+    year	= {1990}
+}
+
+@techreport
+{
+    		  Lesk:75,
+    title	= {{Lex - A Lexical Analyzer Generator}},
+    author	= {M.E. Lesk and E. Schmidt},
+    institution	= {Bell Laboratories Computing Science},
+    number	= {39},
+    month	= {October},
+    year	= {1975}
+}
+
+@article
+{
+		  Lincke:00,
+    title	= {{Large Endgame Databases with Limited Memory Space}},
+    author	= {T.R. Lincke and A. Marzetta},
+    journal	= ICGA,
+    volume	= {23},
+    number	= {3},
    pages	= {131--138},
+    month	= {September},
+    year	= {2000}
+}
+
+@article
+{
+		  Lincke:00b,
+    title	= {{Marvin Wins Awari Tournament}},
+    author	= {T.R. Lincke and R. van der Goot},
+    journal	= ICGA,
+    volume	= {23},
+    number	= {3},
    pages	= {173--174},
+    month	= {September},
+    year	= {2000}
+}
+
+@phdthesis
+{
+		  Lincke:02,
+    title	= {{Exploring the Computational Limits of Large Exhaustive Search Problems}},
+    author	= {T.R. Lincke},
+    school	= {ETH Zurich, Swiss},
+    month	= {June},
+    year	= {2002}
+}
+
+@misc
+{
		  LOFAR_SPECS,
    key		= {LOFAR},
    note	= {\texttt{http://www.lofar.org/p/astronomy\_spec.htm}},
+}
+
+@article
+{
+		  Lorenz:05,
+    title	= {{Vectorization Techniques for the Blue Gene/L Double FPU}},
+    author	= {J. Lorenz and S. Kral and F. Franchetti and C.W. Ueberhuber},
+    journal	= IJRD,
+    volume	= {49},
+    number	= {2/3},
+    pages	= {437--446},
+    month	= {March},
+    year	= {2005}
+}
+
+
+@article
+{
+		  Manzini:95,
+    title	= {{BIDA*: An Improved Perimeter Search Algorithm}},
+    author	= {G. Manzini},
+    journal	= AI,
+    volume	= {75},
+    number	= {2},
+    pages	= {347--360},
+    month	= {June},
+    year	= {1995}
+}
+
+@inproceedings
+{
+		  Maquelin:96,
+    title	= {{Polling Watchdog: Combining Polling and Interrupts for Efficient Message Handling}},
+    author	= {O. Maquelin and G.R. Gao and H.H. J Hum and K.B. Theobald and X. Tian},
+    booktitle	= ISCA,
+    pages	= {179--188},
+    address	= {Philadelphia, PA},
+    month	= {May},
+    year	= {1996}
+}
+
+@article
+{
+		  Marcotte:98,
+    title	= {{A Census of Protein Repeats}},
+    author	= {E.M. Marcotte and M. Pellegrini and T.O. Yeates and D. Eisenberg},
+    journal	= JMB,
+    volume	= {293},
+    pages	= {151--160},
+    year	= {1998}
+}
+
+@article
+{
+		  Marsland:82,
+    title	= {{Parallel Search of Strongly Ordered Game Trees}},
+    author	= {T.A. Marsland and M. Campbell},
+    journal	= ACMCS,
+    volume	= {14},
+    number	= {4},
+    pages	= {533--551},
+    month	= {December},
+    year	= {1982}
+}
+
+@article
+{
+		  Marsland:85,
+    title	= {{Parallel Game-Tree Search}},
+    author	= {T.A. Marsland and F. Popowich},
+    journal	= PAMI,
+    volume	= {7},
+    number	= {4},
+    pages	= {442--452},
+    month	= {July},
+    year	= {1985}
+}
+
+@misc
+{
+		  Matlab,
+    title	= {{Matlab}},
+    key		= {Matlab},
+    note	= {\texttt{http://www.mathworks.com}}
+}
+
+@article
+{
+		  Mattern:87,
+    title	= {{Algorithms for Distributed Termination Detection}},
+    author	= {F. Mattern},
+    journal	= DC,
+    volume	= {2},
+    pages	= {161--175},
+    year	= {1987}
+}
+
+@inproceedings
+{
+		  Mueller:93,
+    title	= {{A Library Implementation of POSIX Threads under UNIX}},
+    author	= {F. Mueller},
+    booktitle	= {Proceedings of the USENIX Conference},
+    organization= {USENIX},
+    address	= {San Diego, CA},
+    pages	= {29--41},
+    year	= {Winter 1993}
+}
+
+@book
+{
+		  Mood:74,
+    title	= {{Introduction to the Theory of Statistics}},
+    author	= {A.M. Mood and F.A. Graybill and D.C. Boes},
+    edition	= {Third},
+    publisher	= {McGraw--Hill},
+    address	= {Singapore},
+    year	= {1974}
+}
+
+@article
+{
+		  Moran:67,
    title	= {{Spectral Line Interferometry with Independent Time Standards at Stations Separated by 845 Kilometers}},
+    author	= {J. Moran and P. Crowther and B. Burke and A. Barrett and A. Rogers and J. Ball and J. Carter and C. Bare},
+    journal	= {Science},
+    volume	= {157},
+    number	= {3789},
+    pages	= {676--677},
+    month	= {August},
+    year	= {1967}
+}
+
+@article
+{
+		  Needleman:70,
+    title	= {{A General Method Applicable to the Search for Similarities in the Amino Acid Sequence of Two Proteins}},
+    author	= {S.B. Needleman and C.D. Wunsch},
+    journal	= JMB,
+    volume	= {48},
+    pages	= {443--453},
+    year	= {1970}
+}
+
+@inproceedings
+{
+		  Newborn:85,
+    title	= {{A Parallel Search Chess Program}},
+    author	= {M. Newborn},
+    booktitle	= ACM,
+    editor	= {S.R. Oliver},
+    publisher	= {ACM Press, New York, NY},
+    pages	= {272--277},
+    year	= {1985}
+}
+
+@book
+{
+		  Nichols:96,
+    title	= {{Pthreads Programming}},
+    author	= {B. Nichols and B. Buttlar and J. Proulx Farrell},
+    publisher	= {O'Reilly \& Associates, Inc.},
+    isbn	= {1-56592-115-1},
+    address	= {Newton, MA},
+    year	= {1996}
+}
+
+@inproceedings
+{
+		  Nieuwpoort:09,
+    title	= {{Using Many-Core Hardware to Correlate Radio Astronomy Signals}},
+    author	= {{R.V. van} Nieuwpoort and J.W. Romein},
+    booktitle	= {Proceedings of ACM International Conference on Supercomputing},
+    address	= {New York, NY},
+    month	= {June},
+    year	= {2009},
+    pages       = {440--449}
+}
+
+@inproceedings
+{
+		  Nijboer:07,
+    author	= {R. J. Nijboer and J. E. Noordam},
+    title	= {{LOFAR Calibration}},
+    booktitle	= ADASS#{ (ADASS XVII)},
+    series	= {ASP Conference Series},
+    number	= {376},
+    pages	= {237--240},
+    editor	= {R. A. Shaw and F. Hill and D. J. Bell},
+    address	= {Kensington, UK},
+    month	= {September},
+    year	= {2007}
+}
+
+@book
+{
+		  Nilsson:71,
+    title	= {{Problem-Solving Methods in Artificial Intelligence}},
+    author	= {N.J. Nilsson},
+    publisher	= {McGraw--Hill},
+    address	= {New York, NY},
+    year	= {1971}
+}
+
+@inproceedings
+{
+		  Ord:08,
+    title	= {{GPUs for data processing in the MWA}},
+    author	= {S. Ord and L. Greenhill and R. Wayth and D. Mitchell and K. Dale and H. Pfister and G. Edgar},
+    booktitle	= ADASS #{ (ADASS XVIII)},
+    month	= {November},
+    year	= {2008},
+    note	= {To appear. http://arxiv.org/abs/0902.0915}
+}
+{
+    address	= {Qu\'ebec, Canada},
+}
+
+@inproceedings
+{
+		  Pakin:95,
+    title	= {{High Performance Messaging on Workstations: Illinois Fast Messages (FM) for Myrinet}},
+    author	= {S. Pakin and M. Lauria and A. Chien},
+    booktitle	= SC # { '95},
+    address	= {San Diego, CA},
+    month	= {December},
+    year	= {1995}
+}
+
+@book
+{
+		  Papert:93,
+    title	= {{Mindstorms: Children, Computers, and Powerful Ideas}},
+    author	= {S. Papert},
+    publisher	= {Harvester Wheatsheaf, New York, N.Y.},
+    isbn	= {0-7450-1604-9},
+    edition	= {Second},
+    pages	= {230},
+    year	= {1993}
+}
+
+@article
+{
+		  Pearl:80,
+    title	= {{Asymptotical Properties of Minimax Trees and Game Searching Procedures}},
+    author	= {J. Pearl},
+    journal	= AI,
+    volume	= {14},
+    number	= {2},
+    pages	= {113--138},
+    year	= {1980}
+}
+
+@article
+{
+		  Pellegrini:99,
+    title	= {{A Fast Algorithm for Genome-Wide Analysis of Proteins with Repeated Sequences}},
+    author	= {M. Pellegrini and E.M. Marcotte and T.O. Yeates},
+    journal	= JMB,
+    volume	= {35},
+    pages	= {440--446},
+    year	= {1999}
+}
+
+@incollection
+{
+		  Pell:92,
+    title	= {{Metagame: A New Challenge for Games and Learning}},
+    author	= {B. Pell},
+    booktitle	= {Heuristic Programming in Artificial Intelligence},
+    volume	= {3},
+    editor	= {{H.J. van den} Herik and L.V. Allis},
+    pages	= {237--251},
+    publisher	= {Ellis Horwood Ltd},
+    address	= {Chichester, West Sussex},
+    year	= {1992}
+}
+
+@phdthesis
+{
+                  Pell:93,
+    title	= {{Strategy Generation and Evaluation for Meta-Game Playing}},
+    author	= {B.D. Pell},
+    school	= {University of Cambridge},
+    month	= {August},
+    year	= {1993}
+}
+
+@inproceedings
+{
+		  Pell:94,
+    title	= {{A Strategic Metagame Player for General Chesslike Games}},
+    author	= {B. Pell},
+    booktitle	= AAAI,
+    pages	= {1378--1385},
+    month	= {July},
+    year	= {1994}
+}
+
+@article
+{
+		  Peterson:81,
+    title	= {{Myths about the Mutual Exclusion Problem}},
+    author	= {G.L. Peterson},
+    journal	= IPL,
+    volume	= {12},
    pages	= {115--116},
+    month	= {June},
+    year	= {1981}
+}
+
+@phdthesis
+{
+		  Plaat:96,
+    title	= {{Research, Re: Search \& Re-Search}},
+    author	= {A. Plaat},
+    school	= {Erasmus Universiteit Rotterdam},
+    address	= {the Netherlands},
+    month	= {June},
+    year	= {1996}
+}
+
+@article
+{
+		  Plaat:96b,
+    title	= {{Best-First Fixed-Depth Minimax Algorithms}},
+    author	= {A. Plaat and J. Schaeffer and W. Pijls and A. de Bruin},
+    journal	= AI,
+    volume	= {87},
+    number	= {1--2},
+    pages	= {255--293},
+    month	= {November},
+    year	= {1996}
+}
+
+@inproceedings
+{
+                  Plaat:96c,
+    title       = {{Exploiting Graph Properties of Game Trees}},
+    author      = {A. Plaat and J. Schaeffer and W. Pijls and A. de Bruin},
+    booktitle	= AAAI,
+    pages	= {234--239},
+    month       = {August},
+    year        = {1996}
+}
+
+@inproceedings
+{
+		  Plaat:99,
+    title	= {{Sensitivity of Parallel Applications to Large Differences in Bandwidth and Latency in Two-Layer Interconnects}},
+    author	= {A. Plaat and H.E. Bal and R.F.H. Hofman},
+    booktitle	= HPCA,
+    pages	= {244--253},
+    address	= {Orlando, FL.},
+    month	= {January},
+    year	= {1999}
+}
+ 
+@article
+{
+		  Powley:91,
+    title	= {{Single-Agent Parallel Window Search}},
+    author	= {C. Powley and R.E. Korf},
+    journal	= PAMI,
+    volume      = {3},
+    number      = {5},
+    pages	= {466--477},
+    year	= {1991}
+}
+
+@inproceedings
+{
+		  Prylli:98,
+    title	= {{BIP: a New Protocol Designed for High Performance Networking on Myrinet}},
+    author	= {L. Prylli and B. Tourancheau},
+    booktitle	= {Workshop PC-NOW, IPPS/SPDP'98},
+    address	= {Orlando, FL},
+    year	= {1998}
+}
+
+@inproceedings
+{
+		  Prylli:98b,
+    title	= {{Modeling of a High Speed Network to Maximize Throughput Performance: The Experience of BIP over Myrinet}},
+    author	= {L. Prylli and B. Tourancheau and R. Westrelin},
+    booktitle	= PDPTA,
+    address	= {Las Vegas, NV},
+    year	= {1998}
+}
+
+@inproceedings
+{
+		  Rao:87,
+    title	= {{A Parallel Implementation of Iterative-Deepening-A*}},
+    author	= {V. Rao and V. Kumar and K. Ramesh},
+    booktitle	= AAAI,
+    pages	= {178--182},
+    month	= {July},
+    year	= {1987}
+}
+
+@article
+{
+		  Reinefeld:83,
+    title	= {{An Improvement of the Scout Tree-Search Algorithm}},
+    author	= {A. Reinefeld},
+    journal	= ICCA,
+    volume	= {6},
+    number	= {4},
+    pages	= {4--14},
+    month	= {December},
+    year	= {1983}
+}
+
+@inproceedings
+{
+		  Reinefeld:85,
+    title	= {{Information Acquisition in Minimal Window Search}},
+    author	= {A. Reinefeld and J. Schaeffer and T.A. Marsland},
+    booktitle	= IJCAI,
+    volume	= {2},
+    pages	= {1040--1043},
+    year	= {1985}
+}
+
+@article
+{
+		  Reinefeld:94,
+    title	= {{Enhanced Iterative-Deepening Search}},
+    author	= {A. Reinefeld and T.A. Marsland},
+    journal	= PAMI,
+    volume	= {16},
+    number	= {7},
+    pages	= {701--710},
+    month	= {July},
+    year	= {1994}
+}
+
+@inproceedings
+{
+		  Reinefeld:94b,
+    title	= {{AIDA* --- Asynchronous Parallel IDA*}},
+    author	= {A. Reinefeld and V. Schnecke},
+    booktitle	= CCAI,
+    pages	= {295--302},
+    address	= {Banff, Canada},
+    year	= {1994}
+}
+
+@article
+{
+		  Rognes:00,
    title	= {{Six-Fold Speedup of Smith--Waterman Sequence Database Searches Using Parallel Processing on Common Microprocessors}},
+    author	= {T. Rognes and E. Seeberg},
+    journal	= BIOINF,
+    volume	= {16},
+    number	= {8},
+    pages	= {699--706},
+    year	= {2000}
+}
+
+@article
+{
+		  Rognes:01,
+    title	= {{ParAlign: a Parallel Sequence Alignment Algorithm for Rapid and Sensitive Database Searches}},
+    author	= {T. Rognes},
+    journal	= NAR,
+    volume	= {29},
+    number	= {7},
+    pages	= {1647--1652},
+    year	= {2001}
+}
+
+@techreport
+{
+		  Romein:94,
+    title	= {{Parallel N-Body Simulation on a Large-Scale Homogeneous Distributed System}},
+    author	= {J.W. Romein and H.E. Bal},
+    institution	= {Dept. of Mathematics and Computer Science, Vrije Universiteit, Amsterdam},
+
+    number	= {IR-364},
+    month	= {December},
+    year	= {1994}
+}
+
+@mastersthesis
+{
+		  Romein:94b,
+    title	= {{Water -- an N-body Simulation Program on a Distributed Architecture}},
+    author	= {J.W. Romein},
    school	= {Dept. of Mathematics and Computer Science, Vrije Universiteit, Amsterdam},
+    month	= {August},
+    year	= {1994}
+}
+
+@inproceedings
+{
+		  Romein:95,
+    title	= {{Parallel N-Body Simulation on a Large-Scale Homogeneous Distributed System}},
+    author	= {J.W. Romein and H.E. Bal},
+    publisher	= {Springer-Verlag},
+    booktitle	= {EuroPar '95},
+    series	= LNCS,
+    volume	= {966},
+    pages	= {473--484},
+    address	= {Stockholm, Sweden},
+    month	= {August},
+    year	= {1995}
+}
+
+@inproceedings
+{
+		  Romein:95b,
+    title	= {{Multigame --- A Very High Level Language for Describing Board Games}},
+    author	= {J.W. Romein and H.E. Bal and D. Grune},
+    booktitle	= {First Annual ASCI Conference},
+    pages	= {278--287},
+    address	= {Heijen, the Netherlands},
+    month	= {May},
+    year	= {1995}
+}
+
+@inproceedings
+{
+                  Romein:97,
+    title       = {{An Application Domain Specific Language for Describing Board Games}},
+    author      = {J.W. Romein and H.E. Bal and D. Grune},
+    booktitle   = PDPTA,
+    pages       = {305--314},
+    volume      = {I},
+    organization= {CSREA},
+    address     = {Las Vegas, NV},
+    month       = {July},
+    year        = {1997}
+}
+
+@inproceedings
+{
+		  Romein:99,
+    title	= {{Transposition Driven Work Scheduling in Distributed Search}},
+    author	= {J.W. Romein and A. Plaat and H.E. Bal and J. Schaeffer},
+    booktitle	= AAAI,
+    pages	= {725--731},
+    address	= {Orlando, FL},
+    month	= {July},
+    year	= {1999}
+}
+
+@techreport
+{
+		  Romein:00b,
+    title	= {{The Multigame Reference Manual}},
+    author	= {J.W. Romein and H.E. Bal and D. Grune},
+    institution	= {Faculty of Sciences, Department of Mathematics and Computer Science, Vrije Universiteit},
+    address	= {Amsterdam, the Netherlands},
+    number	= {IR-475},
+    month	= {August},
+    year	= {2000}
+}
+
+@phdthesis
+{
+		  Romein:01,
+    title	= {{Multigame --- An Environment for Distributed Game-Tree Search}},
+    author	= {J.W. Romein},
+    school	= {Faculty of Sciences, Department of Mathematics and Computer Science, Vrije Universiteit},
+    address	= {Amsterdam, the Netherlands},
+    note	= {\texttt{http://www.cs.vu.nl/\~{}john/thesis/}},
+    month	= {January},
+    year	= {2001}
+}
+
+@inproceedings
+{
+		  Romein:01b,
+    title	= {{Wide-Area Transposition-Driven Scheduling}},
+    author	= {J.W. Romein and H.E. Bal},
+    booktitle	= {IEEE International Symposium on High Performance Distributed Computing},
+    address	= {San Francisco, CA},
+    month	= {August},
+    year	= {2001}
+}
+
+@article
+{
+		  Romein:02,
+    title	= {{A Performance Analysis of Transposition-Table-Driven Scheduling in Distributed Search}},
+    author	= {J.W. Romein and H.E. Bal and J. Schaeffer and A. Plaat},
+    journal	= TPDS,
+    volume	= {13},
+    number	= {5},
+    pages	= {447--459},
+    month	= {May},
+    year	= {2002}
+}
+
+@unpublished
+{
+    		  Romein:02a,
+    title	= {{Solving the Game of Awari on the DAS-2 Distributed Cluster Computer}},
+    author	= {J.W. Romein},
+    note	= {Myrinet User's Group Conference, Vienna, Austria},
+    month	= {May},
+    year	= {2002}
+}
+
+@unpublished
+{
+    		  Romein:02b,
+    title	= {{Solving Awari Using Large-Scale Parallel Retrograde Analysis}},
+    author	= {J.W. Romein},
+    note	= {Third International Conference on Computers and Games'02 (co-located with AAAI'02), Edmonton, AB, Canada},
+    month	= {July},
+    year	= {2002}
+}
+
+@article
+{
+		  Romein:02c,
+    title	= {{Awari is Solved}},
+    author	= {J.W. Romein and H.E. Bal},
+    journal	= ICGA,
+    volume	= {25},
+    number	= {3},
+    pages	= {162--165},
+    month	= {September},
+    year	= {2002}
+}
+
+@article
+{
+		  Romein:03,
+    title	= {{Solving Awari with Parallel Retrograde Analysis}},
+    author	= {J.W. Romein and H.E. Bal},
+    journal	= COMPUTER,
+    volume	= {36},
+    number	= {10},
+    pages	= {26--33},
+    month	= {October},
+    year	= {2003}
+}
+
+@inproceedings
+{
+		  Romein:03b,
+    title	= {{A Million-Fold Speed Improvement in Genomic Repeats Detection}},
+    author	= {J.W. Romein and J. Heringa and H.E. Bal},
+    booktitle	= SC # {'03},
+    address	= {Phoenix, AZ},
+    month	= {November},
+    year	= {2003}
+}
+
+@inproceedings
+{
+		  Romein:06,
+    title	= {{Astronomical Real-Time Streaming Signal Processing on a BG/L Supercomputer}},
+    author	= {J.W. Romein and P.C. Broekema and E. van Meijeren and {K. van der} Schaaf and W.H. Zwart},
+    booktitle	= {{ACM Sym. on Parallel Algorithms and Architectures}},
+    pages	= {59--66},
+    OPTaddress	= {Cambridge, MA},
+    OPTmonth	= {July},
+    year	= {2006}
+}
+
+@techreport
+{
+		  Romein:08,
+    title	= {{Bandpass Correction in LOFAR}},
+    author	= {J.W. Romein},
+    institution	= {ASTRON},
+    note	= {http://www.astron.nl/$\sim$romein/papers/BandPass/bandpass.pdf},
+    month	= {August},
+    year	= {2008}
+}
+
+@inproceedings
+{
+		  Romein:09a,
+    title	= {{FCNP: Fast I/O on the Blue Gene/P}},
+    author	= {J.W. Romein},
+    booktitle	= PDPTA # { (PDPTA'09)},
+    address	= {Las Vegas, NV},
+    month	= {July},
+    year	= {2009}
+}
+
+@misc
+{
+		  Romein:09b,
+    title	= {{Processing Real-Time LOFAR Telescope Data on a Blue Gene/P Supercomputer}},
+    author	= {J.W. Romein and P.C. Broekema and J.D. Mol and R.V. van Nieuwpoort},
+    note	= {Under review},
+    year	= {2009}
+}
+
+@inproceedings
+{
+		  Renesse:98,
+    title	= {{Goal-Oriented Programming, or Composition Using Events, or Threads Considered Harmful}},
+    author	= {{R. van} Renesse},
+    booktitle	= {Proceedings of the Eighth ACM SIGOPS European Workshop},
+    address	= {Sintra, Portugal},
+    month	= {September},
+    year	= {1998}
+}
+
+@article
+{
+		  Schaaf:04,
+    title	= {{The LOFAR Central Processing Facility Architecture}},
+    author	= {{K. van der} Schaaf and C. Broekema and {G. van} Diepen and {E. van} Meijeren},
+    journal	= {Experimental Astronomy},
+    volume	= {17},
+    pages	= {43--58},
+    year	= {2005}
+}
+
+@article
+{
+		  Schaeffer:89,
+    title	= {{The History Heuristic and Alpha-Beta Search Enhancements in Practice}},
+    author	= {J. Schaeffer},
+    journal	= PAMI,
+    volume	= {11},
+    number	= {11},
    pages	= {1203--1212},
+    year	= {1989}
+}
+
+@article
+{
+		  Schaeffer:89b,
+    title	= {{Distributed Game-Tree Searching}},
+    author	= {J. Schaeffer},
+    journal	= JPDC,
+    volume	= {6},
+    pages	= {90--114},
+    year	= {1989}
+}
+
+@article
+{
+		  Schaeffer:92,
+    title	= {{A World Championship Caliber Checkers Program}},
+    author	= {J. Schaeffer and J. Culberson and N. Treloar and B. Knight and P. Lu and D. Szafron},
+    journal	= AI,
+    volume	= {53},
+    pages	= {273--289},
+    year	= {1992}
+}
+
+@inproceedings
+{
+		  Schaeffer:93,
+    title	= {{A Re-examination of Brute-force Search}},
+    author	= {J. Schaeffer and P. Lu and D. Szafron and R. Lake},
+    booktitle	= {Games: Planning and Learning: AAAI 1993 Fall Symposium, Report FS9302},
+    pages	= {51--58},
+    address	= {Chapel Hill, NC},
+    year	= {1993}
+}
+
+@article
+{
+		  Schaeffer:96,
+    title	= {{Chinook: The World Man-Machine Checkers Champion}},
    author	= {J. Schaeffer and R. Lake and P. Lu and M. Bryant},
+    journal	= AIM,
+    volume	= {17},
+    number	= {1},
+    pages	= {21--29},
+    year	= {1996},
+}
+
+@incollection
+{
+		  Schaeffer:00,
+    title	= {{Search Ideas in Chinook}},
+    author	= {J. Schaeffer},
+    booktitle	= {Games in AI Research},
+    editor	= {{H.J. van den} Herik and H. Iida},
+    publisher	= {Universiteit Maastricht},
+    address	= {the Netherlands},
+    isbn	= {90-621-6416-1},
+    pages	= {19--30},
+    year	= {2000}
+}
+
+@misc
+{
+		   Schaeffer:02,
+    note        = {\texttt{http://www.cs.ualberta.ca/\~{}awari/}}
+}
+
+@article
+{
+		  Schijf:94,
+    title	= {{Proof-Number Search and Transpositions}},
+    author	= {M. Schijf and L.V. Allis and J.W.H.M. Uiter\-wijk},
+    journal	= ICCA,
+    volume	= {17},
+    number	= {2},
+    pages	= {63--74},
+    month	= {June},
+    year	= {1994}
+}
+
+@incollection
+{
+		  Scott:69,
+    title	= {{A Chess-Playing Program}},
+    author	= {J.J. Scott},
+    booktitle	= {Machine Intelligence},
+    volume	= {4},
+    editor	= {B. Meltzer and D. Michie},
+    publisher	= {Edinburgh University Press},
+    pages	= {255--265},
+    year	= {1969}
+}
+
+@article
+{
+		  Skillicorn:98,
+    title	= {{Models and Languages for Parallel Computation}},
+    author	= {D.B. Skillicorn and D. Talia},
+    journal	= ACMCS,
+    volume	= {30},
+    number	= {2},
+    pages	= {123--169},
+    month	= {June},
+    year	= {1998}
+}
+
+@incollection
+{
+		  Slate:77,
+    title	= {{CHESS 4.5 --- The Northwestern University Chess Program}},
+    author	= {D.J. Slate and L.R. Atkin},
+    booktitle	= {Chess Skill in Man and Machine},
+    editor	= {P.W. Frey},
+    publisher	= {Springer-Verlag},
+    pages	= {82--118},
+    year	= {1977}
+}
+
+@article
+{
+		  Smith:81,
+    title	= {{Identification of Common Molecular Subsequences}},
+    author	= {T.F. Smith and M.S. Waterman},
+    journal	= JMB,
+    volume	= {147},
+    pages	= {195--197},
+    year	= {1981}
+}
+
+@book
+{
+		  SPARC:94,
+    title	= {{The SPARC Architecture Manual}},
+    editor	= {D.L. Weaver and T. Germond},
+    publisher	= {Prentice Hall},
+    isbn	= {0-13-099227-5},
+    address	= {Menlo Park, CA},
+    year	= {1994}
+}
+
+@book
+{
+		  Sparrowhawk:84,
+    title	= {{LOGO: A Language for Learning}},
+    author	= {A. Sparrowhawk},
+    isbn	= {0-330-28676-5},
+    publisher	= {Pan Books, London},
+    pages	= {172},
+    year	= {1984}
+}
+
+@inproceedings
+{
+		  Stern:97,
+    title	= {{Parallelizing the Murphi Verifier}},
+    author	= {U. Stern and D.L. Dill},
+    booktitle	= {Ninth International Conference on Computer Aided Verification},
+    pages	= {256--267},
+    year	= {1997}
+}
+
+@inproceedings
+{
+		  Taylor:93,
+    title	= {{Pruning Duplicate Nodes in Depth-First Search}},
+    author	= {L. Taylor and R.E. Korf},
+    booktitle	= AAAI,
+    pages	= {756--761},
+    month	= {July},
+    year	= {1993}
+}
+
+@article
+{
+		  Tanenbaum:90,
+    title	= {{Experiences with the Amoeba Distributed Operating System}},
+    author	= {A.S. Tanenbaum and {R. van} Renesse and {H. van} Staveren and G.J. Sharp and S.J. Mullender and A.J. Jansen and {G. van} Rossum},
+    journal	= CACM,
+    volume	= {33},
+    number	= {12},
+    pages	= {46--63},
+    month	= {December},
+    year	= {1990}
+}
+
+@book
+{
+		  Tanenbaum:95,
+    title	= {{Distributed Operating Systems}},
+    author	= {A.S. Tanenbaum},
+    publisher	= {Prentice Hall},
+    isbn	= {0132199084},
+    month	= {January},
+    year	= {1995}
+}
+
+@inproceedings
+{
+		  Thekkath:94,
+    title	= {{Separating Data and Control Transfer in Distributed Operating Systems}},
+    author	= {C.A. Thekkath and H.M. Levy and E.D. Lazowska},
+    booktitle	= ASPLOS,
+    pages	= {2--11},
+    address	= {San Jose, CA},
+    month	= {October},
+    year	= {1994}
+}
+
+@incollection
+{
+		  Thompson:82,
+    title	= {{Computer Chess Strength}},
+    author	= {K. Thompson},
+    booktitle	= ACC # { 3},
+    editor	= {M.R.B. Clarke},
+    publisher	= {Pergamon Press, Oxford},
+    pages	= {55--56},
+    year	= {1982}
+}
+
+@article
+{
+		  Thompson:86,
+    title	= {{Retrograde Analysis of Certain Endgames}},
+    author	= {K. Thompson},
+    journal	= ICCA,
+    volume	= {9},
+    number	= {3},
+    pages	= {131--139},
+    month	= {September},
+    year	= {1986}
+}
+
+@article
+{
+		  Vaas:99,
+    title	= {{Konstellation}},
+    author	= {R. Vaas},
+    journal	= {Bild der Wissenschaft},
+    number	= {Sonne-Special},
+    pages	= {2},
+    month	= {July},
+    year	= {1999}
+}
+
+@inproceedings
+{
+		  Valois:95,
+    title	= {{Lock-Free Linked Lists Using Compare-and-Swap}},
+    author	= {J.D. Valois},
+    booktitle	= PDC,
+    pages	= {214--222},
+    year	= {1995}
+}
+
+@inproceedings
+{
+		  Verstoep:96,
+    title	= {{Efficient Reliable Multicast on Myrinet}},
+    author	= {K. Verstoep and K.G. Langendoen and H.E. Bal},
+    booktitle	= ICPP,
+    volume	= {3},
+    address	= {Bloomingdale, IL},
+    pages	= {156--165},
+    month	= {August},
+    year	= {1996}
+}
+
+@inproceedings
+{
+                  Verstoep:08,
+    title	= {{Experiences with Fine-grained Distributed Supercomputing on a 10G Testbed}},
+    author	= {K. Verstoep and J. Maassen and H.E. Bal and J.W. Romein},
+    booktitle	= CCGRID,
+    address	= {Lyon, France},
+    month	= {May},
+    year	= {2008}
+}
+
+
+@inproceedings
+{
+		  deVos:01,
+    title	= {{Cluster Computers and Grid Processing in the First Radio-Telescope of a New Generation}},
+    author	= {{C.M. de} Vos and {K. van der} Schaaf and J.D. Bregman},
+    booktitle	= CCGRID,
+    pages	= {156--160},
+    month	= {May},
+    year	= {2001}
+}
+
+@article
+{
+		  deVos:09,
+    title	= {{The LOFAR Telescope: System Architecture and Signal Processing}},
+    author	= {{Marco de} Vos and Andre W. Gunst and Ronald Nijboer},
+    journal	= IEEE,
+    note	= {To appear},
+    year        = {2009}
+}
+
+
+@inproceedings
+{
+		  Wasserman:97,
+    title	= {{Performance Evaluation of the SGI Origin2000: A Memory-Centric Characterization of LANL ASCI Applications}},
+    author	= {H. Wasserman and O.M. Lubeck and Y. Luo and F. Bassetti},
+    booktitle	= SC # { '97},
+    address	= {San Jose, CA},
+    month	= {November},
+    year	= {1997}
+}
+
+@article
+{
+		  Waterman:87,
+    title	= {{A New Algorithm for Best Subsequence Alignments with Application to tRNA--rRNA Comparisons}},
+    author	= {M.S. Waterman and M. Eggert},
+    journal	= JMB,
+    volume	= {197},
+    pages	= {723--725},
+    year	= {1987}
+}
+
+@phdthesis
+{
+		  Weill:95,
+    title	= {{Programme d'\'Echecs de Championnat : Architecture Logicielle, Synth\`ese de Fonctions d'\'Evaluations, Parall\'elisme de Recherche}},
+    author	= {J.-C. Weill},
+    school	= {Universit\'e Paris~8},
+    note	= {In French.},
+    month	= {January},
+    year	= {1995}
+}
+
+@inproceedings
+{
+		  Weill:96,
+    title	= {{The ABDADA Distributed Minimax Search Algorithm}},
+    author	= {J.-C. Weill},
+    booktitle	= {Proceedings of the 24th Annual ACM Computer Science Conference},
+    address	= {Philadelphia, PA},
+    month	= {February},
+    year	= {1996}
+}
+
+@article
+{
+		  Weill:96b,
+    title	= {{The ABDADA Distributed Minimax Search Algorithm}},
+    author	= {J.-C. Weill},
+    journal	= ICCA,
+    volume	= {19},
+    number	= {1},
+    pages	= {3--16},
+    month	= {March},
+    year	= {1996}
+}
+
+@incollection
+{
+		  Welch:91,
+    title	= {{Measuring Performance of Caching in the Sprite Network File System}},
+    author	= {B.B. Welch},
+    booktitle	= {Symposium on Experiences with Distributed and Multiprocessor Systems (SEDMS II)},
+    pages	= {229--247},
+    publisher	= {USENIX},
+    address	= {Atlanta, GA},
+    month	= {March},
+    year	= {1991}
+}
+
+@article
+{
+		  Wu:01,
+    title	= {{Solving Chinese Endgames by Database Construction}},
+    author	= {R. Wu and D.F. Beal},
+    journal	= IS,
+    volume	= {135},
+    number	= {3--4},
+    pages	= {207--228},
+    month	= {July},
+    year	= {2001}
+}
+
+@inproceedings
+{
+                  Yoshii:09,
+    title       = {{Characterizing the Performance of Big Memory on Blue Gene Linux}},
+    author      = {K. Yoshii and K. Iskra and P.C. Broekema and H. Naik and P. Beckman},
+    booktitle   = {International Workshop on Parallel Programming Models and Systems Software for High-End Computing (P2S2'09)},
+    year        = {2009}
+}
+
+@techreport
+{
+		  Zobrist:70,
+    title	= {{A New Hashing Method with Application for Game Playing}},
+    author	= {A.L. Zobrist},
+    institution	= {Computer Science Department, University of Wisconsin, Madison},
+    number	= {88},
+    year	= {1970},
+    note	= {Reprinted in: {\em ICCA Journal}, 13(2):69--73, 1990}
+}
+
+@Article{bgp,
+  author = 	 {{IBM Blue Gene team}},
+  title = 	 {{Overview of the IBM Blue Gene/P project}},
+  journal = 	 {IBM Journal of Research and Development},
+  year = 	 {2008},
+  OPTkey = 	 {},
+  volume = 	 {52},
+  number = 	 {1/2},
+  pages = 	 {199--220},
+  month = 	 {January/March},
+  OPTnote = 	 {},
+  OPTannote = 	 {}
+}
+
+@InProceedings{sc09,
+  author = 	 {John W. Romein and P. Chris Broekema and Jan David Mol and {Rob V. van Nieuwpoort}},
+  title = 	 {{Processing Real-Time LOFAR Telescope Data on a Blue Gene/P Supercomputer}},
+  OPTcrossref =  {},
+  OPTkey = 	 {},
+  OPTbooktitle = {},
+  OPTpages = 	 {},
+  year = 	 {2009},
+  OPTeditor = 	 {},
+  OPTvolume = 	 {},
+  OPTnumber = 	 {},
+  OPTseries = 	 {},
+  OPTaddress = 	 {},
+  OPTmonth = 	 {},
+  OPTorganization = {},
+  OPTpublisher = {},
+  note = 	 {Submitted for publication. See http://www.astron.nl/~romein/papers.},
+  OPTannote = 	 {}
+}
+
+@Manual{amd-manual,
+  title = 	 {AMD Stream Computing User Guide Revision 1.1},
+  OPTkey = 	 {},
+  OPTauthor = 	 {{Advanced Micro Devices Corporation (AMD)}},
+  OPTorganization = {},
+  OPTaddress = 	 {},
+  OPTedition = 	 {},
+  OPTmonth = 	 {August},
+  year = 	 {2008},
+  OPTnote = 	 {},
+  OPTannote = 	 {}
+}
+
+@Article{cell,
+  author = 	 {Michael Gschwind and H. Peter Hofstee and Brian K. Flachs and Martin Hopkins and Yukio Watanabe and Takeshi Yamazaki},
+  title = 	 {{Synergistic Processing in Cell's Multicore Architecture}},
+  journal = 	 {IEEE Micro},
+  year = 	 {2006},
+  OPTkey = 	 {},
+  volume = 	 {26},
+  number = 	 {2},
+  pages = 	 {10--24},
+  OPTmonth = 	 {},
+  OPTnote = 	 {},
+  OPTannote = 	 {}
+}
+
+@Article{larrabee,
+  author = 	 {Larry Seiler and Doug Carmean and Eric Sprangle and Tom Forsyth and Michael Abrash and Pradeep Dubey and Stephen Junkins and Adam Lake and  Jeremy Sugerman and Robert Cavin and Roger Espasa and Ed Grochowski and Toni Juan and Pat Hanrahan},
+  title = 	 {{Larrabee: A Many-Core x86 Architecture for Visual Computing}},
+  journal = 	 {ACM Transactions on Graphics},
+  year = 	 {2008},
+  OPTkey = 	 {},
+  volume = 	 {27},
+  number = 	 {3},
+  OPTpages = 	 {},
+  month = 	 {August},
+  OPTnote = 	 {},
+  OPTannote = 	 {}
+}
+
+@Book{system-performance,
+  author = 	 {Edward D. Lazowska and John Zahorjana and G. Scott Graham and Kenneth C. Sevcik},
+  ALTeditor = 	 {},
+  title = 	 {Quantitative System Performance, Computer System Analysis Using Queueing Network Models},
+  publisher = 	 {Prentice-Hall},
+  year = 	 {1984},
+  OPTkey = 	 {},
+  OPTvolume = 	 {},
+  OPTnumber = 	 {},
+  OPTseries = 	 {},
+  OPTaddress = 	 {},
+  OPTedition = 	 {},
+  OPTmonth = 	 {},
+  note = 	 {ISBN: 978-0137469758},
+  OPTannote = 	 {It remains the definitive work on analytic modelling of computer systems. The publisher has returned the copyright to the authors, who make the material available here for viewing or downloading, in Adobe Acrobat PDF format}
+}
+
+@InProceedings{fftc,
+  author = 	 {D.A. Bader and V. Agarwal},
+  title = 	 {{FFTC: Fastest Fourier Transform for the IBM Cell Broadband Engine}},
+  OPTcrossref =  {},
+  OPTkey = 	 {},
+  booktitle = {14th IEEE Intl. Conference on High Performance Computing},
+  pages = 	 {172--184},
+  year = 	 {2007},
+  OPTeditor = 	 {},
+  OPTvolume = 	 {},
+  OPTnumber = 	 {4873},
+  OPTseries = 	 {LNCS},
+  OPTaddress = 	 {Goa, India},
+  OPTmonth = 	 {december},
+  OPTorganization = {},
+  OPTpublisher = {Springer-Verlag},
+  OPTnote = 	 {},
+  OPTannote = 	 {}
+}
+
+@Article{askap,
+  author = 	 {Johnston, S. and Taylor, R. and Bailes, M. and others},
+  OPTauthor = 	 {Johnston, S. and Taylor, R. and Bailes, M. and Bartel, N. and Baugh, C. and Bietenholz, M. and Blake, C. and Braun, R. and Brown, J. and Chatterjee, S. and Darling, J. and Deller, A. and Dodson, R. and Edwards, P. and Ekers, R. and Ellingsen, S. and Feain, I. and Gaensler, B. and Haverkorn, M. and Hobbs, G. and Hopkins, A. and Jackson, C. and James, C. and Joncas, G. and Kaspi, V. and Kilborn, V. and Koribalski, B. and Kothes, R. and Landecker, T. and Lenc, A. and Lovell, J. and Macquart, J.-P. and Manchester, R. and Matthews, D. and McClure-Griffiths, N. and Norris, R. and Pen, U.-L. and Phillips, C. and Power, C. and Protheroe, R. and Sadler, E. and Schmidt, B. and Stairs, I. and Staveley-Smith, L. and Stil, J. and Tingay, S. and Tzioumis, A. and Walker, M. and Wall, J. and  Wolleben, M.},
+  title = 	 {{Science with ASKAP. The Australian Square-Kilometre-Array Pathfinder}},
+  journal = 	 {Experimental Astronomy},
+  year = 	 {2008},
+  OPTkey = 	 {},
+  volume = 	 {22},
+  number = 	 {3},
+  pages = 	 {151--273},
+  OPTmonth = 	 {},
+  OPTnote = 	 {DOI: 10.1007/s10686-008-9124-7},
+  OPTannote = 	 {The future of cm and m-wave astronomy lies with the
+                  Square Kilometre Array (SKA), a telescope under
+                  development by a consortium of 17 countries. The SKA
+                  will be 50 times more sensitive than any existing
+                  radio facility. A majority of the key science for
+                  the SKA will be addressed through large-area imaging
+                  of the Universe at frequencies from 300 MHz to a few
+                  GHz. The Australian SKA Pathfinder (ASKAP) is aimed
+                  squarely in this frequency range, and achieves
+                  instantaneous wide-area imaging through the
+                  development and deployment of phase-array feed
+                  systems on parabolic reflectors. This large
+                  field-of-view makes ASKAP an unprecedented synoptic
+                  telescope poised to achieve substantial advances in
+                  SKA key science. The central core of ASKAP will be
+                  located at the Murchison Radio Observatory in inland
+                  Western Australia, one of the most radio-quiet
+                  locations on the Earth and one of the sites selected
+                  by the international community as a potential
+                  location for the SKA. Following an introductory
+                  description of ASKAP, this document contains 7
+                  chapters describing specific science programmes for
+                  ASKAP. In summary, the goals of these programmes are
+                  as follows:
+
+    The detection of a million galaxies in atomic hydrogen emission
+                  across 75% of the sky out to a redshift of 0.2 to
+                  understand galaxy formation and gas evolution in the
+                  nearby Universe.
+
+Keywords: Radio astronomy techniques, Radio telescopes, Square
+                  kilometre array, Very long baseline interferometry,
+                  Extragalactic HI, Radio continuum surveys,
+                  Cosmological evolution, Galaxy formation, Star
+                  formation, Rotation measure, Extragalactic radio
+                  source polarization, Galactic structure, Galactic
+                  magnetic field, Magellenic clouds, Pulsars, Radio
+                  transient sources, Gamma-ray bursters, Intra-day
+                  variability
+                  }
+}
+
+@Misc{meerkat,
+title = {{Karoo array telescope (MeerKAT)}},
+note = {see \url{http://www.ska.ac.za/}.}
+}
+
+@Article{ska,
+  author = 	 {R.T. Schilizzi and P.E.F. Dewdney and T.J.W. Lazio},
+  title = 	 {{The Square Kilometre Array}},
+  journal = 	 {{Proceedings of SPIE}},
+  year = 	 {2008},
+  OPTkey = 	 {},
+  volume = 	 {7012},
+  OPTnumber = 	 {},
+  OPTpages = 	 {},
+  month = 	 {July},
+  OPTnote = 	 {},
+  OPTannote = 	 {}
+}
+ 
+@Article{embrace,
+  author = 	 {A. Van Ardenne and P. N. Wilkinson and P. D. Patel and J. G. Bij De Vaate},
+  title = 	 {{Electronic Multi-beam Radio Astronomy Concept: Embrace a Demonstrator for the European SKA Program}},
+  journal = 	 {Experimental Astronomy},
+  year = 	 {2004},
+  OPTkey = 	 {},
+  OPTvolume = 	 {17},
+  OPTnumber = 	 {1--3},
+  OPTpages = 	 {65-77},
+  OPTmonth = 	 {June},
+  OPTnote = 	 {},
+  OPTannote = 	 {},
+  publisher = {Springer Netherlands},
+  OPTissn = {0922-6435},
+  OPTdoi = {10.1007/s10686-005-2868-4}
+}
+
+@InProceedings{apertif,
+  author = 	 {Verheijen, M.A.W. and Oosterloo, T.A. and van Cappellen, W.A. and Bakker, L. and Ivashina, M.V. and van der Hulst, J.M.},
+  title = 	 {{Apertif, a focal plane array for the WSRT}},
+  OPTcrossref =  {},
+  OPTkey = 	 {},
+  booktitle = {{The Evolution of Galaxies through the Neutral Hydrogen Window. AIP Conference Proceedings}},
+  pages = 	 {265--271},
+  year = 	 {2008},
+  OPTeditor = 	 {},
+  volume = 	 {1035},
+  OPTnumber = 	 {},
+  OPTseries = 	 {},
+  OPTaddress = 	 {},
+  OPTmonth = 	 {},
+  OPTorganization = {},
+  OPTpublisher = {},
+  OPTnote = 	 {},
+  OPTannote = 	 {},
+  doi = {10.1063/1.2973599}
+}
+
+@InProceedings{brook,
+  author = 	 {Ian Buck and Tim Foley and Daniel Horn and Jeremy Sugerman and Kayvon Fatahalian and Mike Houston and Pat Hanrahan},
+  title = 	 {{Brook for GPUs: Stream Computing on Graphics Hardware}},
+  OPTcrossref =  {},
+  OPTkey = 	 {},
+  booktitle = {ACM Transactions on Graphics, Proceedings of SIGGRAPH},
+  pages = 	 {777--786},
+  year = 	 {2004},
+  OPTeditor = 	 {},
+  OPTvolume = 	 {},
+  OPTnumber = 	 {},
+  OPTseries = 	 {},
+  OPTaddress = 	 {Los Angeles, California},
+  month = 	 {August},
+  OPTorganization = {},
+  OPTpublisher = {ACM Press},
+  OPTnote = 	 {},
+  OPTannote = 	 {}
+}
+
+@Manual{cuda-manual,
+  title = 	 {{NVIDIA CUDA Programming Guide Version 2.0}},
+  OPTkey = 	 {},
+  OPTauthor = 	 {NVIDIA Corporation},
+  OPTorganization = {},
+  OPTaddress = 	 {},
+  OPTedition = 	 {},
+  OPTmonth = 	 {July},
+  year = 	 {2008},
+  OPTnote = 	 {http://developer.nvidia.com/cuda},
+  OPTannote = 	 {}
+}
+
+@inproceedings{gpu-cache,
+abstract = {
+We present a technique for designing memory-bound algorithms with high
+data reuse on Graphics Processing Units (GPUs) equipped with
+close-to-ALU software-managed memory. The approach is based on the
+efficient use of this memory through the implementation of a
+software-managed cache. We also present an analytical model for
+performance analysis of such algorithms. We apply this technique to
+the implementation of the GPU-based solver of the sum-product or
+marginalize a product of functions (MPF) problem, which arises in a
+wide variety of real-life applications in artificial intelligence,
+statistics, image processing, and digital communications. Our
+motivation to accelerate MPF originated in the context of the analysis
+of genetic diseases, which in some cases requires years to complete on
+modern CPUs. Computing MPF is similar to computing the chain matrix
+product of multi-dimensional matrices, but is more difficult due to a
+complex data-dependent access pattern, high data reuse, and a low
+compute-to-memory access ratio. Our GPU-based MPF solver achieves up
+to 2700-fold speedup on random data and 270-fold on real-life genetic
+analysis datasets on GeForce 8800GTX GPU from NVIDIA over the
+optimized CPU version on an Intel 2.4 GHz Core 2 with a 4 MB L2 cache.
+},
+        author = {Silberstein, Mark   and Schuster, Assaf   and Geiger, Dan   and Patney, Anjul   and Owens, John  D. },
+        booktitle = {Proceedings of the 22nd ACM International Conference on Supercomputing},
+        keywords = {gpu},
+        month = {June},
+        pages = {309--318},
+        title = {{Efficient Computation of Sum-products on GPUs Through Software-Managed Cache}},
+        year = {2008}
+}
+
+@Book{correlator-geophysics,
+  author = 	 {W.M. Telford and L.P. Geldart and R.E. Sheriff},
+  OPTeditor = 	 {},
+  title = 	 {Applied Geophysics},
+  publisher = 	 {Cambridge University Press},
+  year = 	 {1991},
+  OPTkey = 	 {},
+  OPTvolume = 	 {},
+  OPTnumber = 	 {},
+  OPTseries = 	 {},
+  OPTaddress = 	 {},
+  OPTedition = 	 {},
+  OPTmonth = 	 {},
+  OPTnote = 	 {},
+  OPTannote = 	 {},
+  note =         {Second Edition, ISBN: 0521326931}
+}
+
+@Book{correlator-radar,
+  author = 	 {J.D. Taylor},
+  OPTeditor = 	 {},
+  title = 	 {{Introduction to Ultra-Wideband Radar Systems}},
+  publisher = 	 {CRC Press},
+  year = 	 {1995},
+  OPTkey = 	 {},
+  OPTvolume = 	 {},
+  OPTnumber = 	 {},
+  OPTseries = 	 {},
+  OPTaddress = 	 {},
+  OPTedition = 	 {},
+  OPTmonth = 	 {},
+  note = 	 {ISBN: 0849344409},
+  OPTannote = 	 {}
+}
+
+@Book{correlator-wireless,
+  author = 	 {P. Chandra and A. Bensky and R. Olexa and D.M. Dobkin and D.A. Lide and F. Dowla},
+  OPTeditor = 	 {},
+  title = 	 {Wireless Networking},
+  publisher = 	 {Newnes Press},
+  year = 	 {2007},
+  OPTkey = 	 {},
+  OPTvolume = 	 {},
+  OPTnumber = 	 {},
+  OPTseries = 	 {},
+  OPTaddress = 	 {},
+  OPTedition = 	 {},
+  OPTmonth = 	 {},
+  note = 	 {ISBN: 0750685824},
+  OPTannote = 	 {}
+}
+
+@Article{roofline,
+  author = 	 {S. Williams and A. Waterman and D. Patterson},
+  title = 	 {{Roofline: An Insightful Visual Performance Model for Floating-Point Programs and Multicore Architectures}},
+  journal = 	 {Communications of the ACM},
+  year = 	 {2009},
+  OPTkey = 	 {},
+  volume = 	 {52},
+  number = 	 {4},
+  OPTissn = {0001-0782},
+  pages = {65--76},
+  OPTdoi = {http://doi.acm.org/10.1145/1498765.1498785},
+  OPTpublisher = {ACM},
+  OPTaddress = {New York, NY, USA},
+  OPTmonth = 	 {},
+  OPTannote = 	 {}
+}
+
+@inproceedings{gridding,
+ author = {van Amesfoort, A.S. and Varbanescu, A.L. and Sips, H.J. and van Nieuwpoort, R.V.},
+ title = {{Multi-Core Platforms for HPC Data-Intensive Kernels}},
+ booktitle = {Proceedings of ACM Computing Frontiers},
+ year = {2009},
+ OPTisbn = {978-1-60558-413-3},
+ pages = {207--216},
+ address = {Ischia, Italy},
+ OPTdoi = {http://doi.acm.org/10.1145/1531743.1531777},
+OPTpublisher = {ACM},
+OPTaddress = {New York, NY, USA},
+ }
+
+@Book{data-access,
+  author = 	 {Catthoor, F. and Danckaert, K. and Kulkarni, K.K. and Brockmeyer, E. and Kjeldsberg, P.G. and van Achteren, T. and Omnes, T.},
+  ALTeditor = 	 {},
+  title = 	 {Data Access and Storage Management for Embedded Programmable Processors},
+  publisher = 	 {Kluwer Academic Publishers},
+  year = 	 {2002},
+  OPTkey = 	 {},
+  OPTvolume = 	 {},
+  OPTnumber = 	 {},
+  OPTseries = 	 {},
+  OPTaddress = 	 {},
+  OPTedition = 	 {},
+  OPTmonth = 	 {},
+  note = 	 {ISBN: 978-0-7923-7689-7},
+  OPTannote = 	 {}
+}
+
+@INPROCEEDINGS{Wilson95dynamicstorage,
+    author = {Paul R. Wilson and Mark S. Johnstone and Michael Neely and David Boles},
+    title = {{Dynamic Storage Allocation: A Survey and Critical Review}},
+    booktitle = {Proceedings of International Workshop on Memory Management},
+    year = {1995},
+    pages = {1--116},
+    publisher = {Springer-Verlag},
+volume = {986},
+series = LNCS,
+address = {Kinross, Scotland}
+}
+
+
+@InProceedings{data-locality,
+  author = 	 {Michael E. Wolf and Monica S. Lam},
+  title = 	 {{A Data Locality Optimizing Algorithm}},
+  OPTcrossref =  {},
+  OPTkey = 	 {},
+  booktitle = {{Proceedings of the ACM SIGPLAN 1991 Conference on Programming Language Design and Implementation (PLDI)}},
+  pages = 	 {30--44},
+  year = 	 {1991},
+  OPTeditor = 	 {},
+  OPTvolume = 	 {},
+  OPTnumber = 	 {},
+  OPTseries = 	 {},
+  address = 	 {Toronto, Ontario, Canada},
+  OPTmonth = 	 {},
+  OPTorganization = {},
+  OPTpublisher = {},
+  note = 	 {ISBN:0-89791-428-7},
+  OPTannote = 	 {}
+}
+
+
+@InProceedings{cache-tlb-compiler,
+  author = 	 {{David F. Bacon, Jyh-Herng Chow, Dz-ching R. Ju, Kalyan Muthukumar and Vivek Sarkar}},
+  title = 	 {{A Compiler Framework for Restructuring Data Declarations to Enhance Cache and TLB Effectiveness}},
+  OPTcrossref =  {},
+  OPTkey = 	 {},
+  booktitle = {Proceedings of the 1994 Conference of the Centre for Advanced Studies on Collaborative Research},
+  pages = 	 {270--282},
+  year = 	 {1994},
+  OPTeditor = 	 {},
+  OPTvolume = 	 {},
+  OPTnumber = 	 {},
+  OPTseries = 	 {},
+  address = 	 {Toronto, Ontario, Canada},
+  OPTmonth = 	 {},
+  OPTorganization = {},
+  publisher = {IBM Press},
+  OPTnote = 	 {},
+  OPTannote = 	 {}
+}
+
+@ARTICLE{Panda96memorydata,
+    author = {Preeti Ranjan Panda and Nikil D. Dutt and Alexandru Nicolau},
+    title = {{Memory Data Organization for Improved Cache Performance in Embedded Processor Applications}},
+    journal = {{ACM Transactions on Design Automation of Electronic Systems}},
+    year = {1996},
+    volume = {2},
+    pages = {384--409}
+}
+
+@ARTICLE{Chen95effectivehardware-based,
+    author = {{Tien-fu Chen and Jean-loup Baer}},
+    title = {{Effective Hardware-Based Data Prefetching for High-performance Processors}},
+    journal = {{IEEE Transactions on Computers}},
+    year = {1995},
+    volume = {44},
+    pages = {609--623}
+}
+
+ @Book{cache,
+  ALTauthor = 	 {},
+  editor = 	 {Meyer, Ulrich and Sanders, Peter and Sibeyn, Jop},
+  title = 	 {Algorithms for Memory Hierarchies},
+  publisher = 	 {Springer Berlin / Heidelberg},
+  year = 	 {2003},
+  OPTkey = 	 {},
+  volume = 	 {2625},
+  OPTnumber = 	 {},
+  series = 	 {Lecture Notes in Computer Science},
+  OPTaddress = 	 {},
+  OPTedition = 	 {},
+  OPTmonth = 	 {},
+  note = 	 {ISBN: 978-3-540-00883-5},
+  OPTannote = 	 {}
+}
+
+@InProceedings{ppopp2010,
+  author = 	 {John W. Romein and P. Chris Broekema and Jan David Mol and {Rob V. van Nieuwpoort}},
+  title = 	 {{The LOFAR Correlator: Implementation and Performance Analysis}},
+  OPTcrossref =  {},
+  OPTkey = 	 {},
+  booktitle = {15th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming (PPoPP 2010)},
+  OPTpages = 	 {},
+  year = 	 {2010},
+  OPTeditor = 	 {},
+  OPTvolume = 	 {},
+  OPTnumber = 	 {},
+  OPTseries = 	 {},
+  address = 	 {Bangalore, India},
+  month = 	 {January},
+  OPTorganization = {},
+  OPTpublisher = {},
+  note = 	 {Accepted for publication. See \url{http://www.astron.nl/~romein/papers/}.},
+  OPTannote = 	 {}
+}
diff --git a/doc/papers/2010/SPM/final/spm.tex b/doc/papers/2010/SPM/final/spm.tex
new file mode 100644
index 0000000000000000000000000000000000000000..5778332265efd8b0389af8241fafca8a85a48352
--- /dev/null
+++ b/doc/papers/2010/SPM/final/spm.tex
@@ -0,0 +1,1254 @@
+\documentclass{article}
+
+\newcommand{\longversion}[1]{}
+\newcommand{\shortversion}[1]{#1}
+
+\usepackage{graphicx}
+\usepackage{listings}
+\usepackage{url}
+
+%% for normal spacing
+\usepackage{spconf}
+
+%% for double spacing
+%\usepackage[left=2cm,top=3cm,right=2cm]{geometry} 
+%\usepackage{setspace}
+%\doublespacing
+
+
+%\title{How to Build a Correlator with Many-Core Hardware}
+\title{Building Correlators with Many-Core Hardware}
+
+%% for normal spacing
+\name{Rob V. van Nieuwpoort and John W. Romein}
+\address{Stichting ASTRON (Netherlands Institute for Radio Astronomy) \\
+Oude Hoogeveensedijk 4, 7991 PD\ \ Dwingeloo, The Netherlands \\
+\texttt{\{nieuwpoort,romein\}@astron.nl}
+}
+
+%% for double spacing
+%\author{Rob V. van Nieuwpoort and John W. Romein \\ 
+%Stichting ASTRON (Netherlands Institute for Radio Astronomy) \\
+%Oude Hoogeveensedijk 4, 7991 PD\ \ Dwingeloo, The Netherlands \\
+%\texttt{\{nieuwpoort,romein\}@astron.nl}
+%}
+
+
+
+\begin{document}
+
+\maketitle
+
+\begin{abstract}
+Radio telescopes typically consist of multiple receivers whose
+signals are cross-correlated to filter out noise.  A recent trend
+is to correlate in software instead of custom-built hardware, taking
+advantage of the flexibility that software solutions offer.  Examples
+include e-VLBI and LOFAR.  However, the data rates are usually high
+and the processing requirements challenging.  Many-core processors are
+promising devices to provide the required processing power.
+
+In this paper, we explain how to implement and optimize
+signal-processing applications on multi-core CPUs and many-core
+architectures, such as the Intel Core i7, NVIDIA and ATI GPUs, and the
+\mbox{Cell/B.E.}  We use correlation as a running example. The
+correlator is a streaming, possibly real-time application, and is much
+more I/O intensive than applications that are typically implemented on
+many-core hardware today.  We compare with the LOFAR production
+correlator on an IBM Blue Gene/P supercomputer.  We discuss several
+important architectural problems which cause architectures to perform
+suboptimally, and also deal with programmability.
+
+The correlator on the Blue Gene/P achieves a superb 96\% of the
+theoretical peak performance.  We show that the processing power and
+memory bandwidth of current GPUs are highly imbalanced. Because of
+this, the correlator achieves only 16\% of the peak on ATI GPUs, and
+32\% on NVIDIA GPUs.  The \mbox{Cell/B.E.} processor, in contrast,
+achieves an excellent 92\%.  Many of the insights we discuss here are not only
+applicable to telescope correlators, but also valuable when developing
+signal-processing applications in general.
+\end{abstract}
+
+
+\section{Introduction}
+
+Radio telescopes produce enormous amounts of data.
+The \emph{Low-Frequency Array\/} (LOFAR)~\cite{deVos:09}, for instance, will produce some tens
+of petabits per day, and the \emph{Australian SKA Pathfinder\/} will
+even produce over six exabits per day~\cite{askap}.
+These modern radio telescopes use many separate receivers as building blocks,
+and combine their signals to form a single large and sensitive instrument.
+
+To extract the sky signal from the system noise, the \emph{correlator\/}
+correlates the signals from different receivers, and integrates the
+correlations over time, to reduce the amount of data.
+This is a challenging problem in radio astronomy,
+since the data volumes are large, and the computational demands grow
+quadratically with the number of receivers.
+Correlators are not limited to astronomy, but are also used 
+in geophysics~\cite{correlator-geophysics},
+radar systems~\cite{correlator-radar}, 
+wireless networking~\cite{correlator-wireless}, etc.
+
+Traditionally, custom-built hardware, and later FPGAs were used to correlate telescope signals.
+A recent development is to use a supercomputer~\cite{ppopp2010}.
+Both approaches have important advantages and disadvantages.
+Custom-built hardware is efficient and consumes modest amounts of power, but is
+inflexible, expensive to design, and has a long development time.
+Solutions that use a supercomputer are much more flexible, but are less
+efficient, and consume more power. %, and are expensive to purchase and maintain.
+Future instruments, like the Square Kilometre Array (SKA), need several orders
+of magnitude more computational resources.
+It is likely that the requirements of the SKA cannot be met by using
+current supercomputer technology. Therefore, it is important to investigate
+alternative hardware solutions.
+
+General-purpose architectures no longer
+achieve performance improvements by increasing the clock frequency, but
+by adding more compute cores and by exploiting parallelism.  Intel's
+recent Core~i7 processor is a good example of this. It has four
+cores and supports additional vector parallelism.
+Furthermore, the high-performance computing community is
+steadily adopting clusters of Graphics Processor Units (GPUs) as a viable
+alternative to supercomputers, due to their unparalleled growth in
+computational performance, increasing flexibility and programmability,
+high power efficiency, and low purchase costs.
+GPUs are highly parallel and contain hundreds of processor cores.
+%% However, their usefulness is often limited to applications that do not require
+%% double-precision floating-point arithmetics, since there is no need for
+%% double-precision calculations to play games.
+%% Hence, the support for double-precision arithmetic is typically poor.
+%% Fortunately, many signal-processing applications do not require double
+%% precision.
+An example of a processor that combines GPU and CPU
+qualities into one design is the Cell Broadband Engine~\cite{cell}.
+The \mbox{Cell/B.E.} consists of an ``ordinary'' PowerPC core and eight powerful
+vector processors that provide the bulk of the processing power.
+Programming the \mbox{Cell/B.E.} requires more effort than programming an ordinary CPU,
+but various studies showed that the \mbox{Cell/B.E.} performs well on
+signal-processing tasks like FFTs~\cite{fftc}.
+
+In this article, we explain how many-core architectures can be
+exploited for signal-processing purposes.  We give
+insights into their architectural limitations, and how to best cope
+with them.  We treat five different, popular architectures with
+multiple cores: the \mbox{Cell/B.E.}, GPUs from both NVIDIA and ATI, the Intel Core i7 processor, and
+the IBM Blue Gene/P (BG/P) supercomputer.  We discuss their
+similarities and differences, and how the architectural differences
+affect optimization choices and the eventual performance of a
+correlator. We also discuss the programmability of the architectures.
+We focus on correlators, but many of the
+findings, claims, and optimizations hold for other signal-processing
+algorithms as well, both inside and outside the area of radio astronomy.
+For instance, we discuss another signal-processing algorithm, radio-astronomy imaging, on many-core
+hardware elsewhere~\cite{gridding}. 
+%, but this
+%paper should be of special interest to those who are willing to invest
+%some extra programming effort to obtain good performance, even if
+%high-level programming support is not available.
+In this paper, we use the LOFAR telescope as a running example, and
+use its production correlator on the BG/P as a comparison. This way,
+we demonstrate how many-core architectures can be used in practice for a real
+application.
+For educational purposes, we made the correlator implementations for all architectures available online.
+They exemplify the different optimization choices for the different architectures.
+The code may be reused under the GNU public license.
+We describe and analyze the correlator on many-core
+platforms in much more detail in~\cite{Nieuwpoort:09}. 
+
+
+\section{Trends in radio astronomy}
+
+During the past decade, new types of radio-telescope concepts emerged that
+rely less on concrete, steel, and extreme cooling techniques, but more on
+signal-processing techniques.
+For example, LOFAR~\cite{deVos:09}, MeerKAT (Karoo Array Telescope)~\cite{meerkat} and
+ASKAP (Australian Square Kilometre Array Pathfinder)~\cite{askap}
+are distributed sensor networks
+that combine the signals of many receiver elements.
+All three are pathfinders for the future SKA (Square Kilometre Array)~\cite{ska} telescope, which
+will be orders of magnitude larger.
+%Also, aperture array tiles like Embrace~\cite{embrace} and focal plane arrays
+%like Apertif~\cite{apertif} are novel multi-receiver concepts.
+%Unlike single-pixel feeds from traditional dish-based telescopes, 
+These
+instruments combine the advantages of higher sensitivity, higher resolution,
+and multiple concurrent observation directions.
+But, they require huge
+amounts of processing power to combine the data from the receiving elements.
+
+The signal-processing hardware technology used to process telescope
+data also changes rapidly.  Only a decade ago, correlators required
+special-purpose ASICs to keep up with the high data rates and
+processing requirements.  The advent of sufficiently fast FPGAs
+significantly lowered the development times and costs of
+correlators, and increased the flexibility
+substantially. LOFAR requires even more flexibility to support many
+different processing pipelines for various observation modes, and uses
+FPGAs for on-the-field processing and a BG/P
+supercomputer to perform real-time, central processing.
+We describe LOFAR in more detail below.
+
+%  dit past hier niet
+%% Recent many-core architectures seem to be a viable complement to the aforementioned processing platforms.
+%% GPUs provide more processing power and are more power-efficient than CPUs,
+%% while GPUs are more flexible and easier to program than FPGAs.
+%% Since GPUs of different vendors are mutually quite different, we did an
+%% extensive performance comparison between the architectures of popular GPUs 
+%% for signal-processing purposes, particularly, for correlation
+%% purposes~\cite{Nieuwpoort:09}.
+
+
+\subsection{The LOFAR telescope}
+
+\begin{figure}[t]
+\vspace{-0.4cm}
+\begin{center}
+\includegraphics[width=60mm]{figures/fig1.jpg}
+\end{center}
+\vspace{-0.5cm}
+\caption{A field with LOFAR antennas.}
+\label{fig:lba-field}
+\end{figure}
+
+\begin{figure*}[t]
+\begin{minipage}[b]{11cm}
+\includegraphics[width=11cm]{figures/fig2.pdf}
+\caption{A simplified overview of the LOFAR processing.}
+\label{fig:lofar-overview}
+\end{minipage}
+\hfill
+\begin{minipage}[b]{55mm}
+\includegraphics[width=0.95\columnwidth]{figures/fig3.pdf}
+\caption{LOFAR layout.}
+\label{fig:map}
+\end{minipage}
+\end{figure*}
+
+LOFAR is an aperture array radio telescope operating in the
+10 to 250~MHz frequency range~\cite{deVos:09}.  It is the first of a new generation of
+radio telescopes, that breaks with the concepts of traditional
+telescopes in several ways.  Rather than using large, expensive
+dishes, LOFAR uses many thousands of simple antennas that have no
+movable parts, see
+Figure~\ref{fig:lba-field}.  Essentially, it is a distributed sensor
+network that monitors the sky and combines all signals centrally.
+This concept requires much more signal processing, but the 
+costs of the silicon for the processing are much lower than the costs of steel that would
+be needed for dishes. Moreover, LOFAR can observe the sky in many
+directions concurrently and switch directions instantaneously.  In
+several ways, LOFAR will be the largest telescope in the world.
+The antennas are simple, but there are a lot of them: 44000 in the full LOFAR
+design. To make radio pictures of the sky with adequate resolution,
+these antennas are to be arranged in clusters.
+In the rest of this paper, we call a cluster of antennas \emph{a receiver}.
+The receivers will be spread out over
+an area of ultimately 350 km in diameter. This is shown in Figure~\ref{fig:map}.
+%In the current phase, 31.000 antenna's with a maximum distance of 100 km will
+%be built. 
+Data transport requirements are in the range of many
+tera-bits/sec and the processing power needed is tens of tera-ops.
+
+Another novelty is the elaborate
+use of \emph{software\/} to process the telescope data in real time.
+%The signals from the
+%antennas are digitised, transported to a central location,
+%and combined in software to emulate a conventional instrument. 
+LOFAR thus is an IT-telescope. 
+The cost
+is dominated by the cost of computing and will follow Moore's law,
+becoming cheaper with time and allowing increasingly large telescopes
+to be built. 
+
+LOFAR will enable exciting new science cases.  First, we expect to see
+the \emph{Epoch of Reionization\/} (EoR), the time that the first star
+galaxies and quasars were formed. Second, LOFAR offers a unique
+possibility in particle astrophysics for studying the origin of
+high-energy \emph{cosmic rays}.  Third, LOFAR's ability to
+continuously monitor a large fraction of the sky makes it uniquely
+suited to find new \emph{pulsars} and to study \emph{transient
+  sources}.  Since LOFAR has no moving parts, it can instantaneously
+switch focus to some galactic event.  Fourth, \emph{Deep Extragalactic
+  Surveys\/} will be carried out to find the most distant radio
+galaxies and study star-forming galaxies.  Fifth, LOFAR will be
+capable of observing the so far unexplored radio waves emitted by
+\emph{cosmic magnetic fields}.  For a more extensive description of
+the astronomical aspects of the LOFAR system, see~\cite{lofar}.
+
+A global overview of the LOFAR processing is given in
+Figure~\ref{fig:lofar-overview}. The thickness of the lines indicates
+the size of the data streams.  Initial processing is done in the
+field, using FPGA technology.  Typical operations that are performed
+there include analog-to-digital conversion, filtering, frequency
+selection, and combination of the signals from the different
+antennas.  Next, the data is transported to the central processing
+location in Groningen via dedicated optical wide-area networks.
+
+The real-time central processing of LOFAR data is done on
+a BG/P supercomputer.  There, we filter the data, and
+perform phase shift and bandpass corrections.
+Next, the signals from all receivers are cross-correlated.  The
+correlation process performs a data reduction by integrating samples
+over time.  Finally, the data is forwarded to a storage cluster, where
+results can be kept for several days.  After an observation has
+finished, further processing, such as RFI removal, calibration, and imaging is done off-line, on commodity cluster
+hardware.  
+In this paper, we focus on the correlator step (the
+highlighted part in the red box in
+Figure~\ref{fig:lofar-overview}), because it must deal with the
+full data streams from all receivers. Moreover, its costs grow
+quadratically with the number of receivers, while all other steps have
+a lower time complexity.
+
+
+\section{Correlating signals}
+\label{sec:correlating}
+
+
+%XF vs FX. lofar is FX. Met een groter aantal inputs is FX efficienter?
+%transpose
+
+
+%% \begin{figure*}[t]
+%% \begin{center}
+%% \includegraphics[width=12cm]{figures/processing-overview.pdf}
+%% \end{center}
+%% \vspace{-0.5cm}
+%% \caption{A simplified view of LOFAR processing.}
+%% \label{fig-processing-overview}
+%% \end{figure*}
+
+
+%The data streams from the receivers contain samples, which are complex
+%numbers that represent the amplitude and phase of a signal.  
+LOFAR's receivers are dual-polarized; they take separate
+samples from orthogonal (X and Y) directions.  The receivers
+support 4, 8 and 16 bit integer samples, where the normal mode of
+operation uses the 16 bit samples to help mitigate the impact of strong RFI. The smaller samples are important
+for observations that require larger sky coverage. 
+Before filtering and correlating, the
+samples are converted to single-precision floating point, since all
+architectures support this well.  This is
+accurate enough for our purposes. From the perspective of the
+correlator, samples thus consist of \emph{four} 32-bit floating point
+numbers: two polarizations, each with a real and an imaginary part.
+
+LOFAR uses an FX correlator: it first filters the different frequencies, and
+then correlates the signals. This is more efficient than an XF correlator for larger numbers of receivers.
+Prior to correlation, the data that comes from
+the receivers must be reordered:
+each input carries the signals of many frequency bands from a single
+receiver, but the correlator needs data from a single frequency of all inputs.
+Depending on the data rate, switching the data can be a real challenge.
+The data reordering phase is outside the scope of this paper, but a correlator
+implementation cannot ignore this issue.
+The LOFAR Blue Gene/P correlator uses the fast 3D~torus for this purpose;
+other multi-core architectures need external switches.
+
+
+The received signals from sky sources are so weak, that the antennas 
+mainly receive noise. To see if there is statistical coherence
+in the noise, simultaneous samples of each pair of receivers are correlated, 
+by multiplying the sample of one receiver with the complex
+conjugate of the sample of the other receiver.
+To reduce the output size, the correlations are integrated over time, by accumulating all products. 
+Therefore, the correlator is mostly multiplying and adding complex numbers.
+Both polarizations of a station A are correlated with both polarizations 
+of a station B, yielding correlations in XX, XY, YX, and YY
+directions.
+The correlator algorithm itself thus is straightforward, and can be
+written in a single formula: \\
+$C_{s_1,s_2\geq s_1,p_1\in\{X,Y\},p_2\in\{X,Y\}} = \displaystyle\sum_{t} Z_{s_1,t,p_1} * Z_{s_2,t,p_2}^\ast$ 
+
+The total number of correlations we have to compute is $(nrReceivers \times
+(nrReceivers + 1)) / 2$, since we need each pair of correlations only
+once. This includes the autocorrelations (the correlation of a receiver with itself),
+since we need them later in the pipeline for calibration purposes.
+The autocorrelations can be computed with fewer instructions.
+We can implement the correlation operation very efficiently, with only
+four fused-multiply-add (fma) instructions, doing eight floating-point
+operations in total. For each pair of receivers, we have to do this
+four times, once for each combination of polarizations. Thus, in total
+we need 32 operations. To perform these operations, we have to load
+the samples generated by two different receivers from memory.  As
+explained above, the samples each consist of four single-precision
+floating-point numbers.  Therefore, we need to load 8 floats or 32 bytes in
+total.  This results in \emph{exactly one FLOP/byte}. 
+We will describe the implementation and optimization of the correlator on the
+many-core systems in more detail in Section~\ref{sec:optimizing}, but first, we explain the architectures themselves. 
+
+
+\section{Many-core architectures}
+
+\begin{table*}[t]
+\begin{center}
+%{\footnotesize % for normal layout
+{\scriptsize % for double spaced
+\begin{tabular}{|l|l|l|l|l|l|}                                                   
+\hline
+Architecture                                 & Intel Core i7 & IBM Blue Gene/P& ATI 4870 &  NVIDIA Tesla C1060 & STI Cell/B.E. \\
+\hline
+\textbf{gflops per chip}                     & \textbf{85}   & \textbf{13.6}  & \textbf{1200}  & \textbf{936}  & \textbf{204.8}\\
+Clock frequency (GHz)                        & 2.67          & 0.850          & 0.75           & 1.296         & 3.2           \\
+cores x FPUs per core = \textbf{total FPUs}  & 4 x 4 = \textbf{16} & 4 x 2 = \textbf{8} & 160 x 5 = \textbf{800} & 30 x 8 = \textbf{240} & 8 x 4 = \textbf{32} \\
+%operations per cycle per FPU                & 2             &   2            & 2              & 2             & 2             \\
+%\hline
+registers per core x register width          & 16 x 4        & 64 x 2         & 1024 x 4      & 2048 x 1       & 128 x 4       \\
+%\hline
+%total L1 data cache size per chip (KB)      & 32            & 128            & undisclosed   & undisclosed    & 2048          \\
+%total L1 cache bandwidth (GB/s)             & undisclosed   & 54.4           & 480           & undisclosed    & 409.6         \\
+total device RAM bandwidth (GB/s)            & n.a.          & n.a.           & 115.2         & 102            & n.a.          \\
+\textbf{total host RAM bandwidth (GB/s)}     & \textbf{25.6} & \textbf{13.6}  & \textbf{4.6}  & \textbf{5.6}   & \textbf{25.8} \\
+%\hline
+%Process Technology (nm)                      & 45            & 90             & 55            & 65             & 65            \\
+%TDP (W)                                      & 130           & 24             & 160           & 236            & 70            \\
+%\textbf{gflops / Watt (based on TDP)}       & \textbf{0.65} & \textbf{0.57}  & \textbf{7.50} & \textbf{3.97}  & \textbf{2.93} \\
+%\hline
+%\textbf{gflops/device bandwidth (gflops / GB/s)}& n.a.       &  n.a.          & \textbf{10.4} & \textbf{9.2}   & n.a.         \\
+%\textbf{gflops/host bandwidth (gflops / GB/s)} & \textbf{3.3}& \textbf{1.0}   & \textbf{150}  & \textbf{117}   & \textbf{7.9} \\
+\hline
+\end{tabular}
+} %\small
+\end{center}
+\vspace{-0.5cm}
+\caption{Properties of the different many-core platforms.}
+\label{architecture-properties}
+\end{table*}
+
+%% \begin{table*}
+%% \begin{center}
+%% \begin{small}
+%% \begin{tabular}{|l|rrrrrr|}
+%% \hline
+%% & GTX~280 & RV770 & Cell BE & BG/P & Core i7 920 & Larrabee \\
+%% \hline
+%% peak performance (GFLOPS) & 936 & 1,200 & 205(SPEs) + 25.6(PPU) & 13.6 & 85 & ? \\
+%% clock (GHz) & 1,3 & 0.75 & 3.2 & 0.85 & 2.67 & ? \\
+%% \#cores & 240 & 800 & 8 & 4 & 4 & $\mathcal{O}$(10) \\
+%% \#threads/core & & & 1 & 1 & 2 & 4 \\
+%% L1 cache size/core (KiB) & & & 256 (I+D) & 32(I) + 32(D) & 32(I) + 32(D) & \\
+%% L2 cache size/core (KiB) & & & & 2 (prefetcher) & 256 (I+D) & \\
+%% L3 cache size/chip (MiB) & & & & 8 & 8 & \\
+%% (device) memory size (GiB) & 4 & &  & 2 or 4 & & \\
+%% peak memory bandwidth (GiB/s) & 102 & 115.2 & & & & \\
+%% \#registers/core & & & 128 & 32 & 16 & 32 \\
+%% \#floats/register (= vector size) & 1 & 1  & 4 & 2 & 4 & 16 \\
+%% manufacturing process (nm) & 65 & & & 90 & 45 & \\
+%% Thermal Design Power (Watt) & 236 & 160 & & & 130 & \\
+%% \hline
+%% \end{tabular}
+%% \end{small}
+%% \end{center}
+%% \end{table*}
+
+In this section, we explain key properties of five different
+architectures with multiple cores, and the most important differences between them. 
+Table~\ref{architecture-properties}
+shows the most important properties of the different many-core
+architectures. 
+
+
+\noindent \\ \emph{General Purpose multi-core CPUs (Intel Core i7)}
+
+\noindent As a reference, we implemented the correlator on a multi-core
+general-purpose architecture, in this case an Intel Core~i7.  The
+theoretical peak performance of the system is 85~gflops, in single
+precision.  The parallelism comes from four cores with 
+hyperthreading.
+Using two threads per core allows the hardware to overlap
+load delays and pipeline stalls with useful work from the other thread.
+The SSE4 instruction set provides SIMD (Single Instruction, Multiple Data) parallelism with a vector length of four floats.
+
+%% SSE4 does not provide fused multiply-add instructions, but the Core~i7
+%% issues vector-multiply and vector-add instructions concurrently in
+%% different pipelines, allowing eight flops per cycle per core.  
+
+\noindent \\ \emph{IBM Blue Gene/P supercomputer}
+
+\noindent The IBM Blue Gene/P~\cite{IBM:08} is the architecture that is
+currently used for the LOFAR correlator.
+Four PowerPC processor cores are integrated on each BG/P chip.
+Each core is extended with two floating-point units, that provide the bulk of the processing power.
+The BG/P is an energy-efficient supercomputer.
+This is accomplished by using many small, low-power chips, at a low clock
+frequency.
+%The supercomputer also has excellent I/O capabilities, there are five
+%specialized networks for communication.
+
+
+\noindent \\ \emph{ATI GPUs}
+
+\noindent ATI's GPU with the highest performance is
+the Radeon 4870~\cite{amd-manual}.  The chip contains 160 cores, with 800 FPUs in total, 
+and has a theoretical peak performance of
+1.2~teraflops. The board uses a PCI-express~2.0 interface
+for communication with the host system.
+The GPU has 1 GB of device memory on-board.
+It is possible to specify if a read should be
+cached by the texture cache or not.
+Each streaming processor also has 16 KB of shared
+memory that is completely managed by the application. 
+On both ATI and NVIDIA GPUs, the application should run many more threads
+than the number of cores. This allows the hardware to overlap memory load delays with useful
+work from other threads.
+
+
+\noindent \\ \emph{NVIDIA GPUs}
+
+\noindent NVIDIA's Tesla C1060 contains a GTX~280 GPU with 240 single
+precision and 30 double precision FPUs~\cite{cuda-manual}. The GTX~280
+uses a two-level hierarchy to group cores.  There are 30~independent
+\emph{multiprocessors\/} that each have 8~cores.  Current NVIDIA GPUs
+have fewer cores than ATI GPUs, but the individual cores are faster.
+The theoretical peak performance is 933 gflops.  The number of
+registers is large: each multiprocessor has 16384 32-bit floating point registers,
+that are shared between all threads that run on it.
+There also is 16~KB of shared memory per
+multiprocessor.  Finally, texture-caching hardware
+is available.  The application can specify which area of device memory
+must be cached, while the shared memory is completely managed by the
+application.
+
+
+
+\noindent \\ \emph{The Cell Broadband Engine}
+
+\noindent The \mbox{Cell/B.E.}~\cite{cell} is a
+heterogeneous many-core processor, designed by Sony, Toshiba and IBM
+(STI).  The \mbox{Cell/B.E.} has nine cores: one Power Processing
+Element (PPE), acting as a main processor, and eight Synergistic
+Processing Elements (SPEs) that provide the real processing power.
+%The cores, the main memory, and the external I/O are connected by a
+%high-bandwidth element interconnection bus.
+%The PPE's main role is to
+%run an operating system and to coordinate the SPEs.  
+An SPE contains
+a RISC core, a 256KB Local Store (LS), and a DMA controller.
+The LS is an extremely fast local memory for both code and data
+and is managed \emph{entirely by the application} with explicit DMA
+transfers to and from main memory.  The LS can be considered the SPU's (explicit) L1 cache.  The
+\mbox{Cell/B.E.} has a large number of registers: each SPU has 128,
+which are 128-bit (4 floats) wide.
+ The SPU can dispatch two
+instructions in each clock cycle using the two pipelines designated
+\emph{even} and \emph{odd}. Most of the arithmetic instructions
+execute on the even pipe, while most of the memory instructions
+execute on the odd pipe. 
+For the performance evaluation, we use a QS21 Cell blade with two
+\mbox{Cell/B.E.} processors.
+The 8 SPEs of a single chip in the
+system have a total theoretical single-precision peak performance of
+205 gflops.
+
+
+\section{Mapping signal-processing algorithms on many-core hardware}
+
+Many-core architectures derive their performance from parallelism.
+Several different forms of parallelism can be identified:
+multi-threading (with or without shared memory), overlapping of I/O
+and computations, instruction-level parallelism, and vector parallelism. Most
+many-core architectures combine several of these methods.  
+Unfortunately, an application has to handle all available levels of parallelism to
+obtain good performance.
+Therefore, it is clear that algorithms have to be adapted to efficiently exploit
+many-core hardware.
+Additional parallelism can be obtained by using multiple processor chips.
+In this paper, however, we restrict ourselves to single chips for simplicity.
+
+
+
+\subsection{Finding parallelism}
+
+The first step is to find parallelism in the algorithm, on all
+different levels.  Basically, this means looking for independent
+operations.  With the correlator, for example, the thousands of
+different frequency channels are completely independent, and can be
+processed in parallel. But there are other, more fine-grained sources
+of parallelism as well.  The correlations for each pair of receivers
+are independent, just like the four combinations of  
+polarizations.  Finally, samples taken at different times can
+be correlated independently, as long as the sub-results are integrated
+later. Of course, the problem now is how to map the parallelism in the
+algorithm to the parallelism provided by the architecture. We found
+that, even for the relatively straightforward correlator algorithm,
+the different architectures require very different mappings and
+strategies.
+
+
+\subsection{Optimizing memory pressure and access patterns}
+
+On many-core architectures, the memory bandwidth is shared between the
+cores.  This has shifted the balance between computational and
+memory performance.  The available memory bandwidth \emph{per operation} has
+decreased dramatically compared to traditional processors.  For the
+many-core architectures we use here, the theoretical bandwidth per operation is
+3--10 times lower than on the BG/P, for instance. In practice, if algorithms
+are not optimized well for many-core platforms, the achieved memory bandwidth can
+easily be ten to a hundred times lower than the theoretical maximum.
+Therefore, we must
+treat memory bandwidth as a scarce resource, and it is important to
+minimize the number of memory accesses.  In fact, one of the most
+important lessons of this paper is that on many-core architectures,
+optimizing the memory properties of the algorithms is more important
+than focusing on reducing the number of compute cycles that is used,
+as is traditionally done on systems with only a few or just one core.
+
+
+\subsubsection{Well-known memory optimization techniques}
+
+The insight that optimizing the interaction with the memory system is
+becoming more and more important is not new.  The book by Catthoor et
+al.~\cite{data-access} is an excellent starting point for more
+information on memory-system related optimizations. 
+%The authors focus
+%on multimedia applications, but the techniques described there are
+%also applicable to the field of signal processing, which has many
+%similarities to multimedia.
+
+We can make a distinction between hardware and software memory
+optimization techniques.  Examples of hardware-based techniques include caching, data
+prefetching, write combining, and pipelining. The software techniques can be divided
+further into compiler optimizations and algorithmic improvements.
+  The distinction between hardware and
+software is not entirely black and white. Data prefetching, for
+instance, can be done both in hardware and software.  Another good
+example is the explicit cache of the \mbox{Cell/B.E.} processor. This is
+an architecture where the programmer handles the cache
+replacement policies instead of the hardware.
+
+Many optimizations focus on utilizing data caches more efficiently.
+Hardware cache hierarchies can, in principle, transparently improve application performance.
+Nevertheless, it is important to take the sizes of the
+different cache levels into account when optimizing an algorithm.  A
+cache line is the smallest unit of memory that can be transferred
+between the main memory and the cache.  Code can be optimized for the
+cache line size of a particular architecture.  Moreover, the
+associativity of the cache can be important.  If a cache is N-way set
+associative, this means that any particular location in memory can be
+cached in either of N locations in the data cache. Algorithms can be
+designed such that they take care that cache lines that are needed
+later are not replaced prematurely. 
+In addition, write combining,
+a technique that allows data writes to be combined and written later in burst mode,
+can be used if the ordering of writes is not important.
+Finally, prefetching can be used to load data into caches or registers ahead of time.
+
+Many cache-related optimization techniques have been described in the
+literature, both in the context of hardware and software. For
+instance, an efficient implementation of hardware-based prefetching is
+described in~\cite{Chen95effectivehardware-based}.  As we will
+describe in Section~\ref{sec:optimizing}, we implemented prefetching
+manually in software, for example by using multi-buffering on the
+\mbox{Cell/B.E.}, or by explicitly loading data into shared memory or
+registers on the GPUs.  A good starting point for cache-aware or
+cache-oblivious algorithms is~\cite{cache}. An example of a technique
+that we used to improve cache efficiencies for the correlator is the
+padding of multi-dimensional arrays with extra ``dummy'' data
+elements.  This can be especially important if memory is accessed with
+a stride of a (large) power of two.  This way, we can make sure that
+cache replacement policies work well, and subsequent elements in an
+array dimension are not mapped onto the same cache location.  This
+well-known technique is described, for instance, by Bacon et
+al.~\cite{cache-tlb-compiler}.  Many additional data access patterns
+optimization techniques are described in~\cite{data-access}.
+
+Many memory optimization techniques have been developed in the context
+of optimizing compilers and runtime systems (e.g., efficient memory allocators). 
+For instance, a lot of research effort has been invested in cache-aware memory allocation; see
+e.g.,~\cite{Wilson95dynamicstorage}.  Compilers can exploit many
+techniques to optimize locality, by applying code and loop
+transformations such as interchange, reversal, skewing, and
+tiling~\cite{data-locality}.  Furthermore, compilers can optimize code for
+the parameters and sizes of the caches, by carefully choosing the
+placement of variables, objects, and arrays in
+memory~\cite{Panda96memorydata}.
+
+The memory systems of the many-core architectures are quite
+complex. GPUs, for instance, have banked device memory, several levels of texture cache,
+in addition to local memory, application-managed
+shared memory (also divided over several banks), and write combining buffers.
+There also are complex interactions between the memory
+system and the hardware thread scheduler.
+GPUs literally run tens of thousands of parallel threads
+to overlap memory latencies, trying to keep all functional units fully occupied.
+We apply the techniques described
+above in software by hand, since we found that the current compilers
+for the many-core architectures do not (yet) implement them well on
+their complex memory systems.
+
+
+% OpenCL helpt misschien, door runtime compilation.
+
+\subsubsection{Applying the techniques}
+
+So, the second step of mapping a signal-processing algorithm to a many-core architecture
+is optimizing the memory behavior. We can split this step into two phases:
+an algorithm phase and an architectural phase.
+In the first phase, we identify algorithm-specific, but
+architecture-independent optimizations. 
+In this phase, it is of key importance to understand that, although a
+set of \emph{operations} in an algorithm can be independent, the \emph{data
+  accesses} may not be.  This is essential for good performance, even though it may not be a
+factor in the correctness of the algorithm. The \emph{number} of memory accesses per operation should
+be reduced as much as possible, sometimes even at the cost of more
+compute cycles. An example is a case
+where different parallel operations read (but not write) the
+same data.  For the correlator, the most important insight here
+is a technique to exploit data reuse opportunities, reducing the number of memory
+loads. We explain this in detail in Section~\ref{sec:tiling}.
+
+\begin{table}[t]
+\begin{center}
+{\footnotesize
+\begin{tabular}{|l|l|l|}
+\hline
+feature                   & Cell/B.E.                      & GPUs \\
+\hline
+access times              & uniform                        & non-uniform \\
+\hline
+cache sharing level       & single thread (SPE)            & all threads in a \\
+                          &                                & multiprocessor \\
+\hline
+access to off-chip mem.   & through DMA only               & supported \\
+\hline
+memory access             & asynchronous DMA               & hardware-managed \\
+overlapping               &                                & thread preemption \\
+\hline
+communication             & DMA between SPEs               & independent thread  \\
+                          &                                & blocks \& shared   \\
+                          &                                & mem. within a block \\
+\hline
+\end{tabular}
+} %\small
+\end{center}
+\vspace{-0.5cm}
+\caption{Differences between memory architectures.}
+\label{memory-properties}
+\end{table}
+
+The second phase deals with architecture-specific optimizations.
+In this phase, we do not reduce the \emph{number} of memory loads, but think about the
+memory \emph{access patterns}. Typically, several cores share one or
+more cache levels. Therefore, the access patterns of several different
+threads that share a cache should be tailored accordingly. On GPUs,
+for example, this can be done by \emph{coalescing} memory accesses.
+This means that different concurrent threads read subsequent memory
+locations.  This can be counter-intuitive, since traditionally, it was
+more efficient to have linear memory access patterns within a
+thread. Table~\ref{memory-properties} summarizes the differences in
+memory architectures of the different platforms.
+Other techniques that are performed in this phase include optimizing cache
+behavior, avoiding load delays and pipeline stalls, exploiting special floating-point instructions, etc.
+We explain several examples of this in more detail in Section~\ref{sec:architecture-optimizations}.
+
+
+\subsection{A simple analytical tool}
+
+A simple analytic approach, the Bound and Bottleneck
+analysis~\cite{system-performance,roofline}, can provide more insight
+on the memory properties of an algorithm. It also gives us a reality
+check, and calculates what the expected maximal performance is that can be
+achieved on a particular platform.
+The number of operations that are performed per byte that has to be
+transferred (the flop/byte ratio) is called the \emph{arithmetic intensity}, or
+$AI$~\cite{system-performance}.  Performance is bound by the product
+of the bandwidth and the $AI$: 
+$\mathit{perf_{max}}$ = $AI \times bandwidth$. 
+Several important assumptions are made
+with this method. First, it assumes that the bandwidth is
+independent of the access pattern.  Second, it assumes a complete
+overlap of communication and computation, i.e., all latencies
+are completely hidden.  Finally, the method does not take caches into
+account. Nevertheless, it gives a rough idea of
+the performance that can be achieved.
+
+It is insightful to apply this method to the correlator on the GPUs.
+We do it for the NVIDIA GPU here, but the results for the ATI hardware are similar.
+With the GPUs, there are several communication steps that influence
+the performance. First, the data has to be transferred from the host to
+the device memory.  Next, the data is read from the device memory into
+registers. The host-to-device bandwidth is limited by the low
+PCI-express throughput, 5.6 GB/s in this case. We can easily show that this is a bottleneck
+by computing the $AI$ for the full system, using the host-to-device transfers. (The AI can also be computed for
+the device memory.)
+ 
+As explained in Section~\ref{sec:correlating}, the number of flops in the correlator is the
+number of receiver combinations times 32 operations, while the number
+of bytes that have to be loaded in total is 16 bytes times the number
+of receivers.  The
+number of combinations is $(nrReceivers \times (nrReceivers + 1)) / 2$ (see Section~\ref{sec:correlating}).
+If we substitute this, we find that the $AI = nrReceivers + 1$.  For
+LOFAR, we can assume 64 receivers (each in turn containing many
+antennas), so the $AI$ is 65 in our case.  Therefore, the performance
+bound on NVIDIA hardware is $65 \times 5.6 = 363$ gflops. This is only
+39\% of the theoretical peak.  Note that this is even optimistic,
+since it assumes perfect overlap of communication and computation.
+%The efficiency can improve if additional processing is performed on the GPU 
+%(e.g., a filter step before the correlator) and intermediate data is kept in device memory.
+
+\subsection{Complex numbers}
+
+\longversion{
+Programming languages like C99 and FORTRAN have native support for complex
+numbers, which is useful for signal-processing applications.
+Arrays of complex numbers can easily be declared like
+\texttt{complex float array[16];}.
+This declaration enforces real and imaginary values at alternating memory
+locations.
+Unfortunately, not all architectures have efficient support for complex
+multiply and division operations on vector elements that are declared this
+way, since these operations require real and imaginary values to be shuffled.
+Moreover, the real and imaginary parts of a product are computed differently.
+
+The Blue Gene/P is the only architecture that efficiently supports all complex
+operations, while the SSE4 instructions of the Core i7 provide limited support.
+To obtain good performance on the other architectures, it may be necessary
+to split the complex array into separate arrays for real and imaginary values.
+This increases the programming effort, since all complex operations must be
+programmed in terms of real operations.
+Both formats are commonly used; a library like FFTW3 supports both of them.
+}
+
+Support for complex numbers is important for signal processing. 
+Explicit hardware support for complex operations is
+preferable, both for programmability and performance. 
+%If it is not available, we can circumvent this by using separate arrays for
+%real values and for imaginary values.  
+Except for the BG/P, none of the architectures support this.
+The different architectures require two different approaches of
+dealing with this problem. If an architecture does not use
+explicit vector parallelism, the complex operations can simply
+be expressed in terms of normal floating point operations. This puts
+an extra burden on the programmer, but achieves good performance. The
+NVIDIA GPUs work this way.  If an architecture does use vector
+parallelism, we can either store the real and complex parts alternatingly inside a
+single vector, or have separate vectors for the two parts.  In both
+cases, support for shuffling data inside the vector registers is
+essential, since complex multiplications operate on both the real and imaginary parts.
+The architectures differ considerably in this
+respect.  The \mbox{Cell/B.E.} excels; its vectors contain four floats, which
+can be shuffled around in arbitrary patterns. Moreover, 
+shuffling and computations can be overlapped effectively.  On ATI
+GPUs, this works similarly.  The SSE4 instructions in the
+Intel CPUs do not support arbitrary shuffling patterns.
+This has a large impact on the way the code is vectorized.
+%, and requires a different SIMDization strategy. 
+
+
+
+
+\section{Implementation and optimization}
+\label{sec:optimizing}
+
+In this section,
+we explain the techniques described above by applying them to the
+correlator for all different architectures.
+
+
+\subsection{Architecture independent optimizations}
+\label{sec:tiling}
+
+\begin{figure}[t]
+\begin{center}
+\includegraphics[width=4.2cm]{figures/fig4.pdf}
+\end{center}
+\vspace{-0.5cm}
+\caption{An example correlation triangle.}
+\label{fig-correlation}
+\end{figure}
+
+%Although in reality the receivers are dual-polarized, and the samples are complex numbers, 
+%we use single-polarized real samples in the following example for simplicity.
+An unoptimized correlator would read the samples from two receivers and
+multiply them, requiring two sample loads for one multiplication.
+We can optimize this by reusing a sample 
+as often as possible, by using it for multiple correlations (see
+Figure~\ref{fig-correlation}).
+The figure is triangular, because we compute
+the correlation of each pair of receivers only once. The squares labeled \emph{A} are
+autocorrelations.
+For example, the samples from receivers 8, 9, 10, and 11 can be correlated
+with the samples from receivers 4, 5, 6, and 7 (the red square in the figure),
+reusing each fetched sample four times.
+By dividing the correlation triangle into $4\times4$ \emph{tiles}, eight samples are read from memory for sixteen
+correlations, reducing the amount of memory operations by a factor
+of four.
+The maximum number of receivers that can
+be simultaneously correlated this way (i.e., the tile size) is limited by the number of registers that an architecture has.
+The samples and accumulated correlations are best kept in registers, and the number of
+required registers grows rapidly with the number of receiver inputs.
+The example above already requires 16 accumulators.
+To obtain good performance, it is important to tune the tile size to the
+architecture.
+%Caches and memory prefetch units can also improve the performance.
+%However, a cache-size dependent tradeoff must be made.
+%On the one hand, correlating and integrating over long periods of time
+%is good for pipelined FPU operation, on the other hand, the 
+There still is
+opportunity for additional data reuse \emph{between} tiles.  The tiles
+within a row or column in the triangle still need the same samples.
+In addition to registers, caches can thus also be used to increase
+data reuse. 
+
+%The efficiency of the cache, however, depends highly on the chosen 
+%integration time, and on the cache-replacement algorithm 
+%(Least-Recently Used works much better here than Round Robin).
+
+%It is important to realize that the
+%correlator itself is \emph{trivially parallel}, since the tens of thousands of
+%frequency channels that LOFAR uses can be processed independently.  This allows us to
+%efficiently exploit many-core hardware.
+
+
+\begin{figure}[t]
+\begin{center}
+\includegraphics[width=\columnwidth]{figures/fig5.pdf} % for normal layout
+%\includegraphics[width=0.5\columnwidth]{figures/performance-graph-v2.pdf} % for double spacing
+\end{center}
+\vspace{-0.5cm}
+\caption{Achieved performance on the different platforms.}
+\label{performance-graph}
+\end{figure}
+
+
+
+\subsection{Architecture-specific optimizations}
+\label{sec:architecture-optimizations}
+
+We will now describe the implementation of the correlator on
+the different architectures, evaluating the performance and optimizations needed in detail. 
+For comparison reasons, we use the performance
+\emph{per chip} for each architecture.
+%We choose 64 as the number of receivers (each in turn consisting of hundreds of antennas), since
+%that is a realistic number for LOFAR.  
+The performance results are shown in Figure~\ref{performance-graph}.
+
+
+\noindent \\ \emph{Intel CPUs}
+
+\noindent The SSE4 instruction set can be used to exploit vector parallelism.  
+Unlike the \mbox{Cell/B.E.} and ATI GPUs, a
+problem with SSE4 is the limited support for shuffling data within
+vector registers.  Computing the
+correlations of the four polarizations within a vector is
+inefficient, and computing four samples with subsequent time stamps in a vector works
+better. 
+%We achieve only a speedup of a factor of 2.8 compared to
+%a version without SSE4.  
+%We found that, unlike on the other platforms,
+%computing four samples with subsequent time stamps in a vector works
+%better.  
+The use of SSE4 improves the performance by a factor of 3.6
+in this case.  In addition, multiple threads should be used to utilize all
+four cores.  To benefit from hyperthreading, twice as many
+threads as cores are needed.  For the correlator, hyperthreading increases performance by 6\%. 
+Also, the number of vector registers is small.
+Therefore, there is not much opportunity to reuse data in registers,
+limiting the tile size to $2 \times 2$; reuse has to come from the
+L1~cache.
+
+
+\noindent \\ \emph{The BG/P supercomputer}
+
+\noindent 
+We found that the BG/P is extremely suitable for our application,
+since it is highly optimized for processing of complex numbers.
+However, the BG/P performs \emph{all} floating point operations in double
+precision, which is overkill for our application.
+Although the BG/P can keep the same number of values in register as the 
+Intel chip, an important difference is that the BG/P has 32
+registers of width 2, compared to Intel's 16 of width 4.  The smaller
+vector size reduces the amount of shuffle instructions needed.
+%The (small) $2 \times 2$ tile size performs best.
+In contrast to all other architectures we evaluate, the problem is compute
+bound instead of I/O bound, thanks to the BG/P's high memory bandwidth per
+operation, which is 3--10 times higher than for the other architectures.
+
+
+\noindent \\ \emph{ATI GPUs}
+
+\noindent The ATI architecture has several important
+drawbacks for data-intensive applications.  First, the
+host-to-device bandwidth is a bottleneck.  Second, 
+overlapping communication with computation does not work well.
+We observed kernel slowdowns of more than \emph{a factor of
+two} due to asynchronous transfers in the background. This can clearly be seen in Figure~\ref{performance-graph}.
+Third, the architecture does not provide random write access to device
+memory, but only to \emph{host} memory.
+%The kernel output can be written to at most 8 output registers
+%(each 4 floats wide).  For the correlator, this effectively limits the
+%tile size to $2\times2$.
+%Random write access to \emph{host} memory is
+%provided.  
+The correlator reduces the data by a large amount, and the
+results are never reused by the kernel. Therefore, they can be
+directly streamed to host memory. Nevertheless, in general, the absence of random
+write access to device memory significantly reduces the programmability, and prohibits the use
+of traditional programming models.
+ATI offers two separate programming models, at different abstraction
+levels~\cite{amd-manual}.  The low-level programming model is called CAL.  
+It provides communication primitives and
+an assembly language, allowing fine-tuning of device
+performance. For high-level programming, ATI provides Brook+.  We
+implemented the correlator with both models.
+In both cases, the programmer has to do the vectorization,
+unlike with NVIDIA GPUs.  CAL provides a feature called
+\emph{swizzling}, which is used to select parts of vector registers in
+arithmetic operations.  We found this improves readability of the code. 
+However, the
+programming tools still are unsatisfactory. The high-level Brook+ model does
+not achieve acceptable performance. The low-level
+CAL model does, but it is difficult to use.
+The best-performing implementation uses a tile size of $4\times3$, thanks to
+the large number of registers.  
+Due to the low I/O performance, we achieve only 16\% of the theoretical peak.
+
+
+
+\noindent \\ \emph{NVIDIA GPUs}
+
+\noindent NVIDIA's programming model is called Cuda~\cite{cuda-manual}.
+Cuda is relatively high-level, and achieves good performance.
+An advantage of NVIDIA hardware, in contrast to ATI, is that the application does not have to do 
+vectorization. This is thanks to the fact that all cores have their own address generation units. 
+All data parallelism is expressed by using threads.
+When accessing device memory, it is important to make sure that
+simultaneous memory accesses by different threads are \emph{coalesced}
+into a single memory transaction.  In contrast to ATI hardware, NVIDIA
+GPUs support random write access to device memory. This allows a
+programming model that is much closer to traditional models, greatly
+simplifying software development.
+It is important to use shared memory or the texture cache to enable data reuse.
+In our case, we use the texture cache to speed-up access to the sample data. 
+Cuda provides barrier synchronization between threads within a thread block.
+We exploit this feature to let
+the threads that access the same samples run in lock step.  This way,
+we pay a small synchronization overhead, but we can increase the cache hit
+ratio significantly.  We found that this optimization improved performance by a factor of 2.
+This optimization is a good example that shows that, on GPUs, it is important to optimize
+memory behavior, even at the cost of additional instructions and synchronization overhead.
+
+We also investigated the use of the per-multiprocessor shared memory as an
+application-managed cache.  Others report good results with this
+approach~\cite{gpu-cache}.  However, we found that, for our
+application, the use of shared memory only led to performance
+degradation compared to the use of the texture caches.
+
+Registers are a shared resource. Using fewer registers in a kernel
+allows the use of more concurrent threads, hiding load delays.
+We found that using a relatively small tile size ($3\times2$) and many threads increases performance.
+The kernel itself, without host-to-device communication, achieves 38\%
+of the theoretical peak performance.  If we include communication, the
+performance drops to 32\% of the peak. Just like with the ATI
+hardware, this is caused by the low PCI-e bandwidth.  With NVIDIA
+hardware significant performance gains can be achieved by using asynchronous host-to-device I/O.
+
+
+\begin{table*}[t]
+\begin{center}
+%{\footnotesize % for normal layout
+{\scriptsize % for double spaced
+\begin{tabular}{l|l|l|l|l}
+Intel Core i7 920     & IBM Blue Gene/P          & ATI 4870                      & NVIDIA Tesla C1060     & STI  Cell/B.E.                      \\
+\hline
+ + well-known         &  + L2 prefetch unit      &  + largest number of cores    &  + random write access &  + power efficiency                 \\
+-- few registers      &  + high memory bandwidth &  + swizzling support          &  + Cuda is high-level  &  + random write access              \\
+-- no fma instruction &  + fast interconnects    & -- low PCI-e bandwidth        & -- low PCI-e bandwidth &  + shuffle capabilities             \\
+-- limited shuffling  & -- double precision only & -- transfer slows down kernel &                        &  + explicit cache (performance)     \\
+                      & -- expensive             & -- no random write access     &                        & -- explicit cache (programmability) \\
+                      &                          & -- bad programming support    &                        & -- multiple parallelism levels      \\
+\end{tabular}
+} %\small
+\end{center}
+\vspace{-0.5cm}
+\caption{Strengths and weaknesses of the different platforms for signal-processing applications.}
+\label{architecture-results-table}
+\end{table*}
+
+\noindent \\ \emph{The Cell Broadband Engine}
+
+\noindent With the
+\mbox{Cell/B.E.} it is important to exploit all levels of parallelism.
+Applications deal with task and data parallelism across multiple SPEs,
+vector parallelism inside the SPEs, and multi-buffering for
+asynchronous DMA transfers~\cite{cell}.  Acceptable performance can be achieved by
+programming the \mbox{Cell/B.E.}  in C or C++, while using intrinsics
+to manually express vector parallelism.  Thus, the programmer
+specifies which instructions have to be used, but can typically leave
+the instruction scheduling and register allocation to the compiler.
+
+A distinctive property of the architecture is that cache transfers are
+explicitly managed by the application, using DMA. This is unlike other 
+architectures, where caches work transparently.
+%% By dividing the
+%% integration time into smaller intervals, we can keep the sample data
+%% for \emph{all stations} in the local store.  
+%% Because of this, we have to load and store the correlations to main
+%% memory several times, since the sub-results have to
+%% be accumulated.  
+Communication can be overlapped with computation, by using multiple buffers.
+Although issuing explicit DMA commands complicates programming,
+we found that this usually is not problematic for signal-processing applications.
+Thanks to the explicit cache,
+the correlator implementation fetches each sample from main memory
+\emph{only exactly once}. 
+The large number of registers allows a big tile size of 
+$4\times3$, leading to a lot of data reuse.
+We exploit the vector parallelism of the \mbox{Cell/B.E.} by computing the four
+polarization combinations in parallel.  We found that this performs
+better than vectorizing over the integration time.  This is thanks to the \mbox{Cell/B.E.}'s
+excellent support for shuffling data around in the vector registers.
+%The shuffle instruction is executed
+%in the odd pipeline, while the arithmetic is executed in the even
+%pipeline, allowing them to overlap.
+Due to the high
+memory bandwidth and the ability to reuse data, we achieve 92\% of the peak
+performance on one chip.  If we use both chips in a cell blade, we still achieve
+91\%.  Even though the memory
+bandwidth per operation of the \mbox{Cell/B.E.} is eight times lower than
+that of the BG/P, we still achieve excellent performance, thanks to
+the high data reuse factor.
+
+
+\subsection{Comparison and Evaluation}
+\label{sec:perf-compare}
+
+Figure~\ref{performance-graph} shows the performance on all
+architectures we evaluated. The NVIDIA GPU achieves the highest
+\emph{absolute} performance. Nevertheless, the GPU \emph{efficiencies}
+are much lower than on the other platforms.  The \mbox{Cell/B.E.}
+achieves the highest efficiency of all many-core architectures, close
+to that of the BG/P. 
+Although the theoretical peak performance of the
+\mbox{Cell/B.E.} is 4.6 times lower than the NVIDIA chip, the absolute
+performance is only 1.6 times lower.
+ If both chips in the cell blade
+are used, the \mbox{Cell/B.E.} also has the highest absolute
+performance. For the GPUs, it is possible to use more than one chip as
+well, for instance with the ATI 4870x2 device. However, we found that this does not help, since the
+performance is already limited by the low PCI-e throughput, and the
+chips have to share this resource.
+In Table~\ref{architecture-results-table} we summarize the
+architectural strengths and weaknesses that we discussed.  
+
+%Although
+%we focus on the correlator application in this paper, the
+%results are applicable to signal processing applications in
+%general.
+
+%@@@ larrabee / lange vectoren
+
+\section{Programmability of the platforms}
+
+The performance gap between assembly and a high-level programming language 
+is quite different for the different platforms. It also
+depends on how much the compiler is helped by manually unrolling
+loops, eliminating common sub-expressions, the use of register variables,
+etc., up to a level that the C code becomes almost as low-level as assembly
+code. The difference varies between only a few percent to a factor of 10. 
+
+For the BG/P, the performance from compiled C++ code was far from
+sufficient. The assembly code is approximately 10 times faster.
+For both the \mbox{Cell/B.E.} and the Intel Core~i7, we found that
+high-level code in C or C++ in combination with the use of intrinsics
+to manually describe the SIMD parallelism yields acceptable
+performance compared to optimized assembly code.  Thus, the programmer
+specifies which instructions have to be used, but can typically leave
+the instruction scheduling and register allocation to the compiler.
+On NVIDIA hardware, the high-level Cuda model delivers excellent
+performance, as long as the programmer helps by using SIMD data types
+for loads and stores, and separate local variables for values that
+should be kept in registers. With ATI hardware, this is different.  We
+found that the high-level Brook+ model does not achieve acceptable
+performance compared to hand-written CAL code.  Manually written assembly 
+is more than three times faster. Also, the Brook+ documentation is insufficient.
+
+\longversion{
+\section{Applying the techniques: a case study with the Intel Larrabee}
+
+Intel recently disclosed some details about the upcoming Larrabee processor,
+a fully programmable GPU based on the well-known x86 instruction set.
+Although performance details are unknown, it is interesting to compare the
+Larrabee to the aforementioned architectures, and to see how a correlator
+should be implemented to obtain optimal performance.
+
+The processing power comes from Larrabee's relatively long vector size:
+a vector holds 16~elements, where the other architectures have vectors lengths
+of at most~4.
+The long vector size forces us to reconsider our parallelization strategy.
+There are several options to perform 16~simultaneous FMAs.
+One option is to operate on 16~samples with consecutive time stamps.
+A minor drawback is that the data must be ``horizontally'' added to integrate,
+but this can be done outside the main loop.
+Another option is to operate on samples from 16~consecutive frequencies.
+%% An advantage of this may be that the input is in the right order (i.e.,
+%% the 16~values can be read from consecutive memory locations) if a Poly-Phase
+%% Filter precedes the correlator: the FFT outputs consecutive frequencies into
+%% consecutive memory locations.
+%% Both 
+
+Another option is to correlate samples from different receivers as illustrated
+by Figure~\ref{fig-correlation}.
+This method minimizes memory loads, but requires additional shuffling of data.
+Unfortunately, the most efficient method can only be determined empirically,
+when the hardware is available.
+} % end of \longversion
+
+
+\section{Conclusions}
+\label{conclusions}
+Radio telescopes require large amounts of signal processing, 
+and have high computational and I/O demands.
+We presented general insights on how to use many-core
+platforms for signal-processing applications, looking at the aspects of
+performance, optimization and programmability.
+As an example, we evaluated the extremely
+data-intensive correlator algorithm on today's many-core
+architectures. 
+
+The many-core architectures have a significantly lower memory
+bandwidth \emph{per operation} compared to traditional architectures.
+This requires completely different algorithm implementation and optimization strategies:
+minimizing the number of memory loads per operation is of key
+importance to obtain good performance.  A high memory bandwidth per
+operation is not strictly necessary, as long as the architecture (and the
+algorithm) allows efficient data reuse.  This can be achieved through
+caches, shared memory, local stores and registers.  It is clear that
+application-level control of cache behavior (either through explicit
+DMA or thread synchronization) has a substantial performance benefit,
+and is of key importance for signal-processing
+applications.
+
+We demonstrated that the many-core architectures have very
+different performance characteristics, and require different
+implementation and optimization strategies.  The BG/P supercomputer
+achieves high efficiencies thanks to the high memory bandwidth per
+operation. The GPUs are unbalanced: they provide an enormous
+computational power, but have a relatively low bandwidth per
+operation, both internally and externally (between the host and the device).
+Because of this, many data-intensive signal-processing applications will
+achieve only a small fraction of the theoretical peak.
+The \mbox{Cell/B.E.} performs excellently on signal-processing
+applications, even though its memory bandwidth per operation is eight
+times lower than the BG/P.  Applications can exploit the
+application-managed cache and the large number of registers. For the
+correlator, this results in optimal reuse of all sample data.  Nevertheless,
+it is clear that, for signal-processing applications,
+the recent trend of increasing the number of cores will not work indefinitely if
+I/O is not scaled accordingly.
+
+
+
+\section*{Acknowledgments}
+This work was performed in the context of the NWO STARE
+AstroStream project.  We gratefully acknowledge NVIDIA, and in
+particular Dr. David Luebke, for providing freely some of the GPU
+cards used in this work. 
+
+\bibliographystyle{IEEEbib}
+
+\begin{small}
+\bibliography{spm}
+\end{small}
+
+\end{document}