46 files changed, 1693 insertions, 2051 deletions
diff --git a/utils/Makefile.am b/utils/Makefile.am
index df667655..3ea21835 100644
--- a/utils/Makefile.am
+++ b/utils/Makefile.am
@@ -1,26 +1,31 @@
 
-bin_PROGRAMS = reconstruct_weights
+bin_PROGRAMS = reconstruct_weights atools
 
-noinst_PROGRAMS = ts phmt
-TESTS = ts phmt
+noinst_PROGRAMS = ts phmt mfcr_test
+TESTS = ts phmt mfcr_test
 
 if HAVE_GTEST
 noinst_PROGRAMS += \
+  crp_test \
   dict_test \
+  m_test \
   weights_test \
   logval_test \
   small_vector_test
 
-TESTS += small_vector_test logval_test weights_test dict_test
+TESTS += crp_test small_vector_test logval_test weights_test dict_test m_test
 endif
 
 reconstruct_weights_SOURCES = reconstruct_weights.cc
 
+atools_SOURCES = atools.cc
+
 noinst_LIBRARIES = libutils.a
 
 libutils_a_SOURCES = \
   alignment_pharaoh.cc \
   b64tools.cc \
+  corpus_tools.cc \
   dict.cc \
   tdict.cc \
   fdict.cc \
@@ -38,10 +43,16 @@ endif
 
 phmt_SOURCES = phmt.cc
 ts_SOURCES = ts.cc
+m_test_SOURCES = m_test.cc
+m_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS)
 dict_test_SOURCES = dict_test.cc
 dict_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS)
+mfcr_test_SOURCES = mfcr_test.cc
+mfcr_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS)
 weights_test_SOURCES = weights_test.cc
 weights_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS)
+crp_test_SOURCES = crp_test.cc
+crp_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS)
 logval_test_SOURCES = logval_test.cc
 logval_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS)
 small_vector_test_SOURCES = small_vector_test.cc
diff --git a/utils/agenda.h b/utils/agenda.h
deleted file mode 100755
index d4f13696..00000000
--- a/utils/agenda.h
+++ /dev/null
@@ -1,140 +0,0 @@
-#ifndef AGENDA_H
-#define AGENDA_H
-
-#define DBG_AGENDA(x) x
-/*
-  a priority queue where you expect to queue the same item at different
-  priorities several times before finally popping it.  higher priority = better.
-  so in best first you'd be using negative cost or e^-cost (probabilities, in
-  other words).
-
-  this means you have a way to look up a key and see its location in the queue,
-  so its priority can be adjusted (or, simpler implementation: so when you pop,
-  you see if you've already popped before at a lower cost, and skip the
-  subsequent pops).
-
-  it's assumed that you'll never queue an item @ a better priority after it has
-  already been popped.  that is, the agenda will track already completed items.
-  maybe in the future i will let you recompute a cheaper way to reach things
-  after first-pop also, it's assumed that we're always improving prios of
-  existing items, never making them worse (even though technically this is
-  possible and sensible if it hasn't been popped yet).
-
-  simple binary max heap for now.  there are better practical options w/
-  superior cache locaility.  movements in the heap need to update a record for
-  that key of where the key went.  i do this by creating canonical key pointers
-  out of boost object pools (if the key were lightweight e.g. an int, then it
-  would make sense to use the hash lookup too
-
-  since i'm doing key hashing to start with, i also allow you to attach some
-  arbitrary data (value) payload beyond key+priority.
-
-  hash map from key to done (has been popped) -> set where doneness is marked in key item?
-
-  a slightly different way to make an adjustable heap would be to use
-  tree-structured parent/children links intrusively (or mapped by key) in the
-  key, rather than indices in a compact binary-tree heap
-
- */
-
-#include "best.h"
-#include "intern_pool.h"
-#include "d_ary_heap.h"
-#include "lvalue_pmap.h"
-#include <vector>
-#include <functional>
-
-/*
-template <class P>
-struct priority_traits {
-  typedef typename P::priority_type priority_type;
-};
-*/
-
-typedef best_t agenda_best_t;
-typedef unsigned agenda_location_t;
-
-PMAP_MEMBER_INDIRECT(LocationMap,agenda_location_t,location)
-PMAP_MEMBER_INDIRECT(PriorityMap,agenda_best_t,priority)
-
-struct Less {
-  typedef bool result_type;
-  template <class A,class B>
-  bool operator()(A const& a,B const& b) const { return a<b; }
-};
-
-// LocMap and PrioMap are boost property maps put(locmap,key,size_t), Better(get(priomap,k1),get(priomap,k2)) means k1 should be above k2 (be popped first).  Locmap and PrioMap may have state; the rest are assumed stateless functors
-// make sure the (default) location is not -1 for anything you add, or else an assertion may trigger
-template <class Item,class Better=Less, /* intern_pool args */ class KeyF=get_key<Item>,class HashKey=boost::hash<typename KeyF::result_type>,class EqKey=std::equal_to<typename KeyF::result_type>, class Pool=boost::object_pool<Item> >
-struct Agenda : intern_pool<Item,KeyF,HashKey,EqKey,Pool> {
-  typedef intern_pool<Item,KeyF,HashKey,EqKey,Pool> Intern; // inherited because I want to use construct()
-  /* this is less generic than it could be, because I want to use a single hash mapping to intern to canonical mutable object pointers, where the property maps are just lvalue accessors */
-  typedef typename KeyF::result_type Key;
-  typedef Item * Handle;
-  typedef LocationMap<Handle> LocMap;
-  typedef PriorityMap<Handle> PrioMap;
-  LocMap locmap;
-  PrioMap priomap; // note: priomap[item] is set by caller before giving us the item; then tracks best (for canonicalized item) thereafter
-
-  Better better;
-  //NOT NEEDED: initialize function object state (there is none)
-
-  typedef Item *ItemC; //canonicalized pointer
-  typedef Item *ItemP;
-  static const std::size_t heap_arity=4; // might be fastest possible (depends on key size probably - cache locality is bad w/ arity=2)
-  typedef std::vector<ItemC> HeapStorage;
-  typedef d_ary_heap_indirect<Handle,heap_arity,LocMap,PrioMap,Better,HeapStorage,agenda_location_t> Heap;
-  Heap q;
-
-  // please don't call q.push etc. directly.
-  void add(ItemP i) {
-    bool fresh=interneq(i);
-    DBG_AGENDA(assert(fresh && !q.contains(i)));
-    q.push(i);
-  }
-  bool improve(ItemP i) {
-    ItemP c=i;
-    bool fresh=interneq(c);
-    if (fresh) {
-      add(c);
-      return true;
-    }
-    DBG_AGENDA(assert(q.contains(c)));
-    return q.maybe_improve(priomap[i]);
-  }
-  inline bool empty() {
-    return q.empty();
-  }
-  // no need to destroy the canon. item because we want to remember the best cost and reject more expensive ways of using it).
-  ItemC pop() {
-    ItemC r=q.top();
-    q.pop();
-    return r;
-  }
-  void pop_discard() {
-    q.pop();
-  }
-
-  ItemC top() {
-    DBG_AGENDA(assert(!empty()));
-    return q.top();
-  }
-
-  agenda_best_t best() const {
-    return q.best(); //TODO: cache/track the global best?
-  }
-
-  agenda_best_t second_best() const {
-    return q.second_best();
-  }
-
-  // add only if worse than queue current best, otherwise evaluate immediately (e.g. for early stopping w/ expensive to compute additional cost).  return true if postponed (added)
-  bool postpone(ItemP i) {
-    if (better(priomap[i],best())) return false;
-    return improve(i);
-  }
-
-  Agenda(unsigned reserve=1000000,LocMap const& lm=LocMap(),PrioMap const& pm=PrioMap(),EqKey const& eq=EqKey(),Better const& better=Better()) : locmap(lm), priomap(pm), better(better), q(priomap,locmap,better,reserve) {  }
-};
-
-#endif
diff --git a/utils/atools.cc b/utils/atools.cc
new file mode 100644
index 00000000..c0a91731
--- /dev/null
+++ b/utils/atools.cc
@@ -0,0 +1,369 @@
+#include <iostream>
+#include <sstream>
+#include <vector>
+
+#include <queue>
+#include <map>
+#include <boost/program_options.hpp>
+#include <boost/shared_ptr.hpp>
+
+#include "filelib.h"
+#include "alignment_pharaoh.h"
+
+namespace po = boost::program_options;
+using namespace std;
+using boost::shared_ptr;
+
+struct Command {
+  virtual ~Command() {}
+  virtual string Name() const = 0;
+
+  // Output mode: 1 = print one alignment grid per input line [default],
+  // 2 = print nothing per line and call Summary() at the end [for AER, etc]
+  virtual int Result() const { return 1; }
+
+  virtual bool RequiresTwoOperands() const { return true; }
+  virtual void Apply(const Array2D<bool>& a, const Array2D<bool>& b, Array2D<bool>* x) = 0;
+  void EnsureSize(const Array2D<bool>& a, const Array2D<bool>& b, Array2D<bool>* x) {
+    x->resize(max(a.width(), b.width()), max(a.height(), b.height()));
+  }
+  static bool Safe(const Array2D<bool>& a, int i, int j) {
+    // Out-of-range coordinates read as "not aligned", so callers may probe
+    // neighbours and grids of different sizes without their own bounds checks.
+    const bool in_range = i >= 0 && j >= 0 && i < a.width() && j < a.height();
+    return in_range && a(i,j);
+  }
+  virtual void Summary() { assert(!"Summary should have been overridden"); }
+};
+
+// compute fmeasure, second alignment is reference, first is hyp
+struct FMeasureCommand : public Command {
+  FMeasureCommand() : matches(), num_predicted(), num_in_ref() {}
+  int Result() const { return 2; }  // statistics only: main() calls Summary() instead of printing grids
+  string Name() const { return "fmeasure"; }
+  bool RequiresTwoOperands() const { return true; }
+  void Apply(const Array2D<bool>& hyp, const Array2D<bool>& ref, Array2D<bool>* x) {
+    (void) x;   // f-measure only accumulates statistics; it produces no alignment
+    int i_len = ref.width();
+    int j_len = ref.height();
+    for (int i = 0; i < i_len; ++i) {
+      for (int j = 0; j < j_len; ++j) {
+        if (ref(i,j)) {
+          ++num_in_ref;
+          if (Safe(hyp, i, j)) ++matches;  // Safe(): hyp may be smaller than ref
+        }
+      }
+    }
+    for (int i = 0; i < hyp.width(); ++i)
+      for (int j = 0; j < hyp.height(); ++j)
+        if (hyp(i,j)) ++num_predicted;
+  }
+  void Summary() {
+    if (num_predicted == 0 || num_in_ref == 0) {
+      cerr << "Insufficient statistics to compute f-measure!\n";
+      abort();
+    }
+    const double prec = static_cast<double>(matches) / num_predicted;
+    const double rec = static_cast<double>(matches) / num_in_ref;
+    cout << "P: " << prec << endl;
+    cout << "R: " << rec << endl;
+    const double f = (matches > 0) ? (2.0 * prec * rec) / (rec + prec) : 0.0;  // avoid 0/0 (NaN) when nothing matches
+    cout << "F: " << f << endl;
+  }
+  int matches;        // links present in both hyp and ref, summed over all lines
+  int num_predicted;  // links present in hyp
+  int num_in_ref;     // links present in ref
+};
+
+struct DisplayCommand : public Command {  // unary: copies the grid to the output and also streams it to stdout
+  string Name() const { return "display"; }
+  bool RequiresTwoOperands() const { return false; }
+  void Apply(const Array2D<bool>& in, const Array2D<bool>&, Array2D<bool>* x) {
+    *x = in;
+    cout << *x << endl;  // printed in addition to what main() serializes for Result() == 1
+  }
+};
+
+struct ConvertCommand : public Command {  // unary identity; the default value of --command
+  string Name() const { return "convert"; }
+  bool RequiresTwoOperands() const { return false; }
+  void Apply(const Array2D<bool>& in, const Array2D<bool>&, Array2D<bool>* x) {
+    *x = in;  // output is an unmodified copy; main() re-serializes it
+  }
+};
+
+struct InvertCommand : public Command {
+  string Name() const { return "invert"; }
+  bool RequiresTwoOperands() const { return false; }
+  void Apply(const Array2D<bool>& in, const Array2D<bool>&, Array2D<bool>* x) {
+    const int out_w = in.height(), out_h = in.width();  // transposed dimensions
+    x->resize(out_w, out_h);
+    for (int u = 0; u < out_w; ++u)
+      for (int v = 0; v < out_h; ++v)
+        (*x)(u, v) = in(v, u);
+  }
+};
+
+struct IntersectCommand : public Command {
+  string Name() const { return "intersect"; }
+  bool RequiresTwoOperands() const { return true; }
+  void Apply(const Array2D<bool>& a, const Array2D<bool>& b, Array2D<bool>* x) {
+    EnsureSize(a, b, x);
+    Array2D<bool>& res = *x;
+    for (int i = 0; i < res.width(); ++i)      // cover the whole output (as UnionCommand does) so that
+      for (int j = 0; j < res.height(); ++j)   // cells outside a's extent are explicitly written as false
+        res(i, j) = Safe(a, i, j) && Safe(b, i, j);
+  }
+};
+
+struct UnionCommand : public Command {
+  string Name() const { return "union"; }
+  bool RequiresTwoOperands() const { return true; }
+  void Apply(const Array2D<bool>& a, const Array2D<bool>& b, Array2D<bool>* x) {
+    EnsureSize(a, b, x);  // output spans the larger extent in each dimension
+    const int w = x->width(), h = x->height();
+    for (int u = 0; u < w; ++u)
+      for (int v = 0; v < h; ++v)
+        (*x)(u, v) = Safe(a, u, v) || Safe(b, u, v);
+  }
+};
+
+struct RefineCommand : public Command {
+  RefineCommand() {
+    neighbors_.push_back(make_pair(1,0));   // the four axis-adjacent offsets;
+    neighbors_.push_back(make_pair(-1,0));  // subclasses may append more
+    neighbors_.push_back(make_pair(0,1));
+    neighbors_.push_back(make_pair(0,-1));
+  }
+  bool RequiresTwoOperands() const { return true; }
+
+  void Align(int i, int j) {  // add link (i,j) to the result and mark both indices as covered
+    res_(i, j) = true;
+    is_i_aligned_[i] = true;
+    is_j_aligned_[j] = true;
+  }
+
+  bool IsNeighborAligned(int i, int j) const {
+    for (unsigned k = 0; k < neighbors_.size(); ++k) {
+      const int di = neighbors_[k].first;
+      const int dj = neighbors_[k].second;
+      if (Safe(res_, i + di, j + dj))  // Safe(): offsets may step outside the grid
+        return true;
+    }
+    return false;
+  }
+
+  bool IsNeitherAligned(int i, int j) const {
+    return !(is_i_aligned_[i] || is_j_aligned_[j]);
+  }
+
+  bool IsOneOrBothUnaligned(int i, int j) const {
+    return !(is_i_aligned_[i] && is_j_aligned_[j]);
+  }
+
+  bool KoehnAligned(int i, int j) const {
+    return IsOneOrBothUnaligned(i, j) && IsNeighborAligned(i, j);
+  }
+
+  typedef bool (RefineCommand::*Predicate)(int i, int j) const;
+
+ protected:
+  void InitRefine(
+      const Array2D<bool>& a,
+      const Array2D<bool>& b) {
+    res_.clear();
+    EnsureSize(a, b, &res_);
+    in_.clear(); un_.clear(); is_i_aligned_.clear(); is_j_aligned_.clear();
+    EnsureSize(a, b, &in_);
+    EnsureSize(a, b, &un_);
+    is_i_aligned_.resize(res_.width(), false);
+    is_j_aligned_.resize(res_.height(), false);
+    for (int i = 0; i < in_.width(); ++i)
+      for (int j = 0; j < in_.height(); ++j) {
+        un_(i, j) = Safe(a, i, j) || Safe(b, i, j);
+        in_(i, j) = Safe(a, i, j) && Safe(b, i, j);
+        if (in_(i, j)) Align(i, j);  // the result starts out as the intersection
+    }
+  }
+  // "grow" the resulting alignment using the points in adds if they match the constraints
+  // determined by pred; idempotent = one pass over adds, otherwise repeat until nothing changes
+  void Grow(Predicate pred, bool idempotent, const Array2D<bool>& adds) {
+    if (idempotent) {
+      for (int i = 0; i < adds.width(); ++i)
+        for (int j = 0; j < adds.height(); ++j) {
+          if (adds(i, j) && !res_(i, j) &&
+              (this->*pred)(i, j)) Align(i, j);
+        }
+      return;
+    }
+    set<pair<int, int> > p;  // candidate points not yet in the result
+    for (int i = 0; i < adds.width(); ++i)
+      for (int j = 0; j < adds.height(); ++j)
+        if (adds(i, j) && !res_(i, j))
+          p.insert(make_pair(i, j));
+    bool keep_going = !p.empty();
+    while (keep_going) {
+      keep_going = false;
+      for (set<pair<int, int> >::iterator pi = p.begin();
+           pi != p.end(); ) {
+        if ((this->*pred)(pi->first, pi->second)) {
+          Align(pi->first, pi->second);
+          p.erase(pi++);  // advance before erasing: erase invalidates only the erased iterator
+          keep_going = true;
+        } else { ++pi; }
+      }
+    }
+  }
+  Array2D<bool> res_;  // refined alignment
+  Array2D<bool> in_;   // intersection alignment
+  Array2D<bool> un_;   // union alignment
+  vector<bool> is_i_aligned_;  // is_i_aligned_[i]: some link (i, *) is in res_
+  vector<bool> is_j_aligned_;  // is_j_aligned_[j]: some link (*, j) is in res_
+  vector<pair<int,int> > neighbors_;  // (di, dj) offsets probed by IsNeighborAligned
+};
+
+struct DiagCommand : public RefineCommand {  // extends the neighbourhood with the four diagonal offsets
+  DiagCommand() {
+    neighbors_.push_back(make_pair(1,1));   // appended after the axis offsets
+    neighbors_.push_back(make_pair(-1,1));  // added by RefineCommand's constructor
+    neighbors_.push_back(make_pair(1,-1));
+    neighbors_.push_back(make_pair(-1,-1));
+  }
+};
+
+struct GDCommand : public DiagCommand {
+  string Name() const { return "grow-diag"; }
+  void Apply(const Array2D<bool>& a, const Array2D<bool>& b, Array2D<bool>* x) {
+    InitRefine(a, b);  // res_ = intersection, un_ = union
+    Grow(&RefineCommand::KoehnAligned, false, un_);  // repeatedly add union points that pass KoehnAligned
+    *x = res_;
+  }
+};
+
+struct GDFCommand : public DiagCommand {
+  string Name() const { return "grow-diag-final"; }
+  void Apply(const Array2D<bool>& a, const Array2D<bool>& b, Array2D<bool>* x) {
+    InitRefine(a, b);  // res_ = intersection, un_ = union
+    Grow(&RefineCommand::KoehnAligned, false, un_);  // same growth step as grow-diag
+    Grow(&RefineCommand::IsOneOrBothUnaligned, true, a);  // final: one pass over a's links, then b's,
+    Grow(&RefineCommand::IsOneOrBothUnaligned, true, b);  // adding any whose i or j is still uncovered
+    *x = res_;
+  }
+};
+
+struct GDFACommand : public DiagCommand {
+  string Name() const { return "grow-diag-final-and"; }
+  void Apply(const Array2D<bool>& a, const Array2D<bool>& b, Array2D<bool>* x) {
+    InitRefine(a, b);  // res_ = intersection, un_ = union
+    Grow(&RefineCommand::KoehnAligned, false, un_);  // same growth step as grow-diag
+    Grow(&RefineCommand::IsNeitherAligned, true, a);  // final-and: one pass over a's links, then b's,
+    Grow(&RefineCommand::IsNeitherAligned, true, b);  // adding only those whose i and j are both uncovered
+    *x = res_;
+  }
+};
+
+map<string, boost::shared_ptr<Command> > commands;  // available operations keyed by Command::Name(); filled by AddCommand()
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+  po::options_description opts("Configuration options");
+  ostringstream os;
+  os << "Operation to perform:";
+  for (map<string, boost::shared_ptr<Command> >::iterator it = commands.begin();
+       it != commands.end(); ++it) {
+    os << ' ' << it->first;
+  }
+  string cstr = os.str();
+  opts.add_options()
+        ("input_1,i", po::value<string>(), "[REQUIRED] Alignment 1 file, - for STDIN")
+        ("input_2,j", po::value<string>(), "Alignment 2 file, - for STDIN")
+	("command,c", po::value<string>()->default_value("convert"), cstr.c_str())
+        ("help,h", "Print this help message and exit");
+  po::options_description clo("Command line options");
+  po::options_description dcmdline_options;
+  dcmdline_options.add(opts);
+
+  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+  po::notify(*conf);
+
+  if (conf->count("help") || conf->count("input_1") == 0 || conf->count("command") == 0) {
+    cerr << dcmdline_options << endl;
+    exit(1);
+  }
+  const string cmd = (*conf)["command"].as<string>();
+  if (commands.count(cmd) == 0) {
+    cerr << "Don't understand command: " << cmd << endl;
+    exit(1);
+  }
+  if (commands[cmd]->RequiresTwoOperands()) {
+    if (conf->count("input_2") == 0) {
+      cerr << "Command '" << cmd << "' requires two alignment files\n";
+      exit(1);
+    }
+    if ((*conf)["input_1"].as<string>() == "-" && (*conf)["input_2"].as<string>() == "-") {
+      cerr << "Both inputs cannot be STDIN\n";
+      exit(1);
+    }
+  } else {
+    if (conf->count("input_2") != 0) {
+      cerr << "Command '" << cmd << "' requires only one alignment file\n";
+      exit(1);
+    }
+  }
+}
+
+template<class C> static void AddCommand() {
+  boost::shared_ptr<Command> cmd(new C);  // owned from the start
+  commands[cmd->Name()] = cmd;            // registered under its own Name()
+}
+
+int main(int argc, char **argv) {
+  AddCommand<ConvertCommand>();
+  AddCommand<DisplayCommand>();
+  AddCommand<InvertCommand>();
+  AddCommand<IntersectCommand>();
+  AddCommand<UnionCommand>();
+  AddCommand<GDCommand>();
+  AddCommand<GDFCommand>();
+  AddCommand<GDFACommand>();
+  AddCommand<FMeasureCommand>();
+  po::variables_map conf;
+  InitCommandLine(argc, argv, &conf);  // exits on invalid usage, so the lookups below succeed
+  Command& cmd = *commands[conf["command"].as<string>()];
+  boost::shared_ptr<ReadFile> rf1(new ReadFile(conf["input_1"].as<string>()));
+  boost::shared_ptr<ReadFile> rf2;
+  if (cmd.RequiresTwoOperands())
+    rf2.reset(new ReadFile(conf["input_2"].as<string>()));
+  istream* in1 = rf1->stream();
+  istream* in2 = NULL;
+  if (rf2) in2 = rf2->stream();
+  while(*in1) {  // one alignment per line; both inputs are read in lock-step
+    string line1;
+    string line2;
+    getline(*in1, line1);
+    if (in2) {
+      getline(*in2, line2);
+      if ((*in1 && !*in2) || (*in2 && !*in1)) {
+        cerr << "Mismatched number of lines!\n";
+        exit(1);
+      }
+    }
+    if (line1.empty() && !*in1) break;  // end of input, not an empty alignment
+    boost::shared_ptr<Array2D<bool> > out(new Array2D<bool>);  // boost:: spelled out: plain shared_ptr clashes with std::
+    boost::shared_ptr<Array2D<bool> > a1 = AlignmentPharaoh::ReadPharaohAlignmentGrid(line1);
+    if (in2) {
+      boost::shared_ptr<Array2D<bool> > a2 = AlignmentPharaoh::ReadPharaohAlignmentGrid(line2);
+      cmd.Apply(*a1, *a2, out.get());
+    } else {
+      Array2D<bool> dummy;  // unary commands ignore their second operand
+      cmd.Apply(*a1, dummy, out.get());
+    }
+
+    if (cmd.Result() == 1) {
+      AlignmentPharaoh::SerializePharaohFormat(*out, &cout);
+    }
+  }
+  if (cmd.Result() == 2)
+    cmd.Summary();
+  return 0;
+}
+
diff --git a/utils/batched_append.h b/utils/batched_append.h
index fe4a12fc..fe4a12fc 100755..100644
--- a/utils/batched_append.h
+++ b/utils/batched_append.h
diff --git a/utils/best.h b/utils/best.h
deleted file mode 100755
index ed15e0be..00000000
--- a/utils/best.h
+++ /dev/null
@@ -1,32 +0,0 @@
-#ifndef UTILS__BEST_H
-#define UTILS__BEST_H
-
-#include "max_plus.h"
-
-typedef MaxPlus<double> best_t;
-
-inline bool better(best_t const& a,best_t const& b) {
-  return a.v_>b.v_; // intentionally reversed, so default min-heap, sort, etc. put best first.
-}
-
-inline bool operator <(best_t const& a,best_t const& b) {
-  return a.v_>b.v_; // intentionally reversed, so default min-heap, sort, etc. put best first.
-}
-struct BetterP {
-  inline bool operator ()(best_t const& a,best_t const& b) const {
-    return a.v_>b.v_; // intentionally reversed, so default min-heap, sort, etc. put best first.
-  }
-};
-
-inline void maybe_improve(best_t &a,best_t const& b) {
-  if (a.v_>b.v_)
-    a.v_=b.v_;
-}
-
-template <class O>
-inline void maybe_improve(best_t &a,O const& b) {
-  if (a.v_>b.v_)
-    a.v_=b.v_;
-}
-
-#endif
diff --git a/utils/ccrp.h b/utils/ccrp.h
new file mode 100644
index 00000000..4a8b80e7
--- /dev/null
+++ b/utils/ccrp.h
@@ -0,0 +1,309 @@
+#ifndef _CCRP_H_
+#define _CCRP_H_
+
+#include <numeric>
+#include <cassert>
+#include <cmath>
+#include <list>
+#include <iostream>
+#include <vector>
+#include <tr1/unordered_map>
+#include <boost/functional/hash.hpp>
+#include "sampler.h"
+#include "slice_sampler.h"
+#include "m.h"
+
+// Chinese restaurant process (Pitman-Yor parameters) with table tracking.
+
+template <typename Dish, typename DishHash = boost::hash<Dish> >
+class CCRP {
+ public:
+  CCRP(double disc, double strength) :  // fixed hyperparameters: prior fields are NaN ("no prior")
+      num_tables_(),
+      num_customers_(),
+      discount_(disc),
+      strength_(strength),
+      discount_prior_strength_(std::numeric_limits<double>::quiet_NaN()),
+      discount_prior_beta_(std::numeric_limits<double>::quiet_NaN()),
+      strength_prior_shape_(std::numeric_limits<double>::quiet_NaN()),
+      strength_prior_rate_(std::numeric_limits<double>::quiet_NaN()) {
+    check_hyperparameters();
+  }
+
+  CCRP(double d_strength, double d_beta, double c_shape, double c_rate, double d = 0.9, double c = 1.0) :
+      num_tables_(),
+      num_customers_(),
+      discount_(d),
+      strength_(c),
+      discount_prior_strength_(d_strength),
+      discount_prior_beta_(d_beta),
+      strength_prior_shape_(c_shape),
+      strength_prior_rate_(c_rate) {
+    check_hyperparameters();
+  }
+
+  void check_hyperparameters() {  // aborts unless 0 <= discount < 1 and strength > -discount
+    if (discount_ < 0.0 || discount_ >= 1.0) {
+      std::cerr << "Bad discount: " << discount_ << std::endl;
+      abort();
+    }
+    if (strength_ <= -discount_) {
+      std::cerr << "Bad strength: " << strength_ << " (discount=" << discount_ << ")" << std::endl;
+      abort();
+    }
+  }
+
+  double discount() const { return discount_; }
+  double strength() const { return strength_; }
+  void set_discount(double d) { discount_ = d; check_hyperparameters(); }
+  void set_strength(double a) { strength_ = a; check_hyperparameters(); }
+
+  bool has_discount_prior() const {
+    return !std::isnan(discount_prior_strength_);
+  }
+
+  bool has_strength_prior() const {
+    return !std::isnan(strength_prior_shape_);
+  }
+
+  void clear() {  // forgets all seating; hyperparameters are kept
+    num_tables_ = 0;
+    num_customers_ = 0;
+    dish_locs_.clear();
+  }
+
+  unsigned num_tables() const {
+    return num_tables_;
+  }
+
+  unsigned num_tables(const Dish& dish) const {
+    const typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.find(dish);
+    if (it == dish_locs_.end()) return 0;
+    return it->second.table_counts_.size();
+  }
+
+  unsigned num_customers() const {
+    return num_customers_;
+  }
+
+  unsigned num_customers(const Dish& dish) const {
+    const typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.find(dish);
+    if (it == dish_locs_.end()) return 0;
+    return it->second.total_dish_count_;  // the count lives in the mapped DishLocations, not in the pair
+  }
+
+  // returns +1 or 0 indicating whether a new table was opened
+  template <typename T>
+  int increment(const Dish& dish, const T& p0, MT19937* rng) {
+    DishLocations& loc = dish_locs_[dish];
+    bool share_table = false;
+    if (loc.total_dish_count_) {  // a first customer for a dish always opens a table
+      const T p_empty = T(strength_ + num_tables_ * discount_) * p0;
+      const T p_share = T(loc.total_dish_count_ - loc.table_counts_.size() * discount_);
+      share_table = rng->SelectSample(p_empty, p_share);
+    }
+    if (share_table) {
+      double r = rng->next() * (loc.total_dish_count_ - loc.table_counts_.size() * discount_);
+      for (typename std::list<unsigned>::iterator ti = loc.table_counts_.begin();
+           ti != loc.table_counts_.end(); ++ti) {
+        r -= (*ti - discount_);  // each table is weighted by (its count - discount)
+        if (r <= 0.0) {
+          ++(*ti);
+          break;
+        }
+      }
+      if (r > 0.0) {
+        std::cerr << "Serious error: r=" << r << std::endl;
+        Print(&std::cerr);
+        assert(r <= 0.0);
+      }
+    } else {
+      loc.table_counts_.push_back(1u);
+      ++num_tables_;
+    }
+    ++loc.total_dish_count_;
+    ++num_customers_;
+    return (share_table ? 0 : 1);
+  }
+
+  // returns -1 or 0, indicating whether a table was closed
+  int decrement(const Dish& dish, MT19937* rng) {
+    DishLocations& loc = dish_locs_[dish];
+    assert(loc.total_dish_count_);
+    if (loc.total_dish_count_ == 1) {  // last customer of this dish: drop the dish entry entirely
+      dish_locs_.erase(dish);
+      --num_tables_;
+      --num_customers_;
+      return -1;
+    } else {
+      int delta = 0;
+      // sample customer to remove UNIFORMLY. that is, do NOT use the discount
+      // here. if you do, it will introduce (unwanted) bias!
+      double r = rng->next() * loc.total_dish_count_;
+      --loc.total_dish_count_;
+      for (typename std::list<unsigned>::iterator ti = loc.table_counts_.begin();
+           ti != loc.table_counts_.end(); ++ti) {
+        r -= *ti;
+        if (r <= 0.0) {
+          if ((--(*ti)) == 0) {
+            --num_tables_;
+            delta = -1;
+            loc.table_counts_.erase(ti);  // safe: the loop breaks right after, ti is not reused
+          }
+          break;
+        }
+      }
+      if (r > 0.0) {
+        std::cerr << "Serious error: r=" << r << std::endl;
+        Print(&std::cerr);
+        assert(r <= 0.0);
+      }
+      --num_customers_;
+      return delta;
+    }
+  }
+
+  template <typename T>
+  T prob(const Dish& dish, const T& p0) const {  // predictive probability of dish given base probability p0
+    const typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.find(dish);
+    const T r = T(num_tables_ * discount_ + strength_);
+    if (it == dish_locs_.end()) {
+      return r * p0 / T(num_customers_ + strength_);
+    } else {
+      return (T(it->second.total_dish_count_ - discount_ * it->second.table_counts_.size()) + r * p0) /
+               T(num_customers_ + strength_);
+    }
+  }
+
+  double log_crp_prob() const {
+    return log_crp_prob(discount_, strength_);
+  }
+
+  // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process
+  // does not include P_0's
+  double log_crp_prob(const double& discount, const double& strength) const {
+    double lp = 0.0;
+    if (has_discount_prior())
+      lp = Md::log_beta_density(discount, discount_prior_strength_, discount_prior_beta_);
+    if (has_strength_prior())
+      lp += Md::log_gamma_density(strength + discount, strength_prior_shape_, strength_prior_rate_);
+    assert(lp <= 0.0);  // NOTE(review): a log *density* can be positive; confirm this bound is intended
+    if (num_customers_) {
+      if (discount > 0.0) {
+        const double r = lgamma(1.0 - discount);
+        if (strength)
+          lp += lgamma(strength) - lgamma(strength / discount);
+        lp += - lgamma(strength + num_customers_)
+             + num_tables_ * log(discount) + lgamma(strength / discount + num_tables_);
+        assert(std::isfinite(lp));
+        for (typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.begin();
+             it != dish_locs_.end(); ++it) {
+          const DishLocations& cur = it->second;
+          for (std::list<unsigned>::const_iterator ti = cur.table_counts_.begin(); ti != cur.table_counts_.end(); ++ti) {
+            lp += lgamma(*ti - discount) - r;
+          }
+        }
+      } else if (!discount) { // discount == 0.0
+        lp += lgamma(strength) + num_tables_ * log(strength) - lgamma(strength + num_tables_);
+        assert(std::isfinite(lp));
+        for (typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.begin();
+             it != dish_locs_.end(); ++it) {
+          const DishLocations& cur = it->second;
+          lp += lgamma(cur.table_counts_.size());
+        }
+      } else {
+        assert(!"discount less than 0 detected!");
+      }
+    }
+    assert(std::isfinite(lp));
+    return lp;
+  }
+
+  void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) {
+    assert(has_discount_prior() || has_strength_prior());
+    if (num_customers() == 0) return;
+    DiscountResampler dr(*this);
+    StrengthResampler sr(*this);
+    for (unsigned iter = 0; iter < nloop; ++iter) {
+      if (has_strength_prior()) {
+        strength_ = slice_sampler1d(sr, strength_, *rng, -discount_ + std::numeric_limits<double>::min(),
+                               std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
+      }
+      if (has_discount_prior()) {
+        double min_discount = std::numeric_limits<double>::min();
+        if (strength_ < 0.0) min_discount -= strength_;  // keeps strength > -discount
+        discount_ = slice_sampler1d(dr, discount_, *rng, min_discount,
+                               1.0, 0.0, niterations, 100*niterations);
+      }
+    }
+    strength_ = slice_sampler1d(sr, strength_, *rng, -discount_ + std::numeric_limits<double>::min(),
+                             std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
+  }
+
+  struct DiscountResampler {  // functor handed to slice_sampler1d: log prob as a function of the discount
+    DiscountResampler(const CCRP& crp) : crp_(crp) {}
+    const CCRP& crp_;
+    double operator()(const double& proposed_discount) const {
+      return crp_.log_crp_prob(proposed_discount, crp_.strength_);
+    }
+  };
+
+  struct StrengthResampler {  // functor handed to slice_sampler1d: log prob as a function of the strength
+    StrengthResampler(const CCRP& crp) : crp_(crp) {}
+    const CCRP& crp_;
+    double operator()(const double& proposed_strength) const {
+      return crp_.log_crp_prob(crp_.discount_, proposed_strength);
+    }
+  };
+
+  struct DishLocations {
+    DishLocations() : total_dish_count_() {}
+    unsigned total_dish_count_;        // customers at all tables with this dish
+    std::list<unsigned> table_counts_; // list<> gives O(1) deletion and insertion, which we want
+                                       // .size() is the number of tables for this dish
+  };
+
+  void Print(std::ostream* out) const {
+    (*out) << "PYP(d=" << discount_ << ",c=" << strength_ << ") customers=" << num_customers_ << std::endl;
+    for (typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.begin();
+         it != dish_locs_.end(); ++it) {
+      (*out) << it->first << " (" << it->second.total_dish_count_ << " on " << it->second.table_counts_.size() << " tables): ";
+      for (typename std::list<unsigned>::const_iterator i = it->second.table_counts_.begin();
+           i != it->second.table_counts_.end(); ++i) {
+        (*out) << " " << *i;
+      }
+      (*out) << std::endl;
+    }
+  }
+
+  typedef typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator const_iterator;
+  const_iterator begin() const {
+    return dish_locs_.begin();
+  }
+  const_iterator end() const {
+    return dish_locs_.end();
+  }
+
+  unsigned num_tables_;     // tables over all dishes
+  unsigned num_customers_;  // customers over all dishes
+  std::tr1::unordered_map<Dish, DishLocations, DishHash> dish_locs_;
+
+  double discount_;
+  double strength_;
+
+  // optional beta prior on discount_ (NaN if no prior)
+  double discount_prior_strength_;
+  double discount_prior_beta_;
+
+  // optional gamma prior on strength_ (NaN if no prior)
+  double strength_prior_shape_;
+  double strength_prior_rate_;
+};
+
+template <typename T,typename H>
+std::ostream& operator<<(std::ostream& o, const CCRP<T,H>& c) {  // stream insertion delegates to CCRP::Print
+  c.Print(&o);
+  return o;
+}
+
+#endif
diff --git a/utils/ccrp_nt.h b/utils/ccrp_nt.h
index 63b6f4c2..6efbfc78 100644
--- a/utils/ccrp_nt.h
+++ b/utils/ccrp_nt.h
@@ -11,6 +11,7 @@
 #include <boost/functional/hash.hpp>
 #include "sampler.h"
 #include "slice_sampler.h"
+#include "m.h"
 
 // Chinese restaurant process (1 parameter)
 template <typename Dish, typename DishHash = boost::hash<Dish> >
@@ -18,20 +19,21 @@ class CCRP_NoTable {
  public:
   explicit CCRP_NoTable(double conc) :
     num_customers_(),
-    concentration_(conc),
-    concentration_prior_shape_(std::numeric_limits<double>::quiet_NaN()),
-    concentration_prior_rate_(std::numeric_limits<double>::quiet_NaN()) {}
+    alpha_(conc),
+    alpha_prior_shape_(std::numeric_limits<double>::quiet_NaN()),
+    alpha_prior_rate_(std::numeric_limits<double>::quiet_NaN()) {}
 
   CCRP_NoTable(double c_shape, double c_rate, double c = 10.0) :
     num_customers_(),
-    concentration_(c),
-    concentration_prior_shape_(c_shape),
-    concentration_prior_rate_(c_rate) {}
+    alpha_(c),
+    alpha_prior_shape_(c_shape),
+    alpha_prior_rate_(c_rate) {}
 
-  double concentration() const { return concentration_; }
+  double alpha() const { return alpha_; }
+  void set_alpha(const double& alpha) { alpha_ = alpha; assert(alpha_ > 0.0); }
 
-  bool has_concentration_prior() const {
-    return !std::isnan(concentration_prior_shape_);
+  bool has_alpha_prior() const {
+    return !std::isnan(alpha_prior_shape_);
   }
 
   void clear() {
@@ -71,38 +73,31 @@ class CCRP_NoTable {
     return table_diff;
   }
 
-  double prob(const Dish& dish, const double& p0) const {
+  template <typename F>
+  F prob(const Dish& dish, const F& p0) const {
     const unsigned at_table = num_customers(dish);
-    return (at_table + p0 * concentration_) / (num_customers_ + concentration_);
+    return (F(at_table) + p0 * F(alpha_)) / F(num_customers_ + alpha_);
   }
 
   double logprob(const Dish& dish, const double& logp0) const {
     const unsigned at_table = num_customers(dish);
-    return log(at_table + exp(logp0 + log(concentration_))) - log(num_customers_ + concentration_);
+    return log(at_table + exp(logp0 + log(alpha_))) - log(num_customers_ + alpha_);
   }
 
   double log_crp_prob() const {
-    return log_crp_prob(concentration_);
-  }
-
-  static double log_gamma_density(const double& x, const double& shape, const double& rate) {
-    assert(x >= 0.0);
-    assert(shape > 0.0);
-    assert(rate > 0.0);
-    const double lp = (shape-1)*log(x) - shape*log(rate) - x/rate - lgamma(shape);
-    return lp;
+    return log_crp_prob(alpha_);
   }
 
   // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process
   // does not include P_0's
-  double log_crp_prob(const double& concentration) const {
+  double log_crp_prob(const double& alpha) const {
     double lp = 0.0;
-    if (has_concentration_prior())
-      lp += log_gamma_density(concentration, concentration_prior_shape_, concentration_prior_rate_);
+    if (has_alpha_prior())
+      lp += Md::log_gamma_density(alpha, alpha_prior_shape_, alpha_prior_rate_);
     assert(lp <= 0.0);
     if (num_customers_) {
-      lp += lgamma(concentration) - lgamma(concentration + num_customers_) +
-        custs_.size() * log(concentration);
+      lp += lgamma(alpha) - lgamma(alpha + num_customers_) +
+        custs_.size() * log(alpha);
       assert(std::isfinite(lp));
       for (typename std::tr1::unordered_map<Dish, unsigned, DishHash>::const_iterator it = custs_.begin();
              it != custs_.end(); ++it) {
@@ -114,10 +109,10 @@ class CCRP_NoTable {
   }
 
   void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) {
-    assert(has_concentration_prior());
+    assert(has_alpha_prior());
     ConcentrationResampler cr(*this);
     for (int iter = 0; iter < nloop; ++iter) {
-        concentration_ = slice_sampler1d(cr, concentration_, *rng, 0.0,
+        alpha_ = slice_sampler1d(cr, alpha_, *rng, 0.0,
                                std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
     }
   }
@@ -125,13 +120,13 @@ class CCRP_NoTable {
   struct ConcentrationResampler {
     ConcentrationResampler(const CCRP_NoTable& crp) : crp_(crp) {}
     const CCRP_NoTable& crp_;
-    double operator()(const double& proposed_concentration) const {
-      return crp_.log_crp_prob(proposed_concentration);
+    double operator()(const double& proposed_alpha) const {
+      return crp_.log_crp_prob(proposed_alpha);
     }
   };
 
   void Print(std::ostream* out) const {
-    (*out) << "DP(alpha=" << concentration_ << ") customers=" << num_customers_ << std::endl;
+    (*out) << "DP(alpha=" << alpha_ << ") customers=" << num_customers_ << std::endl;
     int cc = 0;
     for (typename std::tr1::unordered_map<Dish, unsigned, DishHash>::const_iterator it = custs_.begin();
          it != custs_.end(); ++it) {
@@ -153,11 +148,11 @@ class CCRP_NoTable {
     return custs_.end();
   }
 
-  double concentration_;
+  double alpha_;
 
-  // optional gamma prior on concentration_ (NaN if no prior)
-  double concentration_prior_shape_;
-  double concentration_prior_rate_;
+  // optional gamma prior on alpha_ (NaN if no prior)
+  double alpha_prior_shape_;
+  double alpha_prior_rate_;
 };
 
 template <typename T,typename H>
diff --git a/utils/ccrp_onetable.h b/utils/ccrp_onetable.h
index a868af9a..1fe01b0e 100644
--- a/utils/ccrp_onetable.h
+++ b/utils/ccrp_onetable.h
@@ -21,33 +21,33 @@ class CCRP_OneTable {
     num_tables_(),
     num_customers_(),
     discount_(disc),
-    concentration_(conc),
+    alpha_(conc),
     discount_prior_alpha_(std::numeric_limits<double>::quiet_NaN()),
     discount_prior_beta_(std::numeric_limits<double>::quiet_NaN()),
-    concentration_prior_shape_(std::numeric_limits<double>::quiet_NaN()),
-    concentration_prior_rate_(std::numeric_limits<double>::quiet_NaN()) {}
+    alpha_prior_shape_(std::numeric_limits<double>::quiet_NaN()),
+    alpha_prior_rate_(std::numeric_limits<double>::quiet_NaN()) {}
 
   CCRP_OneTable(double d_alpha, double d_beta, double c_shape, double c_rate, double d = 0.9, double c = 1.0) :
     num_tables_(),
     num_customers_(),
     discount_(d),
-    concentration_(c),
+    alpha_(c),
     discount_prior_alpha_(d_alpha),
     discount_prior_beta_(d_beta),
-    concentration_prior_shape_(c_shape),
-    concentration_prior_rate_(c_rate) {}
+    alpha_prior_shape_(c_shape),
+    alpha_prior_rate_(c_rate) {}
 
   double discount() const { return discount_; }
-  double concentration() const { return concentration_; }
-  void set_concentration(double c) { concentration_ = c; }
+  double alpha() const { return alpha_; }
+  void set_alpha(double c) { alpha_ = c; }
   void set_discount(double d) { discount_ = d; }
 
   bool has_discount_prior() const {
     return !std::isnan(discount_prior_alpha_);
   }
 
-  bool has_concentration_prior() const {
-    return !std::isnan(concentration_prior_shape_);
+  bool has_alpha_prior() const {
+    return !std::isnan(alpha_prior_shape_);
   }
 
   void clear() {
@@ -108,17 +108,29 @@ class CCRP_OneTable {
 
   double prob(const Dish& dish, const double& p0) const {
     const typename DishMapType::const_iterator it = dish_counts_.find(dish);
-    const double r = num_tables_ * discount_ + concentration_;
+    const double r = num_tables_ * discount_ + alpha_;
     if (it == dish_counts_.end()) {
-      return r * p0 / (num_customers_ + concentration_);
+      return r * p0 / (num_customers_ + alpha_);
     } else {
       return (it->second - discount_ + r * p0) /
-               (num_customers_ + concentration_);
+               (num_customers_ + alpha_);
+    }
+  }
+
+  template <typename T>
+  T probT(const Dish& dish, const T& p0) const {  // same formula as prob() above, evaluated in numeric type T
+    const typename DishMapType::const_iterator it = dish_counts_.find(dish);
+    const T r(num_tables_ * discount_ + alpha_);  // computed in double, then converted to T
+    if (it == dish_counts_.end()) {
+      return r * p0 / T(num_customers_ + alpha_);
+    } else {
+      return (T(it->second - discount_) + r * p0) /
+               T(num_customers_ + alpha_);
     }
   }
 
   double log_crp_prob() const {
-    return log_crp_prob(discount_, concentration_);
+    return log_crp_prob(discount_, alpha_);
   }
 
   static double log_beta_density(const double& x, const double& alpha, const double& beta) {
@@ -140,19 +152,19 @@ class CCRP_OneTable {
 
   // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process
   // does not include P_0's
-  double log_crp_prob(const double& discount, const double& concentration) const {
+  double log_crp_prob(const double& discount, const double& alpha) const {
     double lp = 0.0;
     if (has_discount_prior())
       lp = log_beta_density(discount, discount_prior_alpha_, discount_prior_beta_);
-    if (has_concentration_prior())
-      lp += log_gamma_density(concentration, concentration_prior_shape_, concentration_prior_rate_);
+    if (has_alpha_prior())
+      lp += log_gamma_density(alpha, alpha_prior_shape_, alpha_prior_rate_);
     assert(lp <= 0.0);
     if (num_customers_) {
       if (discount > 0.0) {
         const double r = lgamma(1.0 - discount);
-        lp += lgamma(concentration) - lgamma(concentration + num_customers_)
-             + num_tables_ * log(discount) + lgamma(concentration / discount + num_tables_)
-             - lgamma(concentration / discount);
+        lp += lgamma(alpha) - lgamma(alpha + num_customers_)
+             + num_tables_ * log(discount) + lgamma(alpha / discount + num_tables_)
+             - lgamma(alpha / discount);
         assert(std::isfinite(lp));
         for (typename DishMapType::const_iterator it = dish_counts_.begin();
              it != dish_counts_.end(); ++it) {
@@ -168,12 +180,12 @@ class CCRP_OneTable {
   }
 
   void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) {
-    assert(has_discount_prior() || has_concentration_prior());
+    assert(has_discount_prior() || has_alpha_prior());
     DiscountResampler dr(*this);
     ConcentrationResampler cr(*this);
     for (int iter = 0; iter < nloop; ++iter) {
-      if (has_concentration_prior()) {
-        concentration_ = slice_sampler1d(cr, concentration_, *rng, 0.0,
+      if (has_alpha_prior()) {
+        alpha_ = slice_sampler1d(cr, alpha_, *rng, 0.0,
                                std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
       }
       if (has_discount_prior()) {
@@ -181,7 +193,7 @@ class CCRP_OneTable {
                                1.0, 0.0, niterations, 100*niterations);
       }
     }
-    concentration_ = slice_sampler1d(cr, concentration_, *rng, 0.0,
+    alpha_ = slice_sampler1d(cr, alpha_, *rng, 0.0,
                              std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
   }
 
@@ -189,20 +201,20 @@ class CCRP_OneTable {
     DiscountResampler(const CCRP_OneTable& crp) : crp_(crp) {}
     const CCRP_OneTable& crp_;
     double operator()(const double& proposed_discount) const {
-      return crp_.log_crp_prob(proposed_discount, crp_.concentration_);
+      return crp_.log_crp_prob(proposed_discount, crp_.alpha_);
     }
   };
 
   struct ConcentrationResampler {
     ConcentrationResampler(const CCRP_OneTable& crp) : crp_(crp) {}
     const CCRP_OneTable& crp_;
-    double operator()(const double& proposed_concentration) const {
-      return crp_.log_crp_prob(crp_.discount_, proposed_concentration);
+    double operator()(const double& proposed_alpha) const {
+      return crp_.log_crp_prob(crp_.discount_, proposed_alpha);
     }
   };
 
   void Print(std::ostream* out) const {
-    (*out) << "PYP(d=" << discount_ << ",c=" << concentration_ << ") customers=" << num_customers_ << std::endl;
+    (*out) << "PYP(d=" << discount_ << ",c=" << alpha_ << ") customers=" << num_customers_ << std::endl;
     for (typename DishMapType::const_iterator it = dish_counts_.begin(); it != dish_counts_.end(); ++it) {
       (*out) << "  " << it->first << " = " << it->second << std::endl;
     }
@@ -221,15 +233,15 @@ class CCRP_OneTable {
   DishMapType dish_counts_;
 
   double discount_;
-  double concentration_;
+  double alpha_;
 
   // optional beta prior on discount_ (NaN if no prior)
   double discount_prior_alpha_;
   double discount_prior_beta_;
 
-  // optional gamma prior on concentration_ (NaN if no prior)
-  double concentration_prior_shape_;
-  double concentration_prior_rate_;
+  // optional gamma prior on alpha_ (NaN if no prior)
+  double alpha_prior_shape_;
+  double alpha_prior_rate_;
 };
 
 template <typename T,typename H>
diff --git a/utils/corpus_tools.cc b/utils/corpus_tools.cc
new file mode 100644
index 00000000..d17785af
--- /dev/null
+++ b/utils/corpus_tools.cc
@@ -0,0 +1,77 @@
+#include "corpus_tools.h"
+
+#include <cassert>
+#include <cstdlib>
+#include <iostream>
+
+#include "tdict.h"
+#include "filelib.h"
+#include "verbose.h"
+
+using namespace std;
+
+// Reads a corpus with one sentence, or one "src ||| trg" sentence pair, per line.
+//   src, trg             word-id sentences of the lines this reader keeps; pass
+//                        trg == NULL for single-field input.  src may be NULL
+//                        when only the vocabulary is wanted.
+//   src_vocab, trg_vocab optional; collect the word ids of every line, including
+//                        lines that are skipped because of rank/size
+//   rank, size           the line with 0-based index i is kept iff i % size == rank
+// Every non-NULL output is cleared first.  A malformed line prints a diagnostic
+// to stderr and abort()s.
+void CorpusTools::ReadFromFile(const string& filename,
+                           vector<vector<WordID> >* src,
+                           set<WordID>* src_vocab,
+                           vector<vector<WordID> >* trg,
+                           set<WordID>* trg_vocab,
+                           int rank,
+                           int size) {
+  assert(rank >= 0);
+  assert(size > 0);
+  assert(rank < size);
+  if (src) src->clear();
+  if (src_vocab) src_vocab->clear();
+  if (trg) trg->clear();
+  if (trg_vocab) trg_vocab->clear();
+  const int expected_fields = 1 + (trg == NULL ? 0 : 1);
+  if (!SILENT) cerr << "Reading from " << filename << " ...\n";
+  ReadFile rf(filename);
+  istream& in = *rf.stream();
+  string line;
+  int lc = 0;
+  static const WordID kDIV = TD::Convert("|||");
+  vector<WordID> tmp;
+  while(getline(in, line)) {
+    const bool skip = (lc % size != rank);
+    ++lc;
+    TD::ConvertSentence(line, &tmp);
+    vector<WordID>* d = NULL;
+    // src is treated as optional by the clear() guard above, so honor that here too
+    if (!skip && src) {
+      src->push_back(vector<WordID>());
+      d = &src->back();
+    }
+    set<WordID>* v = src_vocab;
+    int s = 0;
+    for (unsigned i = 0; i < tmp.size(); ++i) {
+      if (tmp[i] == kDIV) {
+        ++s;
+        // reject a second separator, or any separator when no target side was
+        // requested, before trg is dereferenced (an assert vanishes under NDEBUG)
+        if (s > 1 || !trg) { cerr << "Unexpected format in line " << lc << ": " << line << endl; abort(); }
+        if (!skip) {
+          trg->push_back(vector<WordID>());
+          d = &trg->back();
+        }
+        v = trg_vocab;
+      } else {
+        if (d) d->push_back(tmp[i]);
+        if (v) v->insert(tmp[i]);
+      }
+    }
+    ++s;
+    if (expected_fields != s) {
+      cerr << "Wrong number of fields in line " << lc << ": " << line << endl; abort();
+    }
+  }
+}
diff --git a/utils/corpus_tools.h b/utils/corpus_tools.h
new file mode 100644
index 00000000..97bdaa94
--- /dev/null
+++ b/utils/corpus_tools.h
@@ -0,0 +1,26 @@
+#ifndef _CORPUS_TOOLS_H_
+#define _CORPUS_TOOLS_H_
+
+#include <cstddef>
+#include <string>
+#include <set>
+#include <vector>
+#include "wordid.h"
+
+// Static helpers for loading line-oriented text corpora as word-id sentences.
+struct CorpusTools {
+  // Reads `filename`, one sentence (or "source ||| target" pair when trg is
+  // non-NULL) per line.  Sentences of the lines whose 0-based index i satisfies
+  // i % size == rank are appended to *src / *trg; word ids of all lines are
+  // inserted into *src_vocab / *trg_vocab when those are non-NULL.  Outputs are
+  // cleared first; a line with the wrong number of fields aborts the program.
+  static void ReadFromFile(const std::string& filename,
+                           std::vector<std::vector<WordID> >* src,
+                           std::set<WordID>* src_vocab = NULL,
+                           std::vector<std::vector<WordID> >* trg = NULL,
+                           std::set<WordID>* trg_vocab = NULL,
+                           int rank = 0,
+                           int size = 1);
+};
+
+#endif
diff --git a/utils/crp_test.cc b/utils/crp_test.cc
new file mode 100644
index 00000000..561cd4dd
--- /dev/null
+++ b/utils/crp_test.cc
@@ -0,0 +1,109 @@
+#include <cmath>
+#include <iostream>
+#include <vector>
+#include <string>
+
+#include <gtest/gtest.h>
+
+#include "ccrp.h"
+#include "sampler.h"
+
+using namespace std;
+
+// Fixture that gives each test its own MT19937 random number generator.
+class CRPTest : public testing::Test {
+ public:
+  CRPTest() {}
+ protected:
+  virtual void SetUp() { }
+  virtual void TearDown() { }
+  MT19937 rng;
+};
+
+// Seats nine customers over three dishes and checks that prob() sums to 1
+// over four dishes that each receive base probability un = 0.25.
+TEST_F(CRPTest, Dist) {
+  CCRP<string> crp(0.1, 5);
+  double un = 0.25;
+  int tt = 0;
+  tt += crp.increment("hi", un, &rng);
+  tt += crp.increment("foo", un, &rng);
+  tt += crp.increment("bar", un, &rng);
+  tt += crp.increment("bar", un, &rng);
+  tt += crp.increment("bar", un, &rng);
+  tt += crp.increment("bar", un, &rng);
+  tt += crp.increment("bar", un, &rng);
+  tt += crp.increment("bar", un, &rng);
+  tt += crp.increment("bar", un, &rng);
+  cout << "tt=" << tt << endl;
+  cout << crp << endl;
+  cout << "  P(bar)=" << crp.prob("bar", un) << endl;
+  cout << "  P(hi)=" << crp.prob("hi", un) << endl;
+  cout << "  P(baz)=" << crp.prob("baz", un) << endl;
+  cout << "  P(foo)=" << crp.prob("foo", un) << endl;
+  double x = crp.prob("bar", un) + crp.prob("hi", un) + crp.prob("baz", un) + crp.prob("foo", un);
+  cout << "    tot=" << x << endl;
+  EXPECT_FLOAT_EQ(1.0, x);
+  tt += crp.decrement("hi", &rng);
+  tt += crp.decrement("bar", &rng);
+  cout << crp << endl;
+  tt += crp.decrement("bar", &rng);
+  cout << crp << endl;
+  cout << "tt=" << tt << endl;
+}
+
+// Keeps a restaurant at 10 customers of a single dish while randomly adding
+// and removing customers, and checks that the average number of tables over
+// all samples stays within 0.1 of 5.4.
+TEST_F(CRPTest, Exchangability) {
+    double tot = 0;
+    double xt = 0;
+    CCRP<int> crp(0.5, 1.0);
+    int cust = 10;
+    vector<int> hist(cust + 1, 0);
+    for (int i = 0; i < cust; ++i) { crp.increment(1, 1.0, &rng); }
+    const int samples = 100000;
+    const bool simulate = true;
+    for (int k = 0; k < samples; ++k) {
+      if (!simulate) {
+        crp.clear();
+        for (int i = 0; i < cust; ++i) { crp.increment(1, 1.0, &rng); }
+      } else {
+        int da = rng.next() * cust;
+        bool a = rng.next() < 0.5;
+        if (a) {
+          for (int i = 0; i < da; ++i) { crp.increment(1, 1.0, &rng); }
+          for (int i = 0; i < da; ++i) { crp.decrement(1, &rng); }
+          xt += 1.0;
+        } else {
+          for (int i = 0; i < da; ++i) { crp.decrement(1, &rng); }
+          for (int i = 0; i < da; ++i) { crp.increment(1, 1.0, &rng); }
+        }
+      }
+      int c = crp.num_tables(1);
+      ++hist[c];
+      tot += c;
+    }
+    EXPECT_EQ(cust, crp.num_customers());
+    cerr << "P(a) = " << (xt / samples) << endl;
+    cerr << "E[num tables] = " << (tot / samples) << endl;
+    double error = fabs((tot / samples) - 5.4);
+    cerr << "  error = " << error << endl;
+    EXPECT_LT(error, 0.1);  // it's possible for this to fail, but
+                            // very, very unlikely
+    for (int i = 1; i <= cust; ++i)
+      cerr << i << ' ' << (hist[i]) << endl;
+}
+
+// Smoke test: builds a CCRP with the six-argument constructor (presumably the
+// prior parameters plus initial discount/strength -- confirm in ccrp.h) and prints log_crp_prob().
+TEST_F(CRPTest, LP) {
+  CCRP<string> crp(1,1,1,1,0.1,50.0);
+  crp.increment("foo", 1.0, &rng);
+  cerr << crp.log_crp_prob() << endl;
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/utils/d_ary_heap.h b/utils/d_ary_heap.h
deleted file mode 100644
index 1270638a..00000000
--- a/utils/d_ary_heap.h
+++ /dev/null
@@ -1,568 +0,0 @@
-#ifndef D_ARY_HEAP_H
-#define D_ARY_HEAP_H
-
-#include "show.h"
-#define DDARY(x)
-
-#define D_ARY_PUSH_GRAEHL 0 // untested
-#define D_ARY_POP_GRAEHL 0 // untested
-#define D_ARY_DOWN_GRAEHL 0 // untested
-#define D_ARY_UP_GRAEHL 0 // untested
-#define D_ARY_APPEND_ALWAYS_PUSH 1 // heapify (0) is untested.  otherwise switch between push and heapify depending on size (cache effects, existing items vs. # appended ones)
-
-#define D_ARY_TRACK_OUT_OF_HEAP 0 // shouldn't need to track, because in contains() false positives looking up stale or random loc map values are impossible - we just check key.  note: if you enable this, you must init location to D_ARY_HEAP_NULL_INDEX yourself until it's been added or popped
-#define D_ARY_VERIFY_HEAP 1
-// This is a very expensive test so it should be disabled even when NDEBUG is not defined
-
-# undef D_ARY_HEAP_NULL_INDEX
-# define D_ARY_HEAP_NULL_INDEX (-1) // you may init location to this.
-
-/* adapted from boost/graph/detail/d_ary_heap.hpp
-
-  local modifications:
-
-  clear, heapify, append range/container, Size type template arg, reserve constructor arg
-
-  hole+move rather than swap.  note: swap would be more efficient for heavyweight keys, until move ctors exist
-
-  don't set locmap to -1 when removing from heap (waste of time)
-
-  // unlike arity=2 case, you don't gain anything by having indices start at 1, with 0-based child indices
-  // root @1, A=2, children indices m={0,1}: parent(i)=i/2, child(i,m)=2*i+m
-  // root @0: parent(i)=(i-1)/A child(i,n)=i*A+n+1 - can't improve on this except child(i,m)=i*A+m
-  (integer division, a/b=floor(a/b), so (i-1)/A = ceil(i/A)-1, or greatest int less than (i/A))
-
-  actually, no need to adjust child index, since child is called only once and inline
-
-  e.g. for A=3 gorn address in tree -> index
-
-  () = root -> 0
-  (1) -> 1
-  (2) -> 2
-  (3) (A) -> 3
-  (1,1) -> (1*A+1) = 4
-  (1,2) -> (1*A+2) = 5
-  (1,3) -> (1*A+3) = 6
-  (2,1) -> (2*A+1) = 7
-  etc.
-
-//TODO: block-align siblings!  assume data[0] is 16 or 32-byte aligned ... then we want root @ index (blocksize-1).  see http://www.lamarca.org/anthony/pubs/heaps.pdf pg8.  for pow2(e.g. 4)-ary heap, it may be reasonable to  use root @index A-1.  however, suppose the key size is not padded to a power of 2 (e.g. 12 bytes), then we would need internal gaps at times.  would want to use compile const template based inlineable alignment math for this?  possibly use a container like vector that lets you specify padding relative to some address multiple for v[0].
-
- optimal D: see http://www.lamarca.org/anthony/pubs/heaps.pdf pg 9.  depedns on relative cost of swap,compare, but in all cases except swap=free, 2 is worse than 3-4.  for expensive swap (3x compare), 4 still as good as 5.  so just use 4.  boost benchmarking djikstra agrees; 4 is best.
-
- cache-aligned 4-heap speedup over regular 2-heap is 10-80% (for huge heaps, the speedup is more)
-
- splay/skew heaps are worse than 2heap or aligned 4heap in practice.
-
- //TODO: switch from heapify (Floyd's method) to repeated push past some size limit (in bytes) due to cache effect -
- #define D_ARY_BYTES_OUT_OF_CACHE 0x1000000
-
- //TODO: assuming locmap is an lvalue pmap, we can be more efficient.  on the other hand, if it's an intrusive property map to an interned mutable object, there's no difference in performance, and that's what i'm going to do in my first uses.  plus, if keys are indices and the map is a vector, it's barely any overhead.
-
- */
-
-//
-//=======================================================================
-// Copyright 2009 Trustees of Indiana University
-// Authors: Jeremiah J. Willcock, Andrew Lumsdaine
-//
-// Distributed under the Boost Software License, Version 1.0. (See
-// accompanying file LICENSE_1_0.txt or copy at
-// http://www.boost.org/LICENSE_1_0.txt)
-//=======================================================================
-//
-
-#include <vector>
-#include <cstddef>
-#include <algorithm>
-#include <utility>
-#include <cassert>
-#include <boost/static_assert.hpp>
-#include <boost/shared_array.hpp>
-#include <boost/property_map/property_map.hpp>
-
-
-  // D-ary heap using an indirect compare operator (use identity_property_map
-  // as DistanceMap to get a direct compare operator).  This heap appears to be
-  // commonly used for Dijkstra's algorithm for its good practical performance
-  // on some platforms; asymptotically, it's not optimal; it has an O(lg N) decrease-key
-  // operation, which is (amortized) constant time on a relaxed heap or fibonacci heap.  The
-  // implementation is mostly based on the binary heap page on Wikipedia and
-  // online sources that state that the operations are the same for d-ary
-  // heaps.  This code is not based on the old Boost d-ary heap code.
-  //
-  // - d_ary_heap_indirect is a model of UpdatableQueue as is needed for
-  //   dijkstra_shortest_paths.
-  //
-  // - Value must model Assignable.
-  // - Arity must be at least 2 (optimal value appears to be 4, both in my and
-  //   third-party experiments).
-  // - IndexInHeapMap must be a ReadWritePropertyMap from Value to
-  //   Container::size_type (to store the index of each stored value within the
-  //   heap for decrease-key aka update).
-  // - DistanceMap must be a ReadablePropertyMap from Value to something
-  //   (typedef'ed as distance_type).
-  // - Compare must be a BinaryPredicate used as a less-than operator on
-  //   distance_type.
-  // - Container must be a random-access, contiguous container (in practice,
-  //   the operations used probably require that it is std::vector<Value>).
-  //
-  template <typename Value,
-            std::size_t Arity,
-            typename IndexInHeapPropertyMap,
-            typename DistanceMap,
-            typename Better = std::less<Value>,
-            typename Container = std::vector<Value>,
-            typename Size = typename Container::size_type,
-            typename Equal = std::equal_to<Value> >
-  class d_ary_heap_indirect {
-    BOOST_STATIC_ASSERT (Arity >= 2);
-    public:
-    typedef Container container_type;
-    typedef Size size_type;
-    typedef Value value_type;
-    typedef typename Container::const_iterator const_iterator;
-    typedef const_iterator iterator;
-    // The distances being compared using better and that are stored in the
-    // distance map
-    typedef typename boost::property_traits<DistanceMap>::value_type distance_type;
-    d_ary_heap_indirect(DistanceMap const& distance,
-                        IndexInHeapPropertyMap const& index_in_heap,
-                        const Better& better = Better(),
-                        size_type container_reserve = 100000,
-                        Equal const& equal = Equal()
-      )
-      : better(better), data(), distance(distance),
-        index_in_heap(index_in_heap),equal(equal) {
-      data.reserve(container_reserve);
-    }
-    /* Implicit copy constructor */
-    /* Implicit assignment operator */
-
-    template <class C>
-    void append_heapify(C const& c) {
-      data.reserve(data.size()+c.size());
-      append_heapify(c.begin(),c.end());
-    }
-
-    template <class I>
-    void append_heapify(I begin,I end) {
-      data.insert(data.end(),begin,end);
-      heapify();
-    }
-
-    template <class C>
-    void append_push(C const& c) {
-      data.reserve(data.size()+c.size());
-      append_push(c.begin(),c.end());
-    }
-
-    // past some threshold, this should be faster than append_heapify.  also, if there are many existing elements it will be faster.
-    template <class I>
-    void append_push(I begin,I end) {
-      for (;begin!=end;++begin)
-        push(*begin);
-    }
-
-    template <class C>
-    void append(C const& c) {
-      if (D_ARY_APPEND_ALWAYS_PUSH || data.size()>=c.size()/2)
-        append_push(c);
-      else
-        append_heapify(c);
-    }
-
-    // past some threshold, this should be faster than append_heapify.  also, if there are many existing elements it will be faster.
-    template <class I>
-    void append(I begin,I end) {
-      if (D_ARY_APPEND_ALWAYS_PUSH || data.size()>=0x10000)
-        append_push(begin,end);
-      else
-        append_heapify(begin,end);
-    }
-
-    // could allow mutation of data directly, e.g. push_back 1 at a time - but then they could forget to heapify()
-
-    //from bottom of heap tree up, turn that subtree into a heap by adjusting the root down
-    // for n=size, array elements indexed by floor(n/2) + 1, floor(n/2) + 2, ... , n are all leaves for the tree, thus each is an one-element heap already
-    // warning: this is many fewer instructions but, at some point (when heap doesn't fit in Lx cache) it will become slower than repeated push().
-    void heapify() {
-      for (size_type i=parent(data.size()-1);i>0;--i) // starting from parent of last node, ending at first child of root (i==1)
-        preserve_heap_property_down(i);
-    }
-
-    void reserve(size_type s) {
-      data.reserve(s);
-    }
-
-    size_type size() const {
-      return data.size();
-    }
-
-    bool empty() const {
-      return data.empty();
-    }
-
-    const_iterator begin() const {
-      return data.begin();
-    }
-
-    const_iterator end() const {
-      return data.end();
-    }
-
-    void clear() {
-#if D_ARY_TRACK_OUT_OF_HEAP
-      using boost::put;
-      for (typename Container::iterator i=data.begin(),e=data.end();i!=e;++i)
-        put(index_in_heap,*i,(size_type)D_ARY_HEAP_NULL_INDEX);
-#endif
-      data.clear();
-    }
-
-    void push(const Value& v) {
-      if (D_ARY_PUSH_GRAEHL) {
-        size_type i = data.size();
-        data.push_back(Value()); // (hoping default construct is cheap, construct-copy inline)
-        preserve_heap_property_up(v,i); // we don't have to recopy v, or init index_in_heap
-      } else {
-        size_type index = data.size();
-        data.push_back(v);
-        using boost::put;
-        put(index_in_heap, v, index);
-        preserve_heap_property_up(index);
-      }
-      verify_heap();
-    }
-
-    Value& top() {
-      return data[0];
-    }
-
-    const Value& top() const {
-      return data[0];
-    }
-
-    void pop() {
-      using boost::put;
-      if(D_ARY_TRACK_OUT_OF_HEAP)
-        put(index_in_heap, data[0], (size_type)D_ARY_HEAP_NULL_INDEX);
-      if (data.size() != 1) {
-        if (D_ARY_POP_GRAEHL) {
-          preserve_heap_property_down(data.back(),0,data.size()-1);
-          data.pop_back();
-        } else {
-          data[0] = data.back();
-          put(index_in_heap, data[0], 0);
-          data.pop_back();
-          preserve_heap_property_down();
-        }
-        verify_heap();
-      } else {
-        data.pop_back();
-      }
-    }
-
-    // This function assumes the key has been improved
-    // (distance has become smaller, so it may need to rise toward top().
-    // i.e. decrease-key in a min-heap
-    void update(const Value& v) {
-      using boost::get;
-      size_type index = get(index_in_heap, v);
-      preserve_heap_property_up(v,index);
-      verify_heap();
-    }
-
-    // return true if improved.
-    bool maybe_improve(const Value& v,distance_type dbetter) {
-      using boost::get;
-      if (better(dbetter,get(distance,v))) {
-        preserve_heap_property_up_dist(v,dbetter);
-        return true;
-      }
-      return false;
-    }
-
-    distance_type best(distance_type null=0) const {
-      return empty() ? null : get(distance,data[0]);
-    }
-    distance_type second_best(distance_type null=0) const {
-      if (data.size()<2) return null;
-      int m=std::min(data.size(),Arity+1);
-//      if (m>=Arity) m=Arity+1;
-      distance_type b=get(distance,data[1]);
-      for (int i=2;i<m;++i) {
-        distance_type d=get(distance,data[i]);
-        if (better(d,b))
-          b=d;
-      }
-      return b;
-    }
-
-
-#include "warning_push.h"
-#pragma GCC diagnostic ignored "-Wtype-limits"
-      // because maybe size_type is signed or unsigned
-    inline bool contains(const Value &v,size_type i) const {
-      if (D_ARY_TRACK_OUT_OF_HEAP)
-        return i != (size_type)D_ARY_HEAP_NULL_INDEX;
-      size_type sz=data.size();
-      SHOWM2(DDARY,"d_ary_heap contains",i,data.size());
-      return i>=0 && i<sz && equal(v,data[i]); // note: size_type may be signed (don't recommend it, though) - thus i>=0 check to catch uninit. data
-    }
-#include "warning_pop.h"
-
-    inline bool contains(const Value& v) const {
-      using boost::get;
-      return contains(v,get(index_in_heap, v));
-    }
-
-    void push_or_update(const Value& v) { /* insert if not present, else update */
-      using boost::get;
-      size_type index = get(index_in_heap, v);
-      if (D_ARY_PUSH_GRAEHL) {
-        if (contains(v,index))
-          preserve_heap_property_up(v,index);
-        else
-          push(v);
-      } else {
-        if (!contains(v,index)) {
-          index = data.size();
-          data.push_back(v);
-          using boost::put;
-          put(index_in_heap, v, index);
-        }
-        preserve_heap_property_up(index);
-      }
-      verify_heap();
-    }
-
-    private:
-    Better better;
-    Container data;
-    DistanceMap distance;
-    IndexInHeapPropertyMap index_in_heap;
-    Equal equal;
-
-    // Get the parent of a given node in the heap
-    static inline size_type parent(size_type index) {
-      return (index - 1) / Arity;
-    }
-
-    // Get the child_idx'th child of a given node; 0 <= child_idx < Arity
-    static inline size_type child(size_type index, std::size_t child_idx) {
-      return index * Arity + child_idx + 1;
-    }
-
-    // Swap two elements in the heap by index, updating index_in_heap
-    inline void swap_heap_elements(size_type index_a, size_type index_b) {
-      using std::swap;
-      Value value_a = data[index_a];
-      Value value_b = data[index_b];
-      data[index_a] = value_b;
-      data[index_b] = value_a;
-      using boost::put;
-      put(index_in_heap, value_a, index_b);
-      put(index_in_heap, value_b, index_a);
-    }
-
-    inline void move_heap_element(Value const& v,size_type ito) {
-      using boost::put;
-      put(index_in_heap,v,ito);
-      data[ito]=v; //todo: move assign?
-    }
-
-    // Verify that the array forms a heap; commented out by default
-    void verify_heap() const {
-      // This is a very expensive test so it should be disabled even when
-      // NDEBUG is not defined
-#if D_ARY_VERIFY_HEAP
-      using boost::get;
-      for (size_t i = 1; i < data.size(); ++i) {
-        if (better(get(distance,data[i]), get(distance,data[parent(i)]))) {
-          assert (!"Element is smaller than its parent");
-        }
-      }
-#endif
-    }
-
-    // we have a copy of the key, so we don't need to do that stupid find # of levels to move then move.  we act as though data[index]=currently_being_moved, but in fact it's an uninitialized "hole", which we fill at the very end
-    inline void preserve_heap_property_up(Value const& currently_being_moved,size_type index) {
-      using boost::get;
-      preserve_heap_property_up(currently_being_moved,index,get(distance,currently_being_moved));
-    }
-
-    inline void preserve_heap_property_up_set_dist(Value const& currently_being_moved,distance_type dbetter) {
-      using boost::get;
-      using boost::put;
-      put(distance,currently_being_moved,dbetter);
-      preserve_heap_property_up(currently_being_moved,get(index_in_heap,currently_being_moved),dbetter);
-      verify_heap();
-    }
-
-    void preserve_heap_property_up(Value const& currently_being_moved,size_type index,distance_type currently_being_moved_dist) {
-      using boost::put;
-      using boost::get;
-      if (D_ARY_UP_GRAEHL) {
-        for (;;) {
-          if (index == 0) break; // Stop at root
-          size_type parent_index = parent(index);
-          Value const& parent_value = data[parent_index];
-          if (better(currently_being_moved_dist, get(distance, parent_value))) {
-            move_heap_element(parent_value,index);
-            index = parent_index;
-          } else {
-            break; // Heap property satisfied
-          }
-        }
-        //finish "swap chain" by filling hole w/ currently_being_moved
-        move_heap_element(currently_being_moved,index); // note: it's ok not to return early on index==0 at start, even if self-assignment isn't supported by Value - because currently_being_moved is a copy.
-      } else {
-        put(index_in_heap,currently_being_moved,index);
-        put(distance,currently_being_moved,currently_being_moved_dist);
-        preserve_heap_property_up(index);
-      }
-    }
-
-    // Starting at a node, move up the tree swapping elements to preserve the
-    // heap property.  doesn't actually use swap; uses hole
-    void preserve_heap_property_up(size_type index) {
-      using boost::get;
-      if (index == 0) return; // Do nothing on root
-      if (D_ARY_UP_GRAEHL) {
-        Value copyi=data[index];
-        preserve_heap_property_up(copyi,index);
-        return;
-      }
-      size_type orig_index = index;
-      size_type num_levels_moved = 0;
-      // The first loop just saves swaps that need to be done in order to avoid
-      // aliasing issues in its search; there is a second loop that does the
-      // necessary swap operations
-      Value currently_being_moved = data[index];
-      distance_type currently_being_moved_dist =
-        get(distance, currently_being_moved);
-      for (;;) {
-        if (index == 0) break; // Stop at root
-        size_type parent_index = parent(index);
-        Value parent_value = data[parent_index];
-        if (better(currently_being_moved_dist, get(distance, parent_value))) {
-          ++num_levels_moved;
-          index = parent_index;
-          continue;
-        } else {
-          break; // Heap property satisfied
-        }
-      }
-      // Actually do the moves -- move num_levels_moved elements down in the
-      // tree, then put currently_being_moved at the top
-      index = orig_index;
-      using boost::put;
-      for (size_type i = 0; i < num_levels_moved; ++i) {
-        size_type parent_index = parent(index);
-        Value parent_value = data[parent_index];
-        put(index_in_heap, parent_value, index);
-        data[index] = parent_value;
-        index = parent_index;
-      }
-      data[index] = currently_being_moved;
-      put(index_in_heap, currently_being_moved, index);
-      verify_heap();
-    }
-
-
-    // From the root, swap elements (each one with its smallest child) if there
-    // are any parent-child pairs that violate the heap property.  v is placed at data[i], but then pushed down (note: data[i] won't be read explicitly; it will instead be overwritten by percolation).  this also means that v must be a copy of data[i] if it was already at i.
-    // e.g. v=data.back(), i=0, sz=data.size()-1 for pop(), implicitly swapping data[i], data.back(), and doing data.pop_back(), then adjusting from 0 down w/ swaps.  updates index_in_heap for v.
-    inline void preserve_heap_property_down(Value const& currently_being_moved,size_type i,size_type heap_size) {
-      using boost::get;
-      distance_type currently_being_moved_dist=get(distance,currently_being_moved);
-      Value* data_ptr = &data[0];
-      size_type index = 0; // hole at index - currently_being_moved to be put here when we find the final hole spot
-      for (;;) {
-        size_type first_child_index = child(index, 0);
-        if (first_child_index >= heap_size) break; /* No children */
-        Value* child_base_ptr = data_ptr + first_child_index; // using index of first_child_index+smallest_child_index because we hope optimizer will be smart enough to const-unroll a loop below if we do this.  i think the optimizer would have gotten it even without our help (i.e. store root-relative index)
-
-        // begin find best child index/distance
-        size_type smallest_child_index = 0; // don't add to base first_child_index every time we update which is smallest.
-        distance_type smallest_child_dist = get(distance, child_base_ptr[smallest_child_index]);
-#undef D_ARY_MAYBE_IMPROVE_CHILD_I
-#define D_ARY_MAYBE_IMPROVE_CHILD_I \
-            distance_type i_dist = get(distance, child_base_ptr[i]); \
-            if (better(i_dist, smallest_child_dist)) { \
-              smallest_child_index = i; \
-              smallest_child_dist = i_dist; \
-            }
-        if (first_child_index + Arity <= heap_size) {
-          // avoid repeated heap_size boundcheck (should test if this is really a speedup - instruction cache tradeoff - could use upperbound = min(Arity,heap_size-first_child_index) instead.  but this optimizes to a fixed number of iterations (compile time known) so probably worth it
-          for (size_t i = 1; i < Arity; ++i) {
-            D_ARY_MAYBE_IMPROVE_CHILD_I
-          }
-        } else {
-          for (size_t i = 1,e=heap_size - first_child_index; i < e; ++i) {
-            D_ARY_MAYBE_IMPROVE_CHILD_I
-          }
-        }
-        //end: know best child
-
-        if (better(smallest_child_dist, currently_being_moved_dist)) {
-          // instead of swapping, move.
-          move_heap_element(child_base_ptr[smallest_child_index],index); // move up
-          index=first_child_index+smallest_child_index; // descend - hole is now here
-        } else {
-          move_heap_element(currently_being_moved,index); // finish "swap chain" by filling hole
-          break;
-        }
-      }
-      verify_heap();
-    }
-
-    inline void preserve_heap_property_down(size_type i) {
-      preserve_heap_property_down(data[i],i,data.size());
-    }
-
-    void preserve_heap_property_down() {
-      using boost::get;
-      if (data.empty()) return;
-      if (D_ARY_DOWN_GRAEHL) { // this *should* be more efficient because i avoid swaps.
-        Value copy0=data[0];
-        preserve_heap_property_down(copy0,0,data.size());
-        return;
-      }
-      size_type index = 0;
-      Value currently_being_moved = data[0];
-      distance_type currently_being_moved_dist =
-        get(distance, currently_being_moved);
-      size_type heap_size = data.size();
-      Value* data_ptr = &data[0];
-      for (;;) {
-        size_type first_child_index = child(index, 0);
-        if (first_child_index >= heap_size) break; /* No children */
-        Value* child_base_ptr = data_ptr + first_child_index;
-        size_type smallest_child_index = 0;
-        distance_type smallest_child_dist = get(distance, child_base_ptr[smallest_child_index]);
-        if (first_child_index + Arity <= heap_size) {
-          for (size_t i = 1; i < Arity; ++i) { // can be unrolled completely.
-
-            D_ARY_MAYBE_IMPROVE_CHILD_I
-          }
-        } else {
-          for (size_t i = 1,e=heap_size - first_child_index; i < e; ++i) {
-            D_ARY_MAYBE_IMPROVE_CHILD_I
-          }
-        }
-        if (better(smallest_child_dist, currently_being_moved_dist)) {
-          swap_heap_elements(smallest_child_index + first_child_index, index);
-          index = smallest_child_index + first_child_index;
-          continue;
-        } else {
-          break; // Heap property satisfied
-        }
-      }
-      verify_heap();
-    }
-
-  };
-
-#endif
diff --git a/utils/fast_lexical_cast.hpp b/utils/fast_lexical_cast.hpp
index ae49c934..ae49c934 100755..100644
--- a/utils/fast_lexical_cast.hpp
+++ b/utils/fast_lexical_cast.hpp
diff --git a/utils/fast_sparse_vector.h b/utils/fast_sparse_vector.h
index 8fe6cb3d..2c49948c 100644
--- a/utils/fast_sparse_vector.h
+++ b/utils/fast_sparse_vector.h
@@ -178,6 +178,12 @@ class FastSparseVector {
   T l2norm() const {
     return sqrt(l2norm_sq());
   }
+  T pnorm(const double p) const {  // p-norm of the stored values: (sum_i |x_i|^p)^(1/p)
+    T sum = T();  // value-initialized accumulator
+    for (const_iterator it = begin(), e = end(); it != e; ++it)
+      sum += pow(fabs(it->second), p);
+    return pow(sum, 1.0 / p);  // NOTE(review): p == 0 divides by zero here; presumably callers pass p > 0 -- confirm
+  }
   // if values are binary, gives |A intersect B|/|A union B|
   template<typename S>
   S tanimoto_coef(const FastSparseVector<S> &vec) const {
@@ -373,7 +379,7 @@ class FastSparseVector {
     }
     ar & eff_size;
     while (it != this->end()) {
-      const std::pair<const std::string&, const T&> wire_pair(FD::Convert(it->first), it->second);
+      const std::pair<std::string, T> wire_pair(FD::Convert(it->first), it->second);
       ar & wire_pair;
       ++it;
     }
diff --git a/utils/fdict.h b/utils/fdict.h
index f0871b9a..0a2a9456 100644
--- a/utils/fdict.h
+++ b/utils/fdict.h
@@ -10,7 +10,7 @@
 
 #ifdef HAVE_CMPH
 #include "perfect_hash.h"
-#include "string_to.h"
+#include <sstream>
 #endif
 
 struct FD {
@@ -49,7 +49,9 @@ struct FD {
 #ifdef HAVE_CMPH
     if (hash_) {
       static std::string tls;
-      tls = to_string(w);
+      std::ostringstream os;
+      os << w;
+      tls = os.str();
       return tls;
     }
 #endif
diff --git a/utils/feature_vector.h b/utils/feature_vector.h
index a7b61a66..a7b61a66 100755..100644
--- a/utils/feature_vector.h
+++ b/utils/feature_vector.h
diff --git a/utils/ftoa.h b/utils/ftoa.h
deleted file mode 100755
index 3dba528d..00000000
--- a/utils/ftoa.h
+++ /dev/null
@@ -1,403 +0,0 @@
-#ifndef FTOA_H
-#define FTOA_H
-
-
-//TODO: for fractional digits/non-sci, determine the right amount of left padding (more if the whole number is indeed <1, to keep the significant digits), less if sci notation and/or mantissa has sig. digits (don't want N before . and N after!)
-
-#ifndef FTOA_ROUNDTRIP
-# define FTOA_ROUNDTRIP 1
-#endif
-
-#ifndef FTOA_DEBUG
-# define FTOA_DEBUG 0
-#endif
-
-#ifndef FTOA_USE_SPRINTF
-#define FTOA_USE_SPRINTF 0
-#endif
-
-#if FTOA_DEBUG
-# define FTOAassert(x) assert(x)
-# define DBFTOA(x) std::cerr<<"\nFTOA " <<__func__<<"("<<__LINE__<<"): " #x "="<<x<<"\n"
-# define DBFTOA2(x0,x1) std::cerr<<"\nFTOA " <<__func__<<"("<<__LINE__<<"): " #x0 "="<<x0<<" " #x1 "="<<x1 <<"\n"
-#else
-# define FTOAassert(x)
-# define DBFTOA(x)
-# define DBFTOA2(x0,x1)
-#endif
-
-/* DECIMAL_FOR_WHOLE ; ftos(123)
-   0 ; 123
-   1 ; 123
-   2 ; 123.
-     ; ftos(0) is always just "0" (not "0.0")
-     ; ftos(.01)
-   0 ; .01
-   1 ; 0.01
-   2 ; 0.01
-
-*/
-
-#ifndef DECIMAL_FOR_WHOLE
-# define DECIMAL_FOR_WHOLE 1
-#endif
-
-#include <limits>
-#include <stdint.h>
-#include <iostream>
-#include <cmath>
-#include <assert.h>
-#include <cstdio>
-#include "utoa.h"
-#include "nan.h"
-
-template <class Float>
-struct ftoa_traits {
-};
-
-//eP10,
-// sigd decimal places normally printed, roundtripd needed so that round-trip float->string->float is identity
-
-#define DEFINE_FTOA_TRAITS(FLOATT,INTT,sigd,roundtripd,small,large,used,P10)   \
-template <> \
-struct ftoa_traits<FLOATT> { \
-  typedef INTT int_t; \
-  typedef u ## INTT uint_t; \
-  typedef FLOATT float_t; \
-  enum { digits10=std::numeric_limits<INTT>::digits10, chars_block=P10, usedig=used, sigdig=sigd, roundtripdig=roundtripd, bufsize=roundtripdig+7 }; \
-  static const double pow10_block = 1e ## P10; \
-  static const float_t small_f = small; \
-  static const float_t large_f = large; \
-  static inline int sprintf(char *buf,double f) { return std::sprintf(buf,"%." #used "g",f); } \
-  static inline int sprintf_sci(char *buf,double f) { return std::sprintf(buf,"%." #used "e",f); } \
-  static inline int sprintf_nonsci(char *buf,double f) { return std::sprintf(buf,"%." #used "f",f); } \
-  static inline uint_t fracblock(double frac) { FTOAassert(frac>=0 && frac<1); double f=frac*pow10_block;uint_t i=(uint_t)f;FTOAassert(i<pow10_block);return i; } \
-  static inline uint_t rounded_fracblock(double frac) { FTOAassert(frac>=0 && frac<1); double f=frac*pow10_block;uint_t i=(uint_t)(f+.5);FTOAassert(i<pow10_block);return i; }  \
-  static inline float_t mantexp10(float_t f,int &exp) { float_t e=std::log10(f); float_t ef=std::floor(e); exp=ef;  return f/std::pow((float_t)10,ef); } \
-  static inline bool use_sci_abs(float_t fa) { return fa<small || fa>large; } \
-  static inline bool use_sci(float_t f) { return use_sci_abs(std::fabs(f)); }   \
-};
-//TODO: decide on computations in double (would hurt long double) or in native float type - any advantage?  more precision is usually better.
-
-//10^22 = 0x1.0f0cf064dd592p73 is the largest exactly representable power of 10 in the binary64 format.  but round down to 18 so int64_t can hold it.
-
-#if FTOA_ROUNDTRIP
-#define DEFINE_FTOA_TRAITS_ROUNDTRIP(FLOATT,INTT,sigd,roundtripd,small,large) DEFINE_FTOA_TRAITS(FLOATT,INTT,sigd,roundtripd,small,large,roundtripd,roundtripd)
-#else
-#define DEFINE_FTOA_TRAITS_ROUNDTRIP(FLOATT,INTT,sigd,roundtripd,small,large) DEFINE_FTOA_TRAITS(FLOATT,INTT,sigd,roundtripd,small,large,sigd,sigd)
-#endif
-
-DEFINE_FTOA_TRAITS_ROUNDTRIP(double,int64_t,15,17,1e-5,1e8)
-//i've heard that 1e10 is fine for float.  but we only have 1e9 (9 decimal places) in int32.
-DEFINE_FTOA_TRAITS_ROUNDTRIP(float,int32_t,6,9,1e-3,1e8)
-
-
-template <class F>
-inline void ftoa_error(F f,char const* msg="") {
-  using namespace std;
-  cerr<<"ftoa error: "<<msg<<" f="<<f<<endl;
-  assert(!"ftoa error");
-}
-
-// all of the below prepend and return new cursor.  null terminate yourself (like itoa/utoa)
-
-//possibly empty string for ~0 (no sci notation fallback).  left padded with the right number of 0s (tricky).  [ret,p) are the digits.
-template <class F>
-char *prepend_pos_frac_digits(char *p,F f) {
-  FTOAassert(f<1 && f >0);
-  typedef ftoa_traits<F> FT;
-  //repeat if very small???  nah, require sci notation to take care of it.
-  typename FT::uint_t i=FT::rounded_fracblock(f);
-  DBFTOA2(f,i);
-  if (i>0) {
-    unsigned n_skipped;
-    char *d=utoa_drop_trailing_0(p,i,n_skipped);
-    char *b=p-FT::chars_block+n_skipped;
-    FTOAassert(b<=d);
-    left_pad(b,d,'0');
-    return b;
-  } else {
-    return p;
-  }
-}
-
-template <class F>
-char *append_pos_frac_digits(char *p,F f) { // '0' right-padded, nul terminated, return position of nul.  [p,ret) are the digits
-  if (f==0) {
-    *p++='0';
-    return p;
-  }
-  FTOAassert(f<1 && f >0);
-  typedef ftoa_traits<F> FT;
-  //repeat if very small???  nah, require sci notation to take care of it.
-  typename FT::uint_t i=FT::rounded_fracblock(f);
-  DBFTOA2(f,i);
-  if (i>0) {
-    char *e=p+FT::chars_block;
-    utoa_left_pad(p,e,i,'0');
-    *e=0;
-    return e;
-  } else {
-    *p=0;
-    return p;
-  }
-}
-
-template <class F>
-inline char *prepend_pos_frac(char *p,F f) {
-  FTOAassert(f<1 && f>=0);
-  if (f==0) {
-    *--p='0';
-    return p;
-  }
-  p=prepend_pos_frac_digits(p,f);
-  *--p='.';
-  if (DECIMAL_FOR_WHOLE>0)
-    *--p='0';
-  return p;
-}
-
-template <class F>
-inline char *append_pos_frac(char *p,F f) {
-  DBFTOA(f);
-  if (DECIMAL_FOR_WHOLE>0)
-    *p++='0';
-  *p++='.';
-  return append_pos_frac_digits(p,f);
-}
-
-template <class F>
-inline char *prepend_frac(char *p,F f,bool positive_sign=false) {
-  FTOAassert(f<1 && f>-1);
-  if (f==0)
-    *--p='0';
-  else if (f<0) {
-    p=prepend_pos_frac(p,-f);
-    *--p='-';
-  } else {
-    p=prepend_pos_frac(p,f);
-    if (positive_sign)
-      *--p='+';
-  }
-  return p;
-}
-
-
-template <class F>
-inline char *append_sign(char *p,F f,bool positive_sign=false) {
-  if (f<0) {
-    *p++='-';
-  } else if (positive_sign)
-    *p++='+';
-  return p;
-}
-
-template <class F>
-inline char *append_frac(char *p,F f,bool positive_sign=false) {
-  FTOAassert(f<1 && f>-1);
-  if (f==0) {
-    *p++='0';
-    return p;
-  } else if (f<0) {
-    *p++='-';
-    return append_pos_frac(p,-f);
-  }
-  if (positive_sign) {
-    *p++='+';
-    return append_pos_frac(p,f);
-  }
-
-}
-
-
-//append_frac, append_pos_sci, append_sci.  notice these are all composed according to a pattern (but reversing order of composition in pre vs app).  or can implement with copy through buffer
-
-/* will switch to sci notation if integer part is too big for the int type. but for very small values, will simply display 0 (i.e. //TODO: find out log10 and leftpad 0s then convert rest) */
-template <class F>
-char *prepend_pos_nonsci(char *p,F f) {
-  typedef ftoa_traits<F> FT;
-  typedef typename FT::uint_t uint_t;
-  DBFTOA(f);
-  FTOAassert(f>0);
-  if (f>std::numeric_limits<uint_t>::max())
-    return prepend_pos_sci(p,f);
-  //which is faster - modf is weird and returns negative frac part if f is negative.  while we could deal with this using fabs, we instead only handle positive here (put - sign in front and negate, then call us) - ?
-#if 0
-  F intpart;
-  F frac=std::modf(f,&intpart);
-  uint_t u=intpart;
-#else
-  uint_t u=f;
-  F frac=f-u;
-#endif
-  DBFTOA2(u,frac);
-  if (frac == 0) {
-    if (DECIMAL_FOR_WHOLE>1)
-      *--p='.';
-  } else {
-    p=prepend_pos_frac_digits(p,frac);
-    *--p='.';
-  }
-  if (u==0) {
-    if (DECIMAL_FOR_WHOLE>0)
-      *--p='0';
-  } else
-    p=utoa(p,u);
-  return p;
-}
-
-// modify p; return true if handled
-template <class F>
-inline bool prepend_0_etc(char *&p,F f,bool positive_sign=false) {
-  if (f==0) {
-    *--p='0';
-    return true;
-  }
-  if (is_nan(f)) {
-    p-=3;
-    p[0]='N';p[1]='A';p[2]='N';
-    return true;
-  }
-  if (is_pos_inf(f)) {
-    p-=3;
-    p[0]='I';p[1]='N';p[2]='F';
-    if (positive_sign)
-      *--p='+';
-    return true;
-  }
-  if (is_neg_inf(f)) {
-    p-=4;
-    p[0]='-';p[1]='I';p[2]='N';p[3]='F';
-    return true;
-  }
-  return false;
-}
-
-template <class F>
-inline char *prepend_nonsci(char *p,F f,bool positive_sign=false) {
-  if (prepend_0_etc(p,f,positive_sign)) return p;
-  if (f<0) {
-    p=prepend_pos_nonsci(p,-f);
-    *--p='-';
-  } else {
-    p=prepend_pos_nonsci(p,f);
-    if (positive_sign)
-      *--p='+';
-  }
-  return p;
-}
-
-template <class F>
-inline char *prepend_pos_sci(char *p,F f,bool positive_sign_exp=false) {
-  FTOAassert(f>0);
-  typedef ftoa_traits<F> FT;
-  int e10;
-  F mant=FT::mantexp10(f,e10);
-  DBFTOA(f);
-  DBFTOA2(mant,e10);
-  FTOAassert(mant<10.00001);
-  if (mant>=10.) {
-    ++e10;
-    mant*=.1;
-  } else if (mant < 1.) {
-    --e10;
-    mant*=10;
-  }
-  p=itoa(p,e10,positive_sign_exp);
-  *--p='e';
-  return prepend_pos_nonsci(p,mant);
-}
-
-template <class F>
-inline char *prepend_sci(char *p,F f,bool positive_sign_mant=false,bool positive_sign_exp=false) {
-  if (prepend_0_etc(p,f,positive_sign_mant)) return p;
-  if (f==0)
-    *--p='0';
-  else if (f<0) {
-    p=prepend_pos_sci(p,-f,positive_sign_exp);
-    *--p='-';
-  } else {
-    p=prepend_pos_sci(p,f,positive_sign_exp);
-    if (positive_sign_mant)
-      *--p='+';
-  }
-  return p;
-}
-
-template <class F>
-inline char *append_nonsci(char *p,F f,bool positive_sign=false) {
-  if (positive_sign&&f>=0) *p++='+';
-  return p+ftoa_traits<F>::sprintf_nonsci(p,f);
-}
-
-template <class F>
-inline char *append_sci(char *p,F f,bool positive_sign=false) {
-  if (positive_sign&&f>=0) *p++='+';
-  return p+ftoa_traits<F>::sprintf_sci(p,f);
-}
-
-template <class F>
-inline char *append_ftoa(char *p,F f,bool positive_sign=false) {
-  if (positive_sign&&f>=0) *p++='+';
-  return p+ftoa_traits<F>::sprintf(p,f);
-}
-
-template <class F>
-inline char *prepend_ftoa(char *p,F f)
-{
-  typedef ftoa_traits<F> FT;
-  return FT::use_sci(f) ? prepend_sci(p,f) : prepend_nonsci(p,f);
-}
-
-template <class F>
-inline std::string ftos_append(F f) {
-  typedef ftoa_traits<F> FT;
-  char buf[FT::bufsize];
-  return std::string(buf,append_ftoa(buf,f));
-}
-
-template <class F>
-inline std::string ftos_prepend(F f) {
-  typedef ftoa_traits<F> FT;
-  char buf[FT::bufsize];
-  char *end=buf+FT::bufsize;
-  return std::string(prepend_ftoa(end,f),end);
-}
-
-
-template <class F>
-inline std::string ftos(F f) {
-#if 0
-  // trust RVO?  no extra copies?
-  return FTOA_USE_SPRINTF ? ftos_append(f) : ftos_prepend(f);
-#else
-  typedef ftoa_traits<F> FT;
-  char buf[FT::bufsize];
-  if (FTOA_USE_SPRINTF) {
-    return std::string(buf,append_ftoa(buf,f));
-  } else {
-    char *end=buf+FT::bufsize;
-    return std::string(prepend_ftoa(end,f),end);
-  }
-#endif
-}
-
-namespace {
-  const int ftoa_bufsize=30;
-  char ftoa_outbuf[ftoa_bufsize];
-}
-
-// not even THREADLOCAL - don't use.
-inline char *static_ftoa(float f)
-{
-  if (FTOA_USE_SPRINTF) {
-    append_ftoa(ftoa_outbuf,f);
-    return ftoa_outbuf;
-  } else {
-    char *end=ftoa_outbuf+ftoa_bufsize;
-    return prepend_ftoa(end,f);
-  }
-}
-
-
-#endif
diff --git a/utils/hash.h b/utils/hash.h
index 2290bc34..2290bc34 100755..100644
--- a/utils/hash.h
+++ b/utils/hash.h
diff --git a/utils/have_64_bits.h b/utils/have_64_bits.h
index d1e6064f..d1e6064f 100755..100644
--- a/utils/have_64_bits.h
+++ b/utils/have_64_bits.h
diff --git a/utils/indices_after.h b/utils/indices_after.h
index 62683f39..62683f39 100755..100644
--- a/utils/indices_after.h
+++ b/utils/indices_after.h
diff --git a/utils/int_or_pointer.h b/utils/int_or_pointer.h
deleted file mode 100755
index 4b6a9e4a..00000000
--- a/utils/int_or_pointer.h
+++ /dev/null
@@ -1,70 +0,0 @@
-#ifndef INT_OR_POINTER_H
-#define INT_OR_POINTER_H
-
-// if you ever wanted to store a discriminated union of pointer/integer without an extra boolean flag, this will do it, assuming your pointers are never odd.
-
-// check lsb for expected tag?
-#ifndef IOP_CHECK_LSB
-# define IOP_CHECK_LSB 1
-#endif
-#if IOP_CHECK_LSB
-# define iop_assert(x) assert(x)
-#else
-# define iop_assert(x)
-#endif
-
-#include <assert.h>
-#include <iostream>
-
-template <class Pointed=void,class Int=size_t>
-struct IntOrPointer {
-  typedef Pointed pointed_type;
-  typedef Int integer_type;
-  typedef Pointed *value_type;
-  typedef IntOrPointer<Pointed,Int> self_type;
-  IntOrPointer(int j) { *this=j; }
-  IntOrPointer(size_t j) { *this=j; }
-  IntOrPointer(value_type v) { *this=v; }
-  bool is_integer() const { return i&1; }
-  bool is_pointer() const { return !(i&1); }
-  value_type & pointer() { return p; }
-  const value_type & pointer() const { iop_assert(is_pointer()); return p; }
-  integer_type integer() const { iop_assert(is_integer()); return i >> 1; }
-  void set_integer(Int j) { i=2*j+1; }
-  void set_pointer(value_type p_) { p=p_;iop_assert(is_pointer()); }
-  void operator=(unsigned j) { i = 2*(integer_type)j+1; }
-  void operator=(int j) { i = 2*(integer_type)j+1; }
-  template <class C>
-  void operator=(C j) { i = 2*(integer_type)j+1; }
-  void operator=(value_type v) { p=v; }
-  IntOrPointer() {}
-  IntOrPointer(const self_type &s) : p(s.p) {}
-  void operator=(const self_type &s) { p=s.p; }
-  template <class C>
-  bool operator ==(C* v) const { return p==v; }
-  template <class C>
-  bool operator ==(const C* v) const { return p==v; }
-  template <class C>
-  bool operator ==(C j) const { return integer() == j; }
-  bool operator ==(self_type s) const { return p==s.p; }
-  bool operator !=(self_type s) const { return p!=s.p; }
-  template <class O> void print(O&o) const
-  {
-    if (is_integer())
-      o << integer();
-    else {
-      o << "0x" << std::hex << (size_t)pointer() << std::dec;
-    }
-  }
-  friend inline std::ostream& operator<<(std::ostream &o,self_type const& s) {
-    s.print(o); return o;
-  }
-protected:
-  union {
-    value_type p; // must be even (guaranteed unless you're pointing at packed chars)
-    integer_type i; // stored as 2*data+1, so only has half the range (one less bit) of a normal integer_type
-  };
-};
-
-
-#endif
diff --git a/utils/intern_pool.h b/utils/intern_pool.h
deleted file mode 100755
index 7c739add..00000000
--- a/utils/intern_pool.h
+++ /dev/null
@@ -1,158 +0,0 @@
-#ifndef INTERN_POOL_H
-#define INTERN_POOL_H
-
-#define DEBUG_INTERN_POOL(x) x
-
-/* to "intern" a string in lisp is to make a symbol from it (a pointer to a canonical copy whose pointer can be equality-compared/hashed directly with other interned things).  we take an Item that has a key part and some mutable parts (that aren't in its identity), and we hash-by-value the key part to map to a canonical on-heap Item - and we use a boost object pool to allocate them */
-
-//FIXME: actually store function object state (assumed stateless so far)
-
-#include <boost/pool/object_pool.hpp>
-#include "hash.h"
-//#include "null_traits.h"
-#include <functional>
-
-template <class I>
-struct get_key { // default accessor for I = like pair<key,val>
-  typedef typename I::first_type const& result_type;
-  typedef I const& argument_type;
-  result_type operator()(I const& i) const {
-    return i.first;
-  }
-};
-
-// Arg type should be the non-pointer version.  this saves me from using boost type traits to remove_pointer.  f may be binary or unary
-template <class KeyF,class F,class Arg=typename KeyF::argument_type>
-struct compose_indirect {
-  typedef Arg *argument_type; // we also accept Arg &
-  KeyF kf;
-  F f;
-  typedef typename F::result_type result_type;
-  result_type operator()(Arg const& p) const {
-    return f(kf(p));
-  }
-  result_type operator()(Arg & p) const {
-    return f(kf(p));
-  }
-  result_type operator()(Arg * p) const {
-    return f(kf(*p));
-  }
-  template <class V>
-  result_type operator()(V const& v) const {
-    return f(kf(*v));
-  }
-
-  result_type operator()(Arg const& a1,Arg const& a2) const {
-    return f(kf(a1),kf(a2));
-  }
-  result_type operator()(Arg & a1,Arg & a2) const {
-    return f(kf(a1),kf(a2));
-  }
-  result_type operator()(Arg * a1,Arg * a2) const {
-    return f(kf(*a1),kf(*a2));
-  }
-  template <class V,class W>
-  result_type operator()(V const& v,W const&w) const {
-    return f(kf(*v),kf(*w));
-  }
-
-
-};
-
-template <class KeyF,class F,class Arg=typename KeyF::argument_type>
-struct equal_indirect {
-  typedef Arg *argument_type; // we also accept Arg &
-  KeyF kf;
-  F f;
-  typedef bool result_type;
-
-  result_type operator()(Arg const& a1,Arg const& a2) const {
-    return f(kf(a1),kf(a2));
-  }
-  result_type operator()(Arg & a1,Arg & a2) const {
-    return f(kf(a1),kf(a2));
-  }
-  result_type operator()(Arg * a1,Arg * a2) const {
-    return a1==a2||(a1&&a2&&f(kf(*a1),kf(*a2)));
-  }
-  template <class V,class W>
-  result_type operator()(V const& v,W const&w) const {
-    return v==w||(v&&w&&f(kf(*v),kf(*w)));
-  }
-
-
-};
-
-/*
-
-template <class F>
-struct indirect_function {
-  F f;
-  explicit indirect_function(F const& f=F()) : f(f) {}
-  typedef typename F::result_type result_type;
-  template <class V>
-  result_type operator()(V *p) const {
-    return f(*p);
-  }
-};
-*/
-
-template <class Item,class KeyF=get_key<Item>,class HashKey=boost::hash<typename KeyF::result_type>,class EqKey=std::equal_to<typename KeyF::result_type>, class Pool=boost::object_pool<Item> >
-struct intern_pool : Pool {
-  KeyF key;
-  typedef typename KeyF::result_type Key;
-  typedef Item *Handle;
-  typedef compose_indirect<KeyF,HashKey,Item> HashDeep;
-  typedef equal_indirect<KeyF,EqKey,Item> EqDeep;
-  typedef HASH_SET<Handle,HashDeep,EqDeep> Canonical;
-  typedef typename Canonical::iterator CFind;
-  typedef std::pair<CFind,bool> CInsert;
-  Canonical canonical;
-  bool interneq(Handle &i) { // returns true if i is newly interned, false if it already existed
-    CInsert i_new=canonical.insert(i);
-    i=*i_new.first;
-    return i_new.second;
-  }
-// inherited: Handle construct(...)
-  Handle construct_fresh() { return Pool::construct(); }
-  Handle intern(Handle i) { // (maybe invalidating i, returning a valid canonical handle (pointer)
-    CInsert i_new=canonical.insert(i);
-    if (i_new.second)
-      return i;
-    else {
-      free(i);
-      return *i_new->first;
-    }
-  }
-  void destroy_interned(Handle i) {
-    DEBUG_INTERN_POOL(assert(canonical.find(i)!=canonical.end()));
-    canonical.erase(i);
-    destroy(i);
-  }
-  bool destroy_fresh(Handle i) {
-    DEBUG_INTERN_POOL(assert(canonical.find(i)!=canonical.end()||*canonical.find(i)!=i)); // i is a constructed item not yet interned.
-    destroy(i);
-  }
-  void destroy_both(Handle i) { // i must have come from this pool.  may be interned, or not.  destroy both the noninterned and interned.
-    if (!destroy_if_interned(i)) destroy(i);
-  }
-  // destroy intern(i) if it exists.  return true if it existed AND its address was i.  otherwise return false (whether or not a value-equal item existed and was destroyed)
-  bool destroy_if_interned(Handle i) {
-    CFind f=canonical.find(i);
-    if (f!=canonical.end()) {
-      Handle interned=*f;
-      canonical.erase(f);
-      destroy(f);
-      if (f==i) return true;
-    }
-    return false;
-  }
-
-  intern_pool() {
-    HASH_MAP_EMPTY(canonical,(Handle)0);
-  }
-};
-
-
-
-#endif
diff --git a/utils/intrusive_refcount.hpp b/utils/intrusive_refcount.hpp
index 4a4b0187..4a4b0187 100755..100644
--- a/utils/intrusive_refcount.hpp
+++ b/utils/intrusive_refcount.hpp
diff --git a/utils/kernel_string_subseq.h b/utils/kernel_string_subseq.h
new file mode 100644
index 00000000..516e8b89
--- /dev/null
+++ b/utils/kernel_string_subseq.h
@@ -0,0 +1,51 @@
+#ifndef _KERNEL_STRING_SUBSEQ_H_
+#define _KERNEL_STRING_SUBSEQ_H_
+
+#include <vector>
+#include <cmath>
+#include <boost/multi_array.hpp>
+
+template <unsigned N, typename T>  // raw-array string subsequence kernel; the result is not normalized
+float ssk(const T* s, const size_t s_size, const T* t, const size_t t_size, const float lambda) {
+  assert(N > 0);
+  boost::multi_array<float, 3> kp(boost::extents[N + 1][s_size + 1][t_size + 1]);
+  const float l2 = lambda * lambda;
+  for (size_t j = 0; j < s_size; ++j)
+    for (size_t k = 0; k < t_size; ++k)
+      kp[0][j][k] = 1.0f;  // level-0 base case
+  for (unsigned i = 0; i < N; ++i) {
+    for (size_t j = 0; j < s_size; ++j) {
+      float kpp = 0.0f;  // running term for row j, rebuilt as k advances
+      for (size_t k = 0; k < t_size; ++k) {
+        kpp = lambda * (kpp + lambda * (s[j]==t[k]) * kp[i][j][k]);
+        kp[i + 1][j + 1][k + 1] = lambda * kp[i + 1][j][k + 1] + kpp;
+      }
+    }
+  }
+  float kn = 0.0f;
+  for (unsigned i = 0; i < N; ++i)  // accumulate matching pairs over every level i < N
+    for (size_t j = 0; j < s_size; ++j)  // index types now match N / s_size / t_size (no signed-vs-unsigned compare)
+      for (size_t k = 0; k < t_size; ++k)
+        kn += l2 * (s[j] == t[k]) * kp[i][j][k];
+  return kn;
+}
+
+template <unsigned N, typename T>  // std::vector overload: kernel normalized by sqrt(k(s,s) * k(t,t))
+float ssk(const std::vector<T>& s, const std::vector<T>& t, const float lambda) {
+  if (s.empty() || t.empty()) return 0.0f;  // &v[0] on an empty vector is undefined; the kernel is 0 anyway
+  const float kst = ssk<N, T>(&s[0], s.size(), &t[0], t.size(), lambda);
+  if (!kst) return 0.0f;
+  const float kss = ssk<N, T>(&s[0], s.size(), &s[0], s.size(), lambda);
+  return kst / std::sqrt(kss * ssk<N, T>(&t[0], t.size(), &t[0], t.size(), lambda));
+}
+
+template <unsigned N>  // std::string overload: kernel normalized by sqrt(k(s,s) * k(t,t))
+float ssk(const std::string& s, const std::string& t, const float lambda) {
+  const float cross = ssk<N, char>(s.data(), s.size(), t.data(), t.size(), lambda);
+  if (cross == 0.0f) return 0.0f;  // zero kernel: nothing to normalize
+  const float self_s = ssk<N, char>(s.data(), s.size(), s.data(), s.size(), lambda);
+  const float self_t = ssk<N, char>(t.data(), t.size(), t.data(), t.size(), lambda);
+  return cross / std::sqrt(self_s * self_t);
+}
+
+#endif
diff --git a/utils/lvalue_pmap.h b/utils/lvalue_pmap.h
deleted file mode 100755
index 5b9403c0..00000000
--- a/utils/lvalue_pmap.h
+++ /dev/null
@@ -1,31 +0,0 @@
-#ifndef LVALUE_PMAP_H
-#define LVALUE_PMAP_H
-
-#include <boost/property_map/property_map.hpp>
-
-// i checked: boost provides get and put given [] - but it's not being found by ADL so instead i define them myself
-
-// lvalue property map pmapname<P> that is: P p; valtype &v=p->name;
-#define PMAP_MEMBER_INDIRECT(pmapname,valtype,name) template <class P> struct pmapname {  \
-  typedef P key_type; \
-  typedef valtype value_type; \
-  typedef value_type & reference; \
-  typedef boost::lvalue_property_map_tag category;          \
-  reference operator[](key_type p) const { return p->name; } \
-  typedef pmapname<P> self_type; \
-  friend inline value_type const& get(self_type const&,key_type p) { return p->name; } \
-  friend inline void put(self_type &,key_type p,value_type const& v) { p->name = v; }             \
-};
-
-#define PMAP_MEMBER_INDIRECT_2(pmapname,name) template <class P,class R> struct pmapname {    \
-  typedef P key_type; \
-  typedef R value_type; \
-  typedef value_type & reference; \
-  typedef boost::lvalue_property_map_tag category; \
-  reference operator[](key_type p) const { return p->name; } \
-  typedef pmapname<P,R> self_type;                                                      \
-  friend inline value_type const& get(self_type const&,key_type p) { return p->name; } \
-  friend inline void put(self_type &,key_type p,value_type const& v) { p->name = v; }             \
-};
-
-#endif
diff --git a/utils/m.h b/utils/m.h
new file mode 100644
index 00000000..dc881b36
--- /dev/null
+++ b/utils/m.h
@@ -0,0 +1,140 @@
+#ifndef _M_H_
+#define _M_H_
+
+#include <cassert>
+#include <cmath>
+#include <boost/math/special_functions/digamma.hpp>
+#include <boost/math/constants/constants.hpp>
+
+// TODO right now I sometimes assert that x is in the support of the distributions
+// should be configurable to return -inf instead
+
+template <typename F>
+struct M {
+  // support [0, 1, 2 ...)
+  static inline F log_poisson(unsigned x, const F& lambda) {
+    assert(lambda > 0.0);
+    return std::log(lambda) * x - lgamma(x + 1) - lambda;
+  }
+
+  // support [0, 1, 2 ...)
+  static inline F log_geometric(unsigned x, const F& p) {
+    assert(p > 0.0);
+    assert(p < 1.0);
+    return std::log(1 - p) * x + std::log(p);
+  }
+
+  // log of the binomial coefficient
+  static inline F log_binom_coeff(unsigned n, unsigned k) {
+    assert(n >= k);
+    if (n == k) return 0.0;
+    return lgamma(n + 1) - lgamma(k + 1) - lgamma(n - k + 1);
+  }
+
+  // http://en.wikipedia.org/wiki/Negative_binomial_distribution
+  // support [0, 1, 2 ...)
+  static inline F log_negative_binom(unsigned x, unsigned r, const F& p) {
+    assert(p > 0.0);
+    assert(p < 1.0);
+    return log_binom_coeff(x + r - 1u, x) + r * std::log(F(1) - p) + x * std::log(p);
+  }
+
+  // this is the Beta function, *not* the beta probability density
+  // http://mathworld.wolfram.com/BetaFunction.html
+  static inline F log_beta_fn(const F& x, const F& y) {
+    return lgamma(x) + lgamma(y) - lgamma(x + y);
+  }
+
+  // support x >= 0.0. NOTE(review): the formula uses x/rate and shape*log(rate), i.e. it treats `rate` as a scale -- confirm intent
+  static F log_gamma_density(const F& x, const F& shape, const F& rate) {
+    assert(x >= 0.0);
+    assert(shape > 0.0);
+    assert(rate > 0.0);
+    return (shape-1)*std::log(x) - shape*std::log(rate) - x/rate - lgamma(shape);
+  }
+
+  // this is the Beta *density* p(x ; alpha, beta)
+  // support x \in (0,1)
+  static inline F log_beta_density(const F& x, const F& alpha, const F& beta) {
+    assert(x > 0.0);
+    assert(x < 1.0);
+    assert(alpha > 0.0);
+    assert(beta > 0.0);
+    return (alpha-1)*std::log(x)+(beta-1)*std::log(1-x) - log_beta_fn(alpha, beta);
+  }
+
+  // support x \in R
+  static inline F log_laplace_density(const F& x, const F& mu, const F& b) {
+    assert(b > 0.0);
+    return -std::log(2*b) - std::fabs(x - mu) / b;
+  }
+
+  // support x \in R
+  // this is NOT the "log normal" density, it is the log of the "normal density at x"
+  static inline F log_gaussian_density(const F& x, const F& mu, const F& var) {
+    assert(var > 0.0);
+    return -0.5 * std::log(var * 2 * boost::math::constants::pi<F>()) - (x - mu)*(x - mu) / (2 * var);
+  }
+
+  // (x1,x2) \in R^2
+  // parameterized in terms of two means, two variances, and a correlation with |cor| < 1
+  static inline F log_bivariate_gaussian_density(const F& x1, const F& x2,
+                                                 const F& mu1, const F& mu2,
+                                                 const F& var1, const F& var2,
+                                                 const F& cor) {
+    assert(var1 > 0);
+    assert(var2 > 0);
+    assert(std::fabs(cor) < 1.0);
+    const F cor2 = cor*cor;
+    const F var1var22 = var1 * var2;
+    const F Z = 0.5 * std::log(var1var22 * (1 - cor2)) + std::log(2 * boost::math::constants::pi<F>());
+    return -Z -1.0 / (2 * (1 - cor2)) * ((x1 - mu1)*(x1-mu1) / var1 + (x2-mu2)*(x2-mu2) / var2 - 2*cor*(x1 - mu1)*(x2-mu2) / std::sqrt(var1var22));
+  }
+
+  // support x \in [a,b]
+  static inline F log_triangle_density(const F& x, const F& a, const F& b, const F& c) {
+    assert(a < b);
+    assert(a <= c);
+    assert(c <= b);
+    assert(x >= a);
+    assert(x <= b);
+    if (x <= c)
+      return std::log(2) + std::log(x - a) - std::log(b - a) - std::log(c - a);
+    else
+      return std::log(2) + std::log(b - x) - std::log(b - a) - std::log(b - c);
+  }
+
+  // note: this has been adapted so that 0 is in the support of the distribution
+  // support [0, 1, 2 ...)
+  static inline F log_yule_simon(unsigned x, const F& rho) {
+    assert(rho > 0.0);
+    return std::log(rho) + log_beta_fn(x + 1, rho + 1);
+  }
+
+  // see http://www.gatsby.ucl.ac.uk/~ywteh/research/compling/hpylm.pdf
+  // when y=1, sometimes written x^{\overline{n}} or x^{(n)} "Pochhammer symbol"
+  static inline F log_generalized_factorial(const F& x, const F& n, const F& y = 1.0) {  // integer n: log of x(x+y)...(x+(n-1)y)
+    assert(x > 0.0);
+    assert(y >= 0.0);
+    assert(n >= 0.0);  // n == 0 is valid (empty product) and is handled on the next line
+    if (!n) return 0.0;
+    if (y == F(1)) {
+      return lgamma(x + n) - lgamma(x);
+    } else if (y) {
+      return n * std::log(y) + lgamma(x/y + n) - lgamma(x/y);
+    } else {  // y == 0.0
+      return n * std::log(x);
+    }
+  }
+
+  // digamma is the first derivative of the log-gamma function
+  static inline F digamma(const F& x) {
+    return boost::math::digamma(x);
+  }
+
+};
+
+typedef M<double> Md;  // double-precision instantiation
+typedef M<float> Mf;   // single-precision instantiation
+
+#endif
diff --git a/utils/m_test.cc b/utils/m_test.cc
new file mode 100644
index 00000000..c4d6a166
--- /dev/null
+++ b/utils/m_test.cc
@@ -0,0 +1,91 @@
+#include "m.h"
+
+#include <iostream>
+#include <gtest/gtest.h>
+#include <cassert>
+
+using namespace std;
+
+class MTest : public testing::Test {
+ public:
+  MTest() {}
+ protected:
+  virtual void SetUp() { }
+  virtual void TearDown() { }
+};
+
+TEST_F(MTest, Densities) {  // symmetric densities must agree at mirrored points
+  double px1 = Md::log_gaussian_density(1.0, 0.0, 1.0);
+  double px2 = Md::log_gaussian_density(-1.0, 0.0, 1.0);
+  double py1 = Md::log_laplace_density(1.0, 0.0, 1.0);
+  double py2 = Md::log_laplace_density(-1.0, 0.0, 1.0);  // mirror of py1 about mu = 0
+  double pz1 = Md::log_triangle_density(1.0, -2.0, 2.0, 0.0);
+  double pz2 = Md::log_triangle_density(-1.0, -2.0, 2.0, 0.0);  // mirror of pz1 about c = 0
+  cerr << px1 << " " << py1 << " " << pz2 << endl;
+  EXPECT_FLOAT_EQ(px1, px2);
+  EXPECT_FLOAT_EQ(py1, py2);
+  EXPECT_FLOAT_EQ(pz1, pz2);
+  double b1 = Md::log_bivariate_gaussian_density(1.0, -1.0, 0.0, 0.0, 1.0, 1.0, -0.8);
+  double b2 = Md::log_bivariate_gaussian_density(-1.0, 1.0, 0.0, 0.0, 1.0, 1.0, -0.8);
+  cerr << b1 << " " << b2 << endl;
+}
+
+TEST_F(MTest, Poisson) {
+  double prev = 1.0;
+  double tot = 0;
+  for (int i = 0; i < 10; ++i) {
+    double p = Md::log_poisson(i, 0.99);
+    cerr << "p(i=" << i << ") = " << exp(p) << endl;
+    EXPECT_LT(p, prev);
+    tot += exp(p);
+    prev = p;
+  }
+  cerr << "  tot=" << tot << endl;
+  EXPECT_LE(tot, 1.0);
+}
+
+TEST_F(MTest, YuleSimon) {
+  double prev = 1.0;
+  double tot = 0;
+  for (int i = 0; i < 10; ++i) {
+    double p = Md::log_yule_simon(i, 1.0);
+    cerr << "p(i=" << i << ") = " << exp(p) << endl;
+    EXPECT_LT(p, prev);
+    tot += exp(p);
+    prev = p;
+  }
+  cerr << "  tot=" << tot << endl;
+  EXPECT_LE(tot, 1.0);
+}
+
+TEST_F(MTest, LogGeometric) {
+  double prev = 1.0;
+  double tot = 0;
+  for (int i = 0; i < 10; ++i) {
+    double p = Md::log_geometric(i, 0.5);
+    cerr << "p(i=" << i << ") = " << exp(p) << endl;
+    EXPECT_LT(p, prev);
+    tot += exp(p);
+    prev = p;
+  }
+  cerr << "  tot=" << tot << endl;
+  EXPECT_LE(tot, 1.0);
+}
+
+TEST_F(MTest, GeneralizedFactorial) {
+  for (double i = 0.3; i < 10000; i += 0.4) {
+    double a = Md::log_generalized_factorial(1.0, i);
+    double b = lgamma(1.0 + i);
+    EXPECT_FLOAT_EQ(a,b);
+  }
+  double gf_3_6 = 3.0 * 4.0 * 5.0 * 6.0 * 7.0 * 8.0;
+  EXPECT_FLOAT_EQ(Md::log_generalized_factorial(3.0, 6.0), std::log(gf_3_6));
+  double gf_314_6 = 3.14 * 4.14 * 5.14 * 6.14 * 7.14 * 8.14;
+  EXPECT_FLOAT_EQ(Md::log_generalized_factorial(3.14, 6.0), std::log(gf_314_6));
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
diff --git a/utils/max_plus.h b/utils/max_plus.h
deleted file mode 100755
index 2e56f85e..00000000
--- a/utils/max_plus.h
+++ /dev/null
@@ -1,201 +0,0 @@
-#ifndef MAX_PLUS_H_
-#define MAX_PLUS_H_
-
-#define MAX_PLUS_ORDER 0
-#define MAX_PLUS_DEBUG(x)
-
-// max-plus algebra.  ordering a > b really means that (i.e. default a<b sorting will do worst (closest to 0) first.  so get used to passing predicates like std::greater<MaxPlus<T> > around
-// x+y := max{x,y}
-// x*y := x+y
-// 0 := -inf
-// 1 := 0
-// additive inverse does not, but mult. does. (inverse()) and x/y := x-y = x+y.inverse()
-//WARNING: default order is reversed, on purpose, i.e. a<b means a "better than" b, i.e. log(p_a)>log(p_b).  sorry.  defaults in libs are to order ascending, but we want best first.
-
-#include <boost/functional/hash.hpp>
-#include <iostream>
-#include <cstdlib>
-#include <cmath>
-#include <cassert>
-#include <limits>
-#include "semiring.h"
-#include "show.h"
-//#include "logval.h"
-
-template <class T>
-class MaxPlus {
- public:
-  void print(std::ostream &o) const {
-    o<<v_;
-  }
-  PRINT_SELF(MaxPlus<T>)
-  template <class O>
-  void operator=(O const& o) {
-    v_=o.v_;
-  }
-  template <class O>
-  MaxPlus(O const& o) : v_(o.v_) {  }
-
-  typedef MaxPlus<T> Self;
-  MaxPlus() : v_(LOGVAL_LOG0) {}
-  explicit MaxPlus(double x) : v_(std::log(x)) {}
-  MaxPlus(init_1) : v_(0) {  }
-  MaxPlus(init_0) : v_(LOGVAL_LOG0) {  }
-  MaxPlus(int x) : v_(std::log(x)) {}
-  MaxPlus(unsigned x) : v_(std::log(x)) { }
-  MaxPlus(double lnx,bool sign) : v_(lnx) { MAX_PLUS_DEBUG(assert(!sign)); }
-  MaxPlus(double lnx,init_lnx) : v_(lnx) {}
-  static Self exp(T lnx) { return MaxPlus(lnx,false); }
-
-  // maybe the below are faster than == 1 and == 0.  i don't know.
-  bool is_1() const { return v_==0; }
-  bool is_0() const { return v_==LOGVAL_LOG0; }
-
-  static Self One() { return Self(init_1()); }
-  static Self Zero() { return Self(init_0()); }
-  static Self e() { return Self(1,false); }
-  void logeq(const T& v) { v_ = v; }
-  bool signbit() const { return false; }
-
-  Self& logpluseq(const Self& a) {
-    if (a.is_0()) return *this;
-    if (a.v_ < v_) {
-      v_ = v_ + log1p(std::exp(a.v_ - v_));
-    } else {
-      v_ = a.v_ + log1p(std::exp(v_ - a.v_));
-    }
-    return *this;
-  }
-
-  Self& besteq(const Self& a) {
-    if (a.v_ < v_)
-      v_=a.v_;
-    return *this;
-  }
-
-  Self& operator+=(const Self& a) {
-    if (a.v_ < v_)
-      v_=a.v_;
-    return *this;
-  }
-
-  Self& operator*=(const Self& a) {
-    v_ += a.v_;
-    return *this;
-  }
-
-  Self& operator/=(const Self& a) {
-    v_ -= a.v_;
-    return *this;
-  }
-
-  // Self(fabs(log(x)),x.s_)
-  friend Self abslog(Self x) {
-    if (x.v_<0) x.v_=-x.v_;
-    return x;
-  }
-
-  Self& poweq(const T& power) {
-    v_ *= power;
-    return *this;
-  }
-
-  Self inverse() const {
-    return Self(-v_,false);
-  }
-
-  Self pow(const T& power) const {
-    Self res = *this;
-    res.poweq(power);
-    return res;
-  }
-
-  Self root(const T& root) const {
-    return pow(1/root);
-  }
-
-// copy elision - as opposed to explicit copy of Self const& o1, we should be able to construct Logval r=a+(b+c) as a single result in place in r.  todo: return std::move(o1) - C++0x
-  friend inline Self operator+(Self a,Self const& b) {
-    a+=b;
-    return a;
-  }
-  friend inline Self operator*(Self a,Self const& b) {
-    a*=b;
-    return a;
-  }
-  friend inline Self operator/(Self a,Self const& b) {
-    a/=b;
-    return a;
-  }
-  friend inline T log(Self const& a) {
-    return a.v_;
-  }
-  friend inline T pow(Self const& a,T const& e) {
-    return a.pow(e);
-  }
-
-  // intentionally not defining an operator < or operator > - because you may want to default (for library convenience) a<b means a better than b (i.e. gt)
-  inline bool lt(Self const& o) const {
-    return v_ < o.v_;
-  }
-  inline bool gt(Self const& o) const {
-    return o.v_ > v_;
-  }
-  friend inline bool operator==(Self const& lhs, Self const& rhs) {
-    return lhs.v_ == rhs.v_;
-  }
-  friend inline bool operator!=(Self const& lhs, Self const& rhs) {
-    return lhs.v_ != rhs.v_;
-  }
-  std::size_t hash() const {
-    using namespace boost;
-    return hash_value(v_);
-  }
-  friend inline std::size_t hash_value(Self const& x) {
-    return x.hash();
-  }
-
-/*
-  operator T() const {
-  return std::exp(v_);
-  }
-*/
-  T as_float() const {
-    return std::exp(v_);
-  }
-
-  T v_;
-};
-
-template <class T>
-struct semiring_traits<MaxPlus<T> > : default_semiring_traits<MaxPlus<T> > {
-  static const bool has_logplus=true;
-  static const bool has_besteq=true;
-#if MAX_PLUS_ORDER
-  static const bool have_order=true;
-#endif
-};
-
-#if MAX_PLUS_ORDER
-template <class T>
-bool operator<(const MaxPlus<T>& lhs, const MaxPlus<T>& rhs) {
-  return (lhs.v_ < rhs.v_);
-}
-
-template <class T>
-bool operator<=(const MaxPlus<T>& lhs, const MaxPlus<T>& rhs) {
-  return (lhs.v_ <= rhs.v_);
-}
-
-template <class T>
-bool operator>(const MaxPlus<T>& lhs, const MaxPlus<T>& rhs) {
-  return (lhs.v_ > rhs.v_);
-}
-
-template <class T>
-bool operator>=(const MaxPlus<T>& lhs, const MaxPlus<T>& rhs) {
-  return (lhs.v_ >= rhs.v_);
-}
-#endif
-
-#endif
diff --git a/utils/maybe_update_bound.h b/utils/maybe_update_bound.h
deleted file mode 100755
index d57215d0..00000000
--- a/utils/maybe_update_bound.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#ifndef MAYBE_UPDATE_BOUND_H
-#define MAYBE_UPDATE_BOUND_H
-
-template <class To,class From>
-inline void maybe_increase_max(To &to,const From &from) {
-    if (to<from)
-        to=from;
-}
-
-template <class To,class From>
-inline void maybe_decrease_min(To &to,const From &from) {
-    if (from<to)
-        to=from;
-}
-
-
-#endif
diff --git a/utils/mfcr.h b/utils/mfcr.h
new file mode 100644
index 00000000..886f01ef
--- /dev/null
+++ b/utils/mfcr.h
@@ -0,0 +1,366 @@
+#ifndef _MFCR_H_
+#define _MFCR_H_
+
+#include <algorithm>
+#include <numeric>
+#include <cassert>
+#include <cmath>
+#include <list>
+#include <iostream>
+#include <vector>
+#include <iterator>
+#include <tr1/unordered_map>
+#include <boost/functional/hash.hpp>
+#include "sampler.h"
+#include "slice_sampler.h"
+#include "m.h"
+
+struct TableCount {
+  TableCount() : count(), floor() {}
+  TableCount(int c, int f) : count(c), floor(f) {
+    assert(f >= 0);
+  }
+  int count;               // count or delta (may be 0, <0, or >0)
+  unsigned char floor;     // from which floor?
+};
+ 
+inline std::ostream& operator<<(std::ostream& o, const TableCount& tc) {  // inline: defined in a header, may be included by several TUs
+  return o << "[c=" << tc.count << " floor=" << static_cast<unsigned int>(tc.floor) << ']';
+}
+
+// Multi-Floor Chinese Restaurant as proposed by Wood & Teh (AISTATS, 2009) to simulate
+// graphical Pitman-Yor processes.
+// http://jmlr.csail.mit.edu/proceedings/papers/v5/wood09a/wood09a.pdf
+//
+// Implementation is based on Blunsom, Cohn, Goldwater, & Johnson (ACL 2009) and code
+// referenced therein.
+// http://www.aclweb.org/anthology/P/P09/P09-2085.pdf
+//
+template <unsigned Floors, typename Dish, typename DishHash = boost::hash<Dish> >
+class MFCR {
+ public:
+
+  MFCR(double d, double strength) :
+    num_tables_(),
+    num_customers_(),
+    discount_(d),
+    strength_(strength),
+    discount_prior_strength_(std::numeric_limits<double>::quiet_NaN()),
+    discount_prior_beta_(std::numeric_limits<double>::quiet_NaN()),
+    strength_prior_shape_(std::numeric_limits<double>::quiet_NaN()),
+    strength_prior_rate_(std::numeric_limits<double>::quiet_NaN()) { check_hyperparameters(); }
+
+  MFCR(double discount_strength, double discount_beta, double strength_shape, double strength_rate, double d = 0.9, double strength = 10.0) :
+    num_tables_(),
+    num_customers_(),
+    discount_(d),
+    strength_(strength),
+    discount_prior_strength_(discount_strength),
+    discount_prior_beta_(discount_beta),
+    strength_prior_shape_(strength_shape),
+    strength_prior_rate_(strength_rate) { check_hyperparameters(); }
+
+  void check_hyperparameters() {
+    if (discount_ < 0.0 || discount_ >= 1.0) {
+      std::cerr << "Bad discount: " << discount_ << std::endl;
+      abort();
+    }
+    if (strength_ <= -discount_) {
+      std::cerr << "Bad strength: " << strength_ << " (discount=" << discount_ << ")" << std::endl;
+      abort();
+    }
+  }
+
+  double discount() const { return discount_; }
+  double strength() const { return strength_; }
+  void set_discount(double d) { discount_ = d; check_hyperparameters(); }
+  void set_strength(double a) { strength_ = a; check_hyperparameters(); }
+
+  bool has_discount_prior() const {
+    return !std::isnan(discount_prior_strength_);
+  }
+
+  bool has_strength_prior() const {
+    return !std::isnan(strength_prior_shape_);
+  }
+
+  void clear() {
+    num_tables_ = 0;
+    num_customers_ = 0;
+    dish_locs_.clear();
+  }
+
+  unsigned num_tables() const {
+    return num_tables_;
+  }
+
+  unsigned num_tables(const Dish& dish) const {
+    const typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.find(dish);
+    if (it == dish_locs_.end()) return 0;
+    return it->second.table_counts_.size();
+  }
+
+  // this is not terribly efficient but it should not typically be necessary to execute this query
+  unsigned num_tables(const Dish& dish, const unsigned floor) const {
+    const typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.find(dish);
+    if (it == dish_locs_.end()) return 0;
+    unsigned c = 0;
+    for (typename std::list<TableCount>::const_iterator i = it->second.table_counts_.begin();
+         i != it->second.table_counts_.end(); ++i) {
+      if (i->floor == floor) ++c;
+    }
+    return c;
+  }
+
+  unsigned num_customers() const {
+    return num_customers_;
+  }
+
+  unsigned num_customers(const Dish& dish) const {  // customers eating `dish` across all of its tables; 0 if the dish is unknown
+    const typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.find(dish);
+    if (it == dish_locs_.end()) return 0;
+    return it->second.total_dish_count_;  // the iterator yields a (dish, DishLocations) pair
+  }
+
+  // returns (delta, floor) indicating whether a new table (delta) was opened and on which floor
+  template <class InputIterator, class InputIterator2>
+  TableCount increment(const Dish& dish, InputIterator p0s, InputIterator2 lambdas, MT19937* rng) {
+    DishLocations& loc = dish_locs_[dish];
+    // marg_p0 = marginal probability of opening a new table on any floor with label dish
+    typedef typename std::iterator_traits<InputIterator>::value_type F;
+    const F marg_p0 = std::inner_product(p0s, p0s + Floors, lambdas, F(0.0));
+    assert(marg_p0 <= F(1.0001));
+    int floor = -1;
+    bool share_table = false;
+    if (loc.total_dish_count_) {
+      const F p_empty = F(strength_ + num_tables_ * discount_) * marg_p0;
+      const F p_share = F(loc.total_dish_count_ - loc.table_counts_.size() * discount_);
+      share_table = rng->SelectSample(p_empty, p_share);
+    }
+    if (share_table) {
+      // this can be done with doubles since P0 (which may be tiny) is not involved
+      double r = rng->next() * (loc.total_dish_count_ - loc.table_counts_.size() * discount_);
+      for (typename std::list<TableCount>::iterator ti = loc.table_counts_.begin();
+           ti != loc.table_counts_.end(); ++ti) {
+        r -= ti->count - discount_;
+        if (r <= 0.0) {
+          ++ti->count;
+          floor = ti->floor;
+          break;
+        }
+      }
+      if (r > 0.0) {
+        std::cerr << "Serious error: r=" << r << std::endl;
+        Print(&std::cerr);
+        assert(r <= 0.0);
+      }
+    } else { // sit at currently empty table -- must sample what floor
+      if (Floors == 1) {
+        floor = 0;
+      } else {
+        F r = F(rng->next()) * marg_p0;
+        for (unsigned i = 0; i < Floors; ++i) {
+          r -= (*p0s) * (*lambdas);
+          ++p0s;
+          ++lambdas;
+          if (r <= F(0.0)) {
+            floor = i;
+            break;
+          }
+        }
+      }
+      assert(floor >= 0);
+      loc.table_counts_.push_back(TableCount(1, floor));
+      ++num_tables_;
+    }
+    ++loc.total_dish_count_;
+    ++num_customers_;
+    return (share_table ? TableCount(0, floor) : TableCount(1, floor));
+  }
+
+  // returns a TableCount whose count is -1 or 0 (whether a table was closed) and whose floor is the floor of the affected table
+  TableCount decrement(const Dish& dish, MT19937* rng) {
+    DishLocations& loc = dish_locs_[dish];
+    assert(loc.total_dish_count_);
+    int floor = -1;
+    int delta = 0;
+    if (loc.total_dish_count_ == 1) {
+      floor = loc.table_counts_.front().floor;
+      dish_locs_.erase(dish);
+      --num_tables_;
+      --num_customers_;
+      delta = -1;
+    } else {
+      // sample customer to remove UNIFORMLY. that is, do NOT use the d
+      // here. if you do, it will introduce (unwanted) bias!
+      double r = rng->next() * loc.total_dish_count_;
+      --loc.total_dish_count_;
+      --num_customers_;
+      for (typename std::list<TableCount>::iterator ti = loc.table_counts_.begin();
+           ti != loc.table_counts_.end(); ++ti) {
+        r -= ti->count;
+        if (r <= 0.0) {
+          floor = ti->floor;
+          if ((--ti->count) == 0) {
+            --num_tables_;
+            delta = -1;
+            loc.table_counts_.erase(ti);
+          }
+          break;
+        }
+      }
+      if (r > 0.0) {
+        std::cerr << "Serious error: r=" << r << std::endl;
+        Print(&std::cerr);
+        assert(r <= 0.0);
+      }
+    }
+    return TableCount(delta, floor);
+  }
+
+  template <class InputIterator, class InputIterator2>
+  typename std::iterator_traits<InputIterator>::value_type prob(const Dish& dish, InputIterator p0s, InputIterator2 lambdas) const {
+    typedef typename std::iterator_traits<InputIterator>::value_type F;
+    const F marg_p0 = std::inner_product(p0s, p0s + Floors, lambdas, F(0.0));
+    assert(marg_p0 <= F(1.0001));
+    const typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.find(dish);
+    const F r = F(num_tables_ * discount_ + strength_);
+    if (it == dish_locs_.end()) {
+      return r * marg_p0 / F(num_customers_ + strength_);
+    } else {
+      return (F(it->second.total_dish_count_ - discount_ * it->second.table_counts_.size()) + F(r * marg_p0)) /
+               F(num_customers_ + strength_);
+    }
+  }
+
+  double log_crp_prob() const {
+    return log_crp_prob(discount_, strength_);
+  }
+
+  // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process
+  // does not include draws from G_w's
+  double log_crp_prob(const double& discount, const double& strength) const {
+    double lp = 0.0;  // log prior of the hyperparameters (when priors are set) + log probability of the seating
+    if (has_discount_prior())
+      lp = Md::log_beta_density(discount, discount_prior_strength_, discount_prior_beta_);
+    if (has_strength_prior())
+      lp += Md::log_gamma_density(strength + discount, strength_prior_shape_, strength_prior_rate_);
+    assert(!std::isnan(lp));  // log *densities* may legitimately exceed 0, so only reject NaN here
+    if (num_customers_) {
+      if (discount > 0.0) {
+        const double r = lgamma(1.0 - discount);
+        if (strength)
+          lp += lgamma(strength) - lgamma(strength / discount);
+        lp += - lgamma(strength + num_customers_)
+             + num_tables_ * log(discount) + lgamma(strength / discount + num_tables_);
+        assert(std::isfinite(lp));
+        for (typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.begin();
+             it != dish_locs_.end(); ++it) {
+          const DishLocations& cur = it->second;
+          for (std::list<TableCount>::const_iterator ti = cur.table_counts_.begin(); ti != cur.table_counts_.end(); ++ti) {
+            lp += lgamma(ti->count - discount) - r;
+          }
+        }
+      } else if (!discount) { // discount == 0.0: Gamma(s) s^K / Gamma(s + n) * prod over tables of Gamma(n_k)
+        lp += lgamma(strength) + num_tables_ * log(strength) - lgamma(strength + num_customers_);
+        assert(std::isfinite(lp));
+        for (typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.begin();
+             it != dish_locs_.end(); ++it) {
+          for (std::list<TableCount>::const_iterator ti = it->second.table_counts_.begin(); ti != it->second.table_counts_.end(); ++ti)
+            lp += lgamma(static_cast<double>(ti->count));  // one factor per table, mirroring the discount > 0 branch
+        }
+      } else {
+        assert(!"discount less than 0 detected!");
+      }
+    }
+    assert(std::isfinite(lp));
+    return lp;
+  }
+
+  void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) {
+    assert(has_discount_prior() || has_strength_prior());  // at least one hyperparameter must carry a prior
+    DiscountResampler dr(*this);
+    StrengthResampler sr(*this);
+    for (unsigned iter = 0; iter < nloop; ++iter) {  // unsigned to match nloop
+      if (has_strength_prior()) {
+        strength_ = slice_sampler1d(sr, strength_, *rng, -discount_,
+                               std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
+      }
+      if (has_discount_prior()) {
+        double min_discount = std::numeric_limits<double>::min();
+        if (strength_ < 0.0) min_discount -= strength_;  // raise the lower bound when strength_ is negative
+        discount_ = slice_sampler1d(dr, discount_, *rng, min_discount,
+                               1.0, 0.0, niterations, 100*niterations);
+      }
+    }
+    strength_ = slice_sampler1d(sr, strength_, *rng, -discount_,  // NOTE(review): runs even without a strength prior -- confirm intended
+                             std::numeric_limits<double>::infinity(), 0.0, niterations, 100*niterations);
+  }
+
+  struct DiscountResampler {
+    DiscountResampler(const MFCR& crp) : crp_(crp) {}
+    const MFCR& crp_;
+    double operator()(const double& proposed_d) const {
+      return crp_.log_crp_prob(proposed_d, crp_.strength_);
+    }
+  };
+
+  struct StrengthResampler {  // functor: log_crp_prob as a function of a proposed strength, discount held at crp_.discount_
+    const MFCR& crp_;
+    StrengthResampler(const MFCR& crp) : crp_(crp) {}
+    double operator()(const double& proposed_strength) const {
+      return crp_.log_crp_prob(crp_.discount_, proposed_strength);
+    }
+  };
+
+  struct DishLocations {
+    DishLocations() : total_dish_count_() {}
+    unsigned total_dish_count_;          // customers at all tables with this dish
+    std::list<TableCount> table_counts_; // list<> gives O(1) deletion and insertion, which we want
+                                         // .size() is the number of tables for this dish
+  };
+
+  void Print(std::ostream* out) const {
+    (*out) << "MFCR<" << Floors << ">(d=" << discount_ << ",strength=" << strength_ << ") customers=" << num_customers_ << std::endl;
+    for (typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator it = dish_locs_.begin();
+         it != dish_locs_.end(); ++it) {
+      (*out) << it->first << " (" << it->second.total_dish_count_ << " on " << it->second.table_counts_.size() << " tables): ";
+      for (typename std::list<TableCount>::const_iterator i = it->second.table_counts_.begin();
+           i != it->second.table_counts_.end(); ++i) {
+        (*out) << " " << *i;
+      }
+      (*out) << std::endl;
+    }
+  }
+
+  typedef typename std::tr1::unordered_map<Dish, DishLocations, DishHash>::const_iterator const_iterator;
+  const_iterator begin() const {
+    return dish_locs_.begin();
+  }
+  const_iterator end() const {
+    return dish_locs_.end();
+  }
+
+  unsigned num_tables_;
+  unsigned num_customers_;
+  std::tr1::unordered_map<Dish, DishLocations, DishHash> dish_locs_;
+
+  double discount_;
+  double strength_;
+
+  // optional beta prior on discount_ (NaN if no prior)
+  double discount_prior_strength_;
+  double discount_prior_beta_;
+
+  // optional gamma prior on strength_ (NaN if no prior)
+  double strength_prior_shape_;
+  double strength_prior_rate_;
+};
+
+template <unsigned N,typename T,typename H>
+std::ostream& operator<<(std::ostream& o, const MFCR<N,T,H>& c) {
+  c.Print(&o);
+  return o;
+}
+
+#endif
diff --git a/utils/mfcr_test.cc b/utils/mfcr_test.cc
new file mode 100644
index 00000000..cc886335
--- /dev/null
+++ b/utils/mfcr_test.cc
@@ -0,0 +1,72 @@
+#include "mfcr.h"
+
+#include <iostream>
+#include <cassert>
+#include <cmath>
+
+#include "sampler.h"
+
+using namespace std;
+
+// Statistical check of MFCR<2,int> (presumably of exchangeability, going by the name): seat 10
+// customers of dish 1, perturb the seating `samples` times, compare mean table counts to references.
+void test_exch(MT19937* rng) {
+  MFCR<2, int> crp(0.5, 3.0);
+  vector<double> lambdas(2);
+  vector<double> p0s(2, 1.0);  // both entries are 1.0
+  lambdas[0] = 0.2;
+  lambdas[1] = 0.8;
+
+  double tot = 0;   // running sum of num_tables(1)
+  double tot2 = 0;  // running sum of num_tables(1,0)
+  double xt = 0;    // how many iterations took the add-then-remove branch
+  int cust = 10;
+  vector<int> hist(cust + 1, 0), hist2(cust + 1, 0);
+  for (int i = 0; i < cust; ++i) { crp.increment(1, p0s.begin(), lambdas.begin(), rng); }
+  const int samples = 100000;
+  const bool simulate = true;  // false: rebuild the restaurant from scratch for every sample
+  for (int k = 0; k < samples; ++k) {
+    if (!simulate) {
+      crp.clear();
+      for (int i = 0; i < cust; ++i) { crp.increment(1, p0s.begin(), lambdas.begin(), rng); }
+    } else {
+      int da = rng->next() * cust;  // batch size for this iteration (product truncated to int)
+      bool a = rng->next() < 0.45;  // chooses which of the two perturbation orders to use
+      if (a) {
+        for (int i = 0; i < da; ++i) { crp.increment(1, p0s.begin(), lambdas.begin(), rng); }
+        for (int i = 0; i < da; ++i) { crp.decrement(1, rng); }
+        xt += 1.0;
+      } else {
+        for (int i = 0; i < da; ++i) { crp.decrement(1, rng); }
+        for (int i = 0; i < da; ++i) { crp.increment(1, p0s.begin(), lambdas.begin(), rng); }
+      }
+    }
+    int c = crp.num_tables(1);
+    ++hist[c];
+    tot += c;
+    int c2 = crp.num_tables(1,0);  // tables on floor 0 with dish 1
+    ++hist2[c2];
+    tot2 += c2;
+  }
+  cerr << cust << " = " << crp.num_customers() << endl;
+  cerr << "P(a) = " << (xt / samples) << endl;
+  cerr << "E[num tables] = " << (tot / samples) << endl;
+  double error = fabs((tot / samples) - 6.894);  // 6.894 is the reference mean for all floors
+  cerr << "   error = " << error << endl;
+  for (int i = 1; i <= cust; ++i)
+    cerr << i << ' ' << (hist[i]) << endl;
+  cerr << "E[num tables on floor 0] = " << (tot2 / samples) << endl;
+  double error2 = fabs((tot2 / samples) - 1.379);  // 1.379 is the reference mean for floor 0
+  cerr << "  error2 = " << error2 << endl;
+  for (int i = 0; i <= cust; ++i)  // include bin 0 so samples with no floor-0 table are not dropped
+    cerr << i << ' ' << (hist2[i]) << endl;
+  assert(error < 0.05);   // these can fail with very low probability
+  assert(error2 < 0.05);
+}
+
+int main(int argc, char** argv) {  // argc/argv are accepted but not used
+  MT19937 rng;      // NOTE(review): seeding of the default-constructed generator is not visible here
+  test_exch(&rng);  // a failing assert inside aborts the process (unless built with NDEBUG)
+  return 0;         // reached only when test_exch() returns normally
+}
+
diff --git a/utils/murmur_hash.h b/utils/murmur_hash.h
index 6063d524..6063d524 100755..100644
--- a/utils/murmur_hash.h
+++ b/utils/murmur_hash.h
diff --git a/utils/named_enum.h b/utils/named_enum.h
index 675ec868..675ec868 100755..100644
--- a/utils/named_enum.h
+++ b/utils/named_enum.h
diff --git a/utils/nan.h b/utils/nan.h
deleted file mode 100755
index 257364d5..00000000
--- a/utils/nan.h
+++ /dev/null
@@ -1,42 +0,0 @@
-#ifndef NAN_H
-#define NAN_H
-//TODO: switch to C99 isnan isfinite isinf etc. (faster)
-
-#include <limits>
-
-template <bool> struct nan_static_assert;
-template <> struct nan_static_assert<true> { };
-
-// is_iec559 i.e. only IEEE 754 float has x != x <=> x is nan
-template<typename T>
-inline bool is_nan(T x) {
-//    static_cast<void>(sizeof(nan_static_assert<std::numeric_limits<T>::has_quiet_NaN>));
-    return std::numeric_limits<T>::has_quiet_NaN && (x != x);
-}
-
-template <typename T>
-inline bool is_inf(T x) {
-//    static_cast<void>(sizeof(nan_static_assert<std::numeric_limits<T>::has_infinity>));
-    return x == std::numeric_limits<T>::infinity() || x == -std::numeric_limits<T>::infinity();
-}
-
-template <typename T>
-inline bool is_pos_inf(T x) {
-//    static_cast<void>(sizeof(nan_static_assert<std::numeric_limits<T>::has_infinity>));
-    return x == std::numeric_limits<T>::infinity();
-}
-
-template <typename T>
-inline bool is_neg_inf(T x) {
-//    static_cast<void>(sizeof(nan_static_assert<std::numeric_limits<T>::has_infinity>));
-    return x == -std::numeric_limits<T>::infinity();
-}
-
-//c99 isfinite macro shoudl be much faster
-template <typename T>
-inline bool is_finite(T x) {
-  return !is_nan(x) && !is_inf(x);
-}
-
-
-#endif
diff --git a/utils/null_deleter.h b/utils/null_deleter.h
index 082ab453..082ab453 100755..100644
--- a/utils/null_deleter.h
+++ b/utils/null_deleter.h
diff --git a/utils/null_traits.h b/utils/null_traits.h
index fac857d9..fac857d9 100755..100644
--- a/utils/null_traits.h
+++ b/utils/null_traits.h
diff --git a/utils/sampler.h b/utils/sampler.h
index cae660d2..bdbc01b0 100644
--- a/utils/sampler.h
+++ b/utils/sampler.h
@@ -48,7 +48,7 @@ struct RandomNumberGenerator {
   template <typename F>
   size_t SelectSample(const F& a, const F& b, double T = 1.0) {
     if (T == 1.0) {
-      if (this->next() > (a / (a + b))) return 1; else return 0;
+      if (F(this->next()) > (a / (a + b))) return 1; else return 0;
     } else {
       assert(!"not implemented");
     }
diff --git a/utils/semiring.h b/utils/semiring.h
index 5007994c..5007994c 100755..100644
--- a/utils/semiring.h
+++ b/utils/semiring.h
diff --git a/utils/show.h b/utils/show.h
index 95cad253..95cad253 100755..100644
--- a/utils/show.h
+++ b/utils/show.h
diff --git a/utils/static_utoa.h b/utils/static_utoa.h
index bb3d821f..bb3d821f 100755..100644
--- a/utils/static_utoa.h
+++ b/utils/static_utoa.h
diff --git a/utils/string_to.h b/utils/string_to.h
deleted file mode 100755
index c78a5394..00000000
--- a/utils/string_to.h
+++ /dev/null
@@ -1,314 +0,0 @@
-#ifndef STRING_TO_H
-#define STRING_TO_H
-
-/*
-   may not be any faster than boost::lexical_cast in later incarnations (see http://accu.org/index.php/journals/1375)
-   but is slightly simpler.  no wide char or locale.
-
-   X string_to<X>(string);
-   string to_string(X);
-   X& string_into(string,X &); // note: returns the same ref you passed in, for convenience of use
-
-   default implementation via stringstreams (quite slow, I'm sure)
-
-   fast implementation for string, int<->string, unsigned<->string, float<->string, double<->string
-
-*/
-
-#ifndef USE_FTOA
-#define USE_FTOA 1
-#endif
-#ifndef HAVE_STRTOUL
-# define HAVE_STRTOUL 1
-#endif
-
-#include <string>
-#include <sstream>
-#include <stdexcept>
-#include <cstdlib>
-
-#include "have_64_bits.h"
-#include "utoa.h"
-#if USE_FTOA
-# include "ftoa.h"
-#endif
-
-namespace {
-// for faster numeric to/from string.  TODO: separate into optional header
-#include <stdio.h>
-#include <ctype.h>
-#include <stdlib.h> // access to evil (fast) C isspace etc.
-#include <limits.h> //strtoul
-}
-
-inline void throw_string_to(std::string const& msg,char const* prefix="string_to: ") {
-  throw std::runtime_error(prefix+msg);
-}
-
-template <class I,class To>
-bool try_stream_into(I & i,To &to,bool complete=true)
-{
-    i >> to;
-    if (i.fail()) return false;
-    if (complete) {
-        char c;
-        return !(i >> c);
-    }
-    return true;
-}
-
-template <class Str,class To>
-bool try_string_into(Str const& str,To &to,bool complete=true)
-{
-    std::istringstream i(str);
-    return try_stream_into(i,to,complete);
-}
-
-template <class Str,class Data> inline
-Data & string_into(const Str &str,Data &data)
-{
-    if (!try_string_into(str,data))
-        throw std::runtime_error(std::string("Couldn't convert (string_into): ")+str);
-    return data;
-}
-
-
-template <class Data,class Str> inline
-Data string_to(const Str &str)
-{
-    Data ret;
-    string_into(str,ret);
-    return ret;
-}
-
-template <class D> inline
-std::string to_string(D const &d)
-{
-    std::ostringstream o;
-    o << d;
-    return o.str();
-}
-
-inline std::string to_string(unsigned x) {
-  return utos(x);
-}
-
-inline std::string to_string(int x) {
-  return itos(x);
-}
-
-inline long strtol_complete(char const* s,int base=10) {
-  char *e;
-  if (*s) {
-    long r=strtol(s,&e,base);
-    char c=*e;
-    if (!c || isspace(c)) //simplifying assumption: we're happy if there's other stuff in the string, so long as the number ends in a space or eos.  TODO: loop consuming spaces until end?
-      return r;
-  }
-  throw_string_to(s,"Couldn't convert to integer: ");
-}
-
-// returns -INT_MAX or INT_MAX if number is too large/small
-inline int strtoi_complete_bounded(char const* s,int base=10) {
-  long l=strtol_complete(s,base);
-  if (l<std::numeric_limits<int>::min())
-    return std::numeric_limits<int>::min();
-  if (l>std::numeric_limits<int>::max())
-    return std::numeric_limits<int>::max();
-  return l;
-}
-#define RANGE_STR(x) #x
-#ifdef INT_MIN
-# define INTRANGE_STR "[" RANGE_STR(INT_MIN) "," RANGE_STR(INT_MAX) "]"
-#else
-# define INTRANGE_STR "[-2137483648,2147483647]"
-#endif
-
-  // throw if out of int range
-inline int strtoi_complete_exact(char const* s,int base=10) {
-  long l=strtol_complete(s,base);
-  if (l<std::numeric_limits<int>::min() || l>std::numeric_limits<int>::max())
-    throw_string_to(s,"Out of range for int " INTRANGE_STR ": ");
-  return l;
-}
-
-#if HAVE_LONGER_LONG
-inline int& string_into(std::string const& s,int &x) {
-  x=strtoi_complete_exact(s.c_str());
-  return x;
-}
-inline int& string_into(char const* s,int &x) {
-  x=strtoi_complete_exact(s);
-  return x;
-}
-#endif
-
-inline long& string_into(std::string const& s,long &x) {
-  x=strtol_complete(s.c_str());
-  return x;
-}
-inline long& string_into(char const* s,long &x) {
-  x=strtol_complete(s);
-  return x;
-}
-
-
-//FIXME: preprocessor separation for tokens int<->unsigned int, long<->unsigned long, strtol<->strtoul ?  massive code duplication
-inline unsigned long strtoul_complete(char const* s,int base=10) {
-  char *e;
-  if (*s) {
-#if HAVE_STRTOUL
-    unsigned long r=strtoul(s,&e,base);
-#else
-//    unsigned long r=strtol(s,&e,base); //FIXME: not usually safe
-    unsigned long r;
-    sscanf(s,"%ul",&r);
-#endif
-    char c=*e;
-    if (!c || isspace(c)) //simplifying assumption: we're happy if there's other stuff in the string, so long as the number ends in a space or eos.  TODO: loop consuming spaces until end?
-      return r;
-  }
-  throw_string_to(s,"Couldn't convert to integer: ");
-}
-
-inline unsigned strtou_complete_bounded(char const* s,int base=10) {
-  unsigned long l=strtoul_complete(s,base);
-  if (l<std::numeric_limits<unsigned>::min())
-    return std::numeric_limits<unsigned>::min();
-  if (l>std::numeric_limits<unsigned>::max())
-    return std::numeric_limits<unsigned>::max();
-  return l;
-}
-
-#ifdef UINT_MIN
-# define UINTRANGE_STR "[" RANGE_STR(UINT_MIN) "," RANGE_STR(UINT_MAX) "]"
-#else
-# define UINTRANGE_STR "[0,4,294,967,295]"
-#endif
-
-  // throw if out of int range
-inline unsigned strtou_complete_exact(char const* s,int base=10) {
-  unsigned long l=strtoul_complete(s,base);
-  if (l<std::numeric_limits<unsigned>::min() || l>std::numeric_limits<unsigned>::max())
-    throw_string_to(s,"Out of range for uint " UINTRANGE_STR ": ");
-  return l;
-}
-
-#if HAVE_LONGER_LONG
-inline unsigned& string_into(std::string const& s,unsigned &x) {
-  x=strtou_complete_exact(s.c_str());
-  return x;
-}
-inline unsigned& string_into(char const* s,unsigned &x) {
-  x=strtou_complete_exact(s);
-  return x;
-}
-#endif
-
-inline unsigned long& string_into(std::string const& s,unsigned long &x) {
-  x=strtoul_complete(s.c_str());
-  return x;
-}
-inline unsigned long& string_into(char const* s,unsigned long &x) {
-  x=strtoul_complete(s);
-  return x;
-}
-
-//FIXME: end code duplication
-
-
-/* 9 decimal places needed to avoid rounding error in float->string->float.  17 for double->string->double
-   in terms of usable decimal places, there are 6 for float and 15 for double
- */
-inline std::string to_string_roundtrip(float x) {
-  char buf[17];
-  return std::string(buf,buf+sprintf(buf,"%.9g",x));
-}
-inline std::string to_string(float x) {
-#if USE_FTOA
-  return ftos(x);
-#else
-  char buf[15];
-  return std::string(buf,buf+sprintf(buf,"%.7g",x));
-#endif
-}
-inline std::string to_string_roundtrip(double x) {
-  char buf[32];
-  return std::string(buf,buf+sprintf(buf,"%.17g",x));
-}
-inline std::string to_string(double x) {
-#if USE_FTOA
-  return ftos(x);
-#else
-  char buf[30];
-  return std::string(buf,buf+sprintf(buf,"%.15g",x));
-#endif
-}
-
-inline double& string_into(char const* s,double &x) {
-  x=std::atof(s);
-  return x;
-}
-inline float& string_into(char const* s,float &x) {
-  x=std::atof(s);
-  return x;
-}
-
-inline double& string_into(std::string const& s,double &x) {
-  x=std::atof(s.c_str());
-  return x;
-}
-inline float& string_into(std::string const& s,float &x) {
-  x=std::atof(s.c_str());
-  return x;
-}
-
-
-template <class Str>
-bool try_string_into(Str const& str,Str &to,bool complete=true)
-{
-    str=to;
-    return true;
-}
-
-inline std::string const& to_string(std::string const& d)
-{
-    return d;
-}
-
-template <class Str>
-Str const& string_to(Str const &s)
-{
-    return s;
-}
-
-template <class Str>
-Str & string_into(Str const &s,Str &d)
-{
-    return d=s;
-}
-
-/*
-
-template <class Str,class Data,class size_type> inline
-void substring_into(const Str &str,size_type pos,size_type n,Data &data)
-{
-//    std::istringstream i(str,pos,n); // doesn't exist!
-    std::istringstream i(str.substr(pos,n));
-    if (!(i>>*data))
-        throw std::runtime_error("Couldn't convert (string_into): "+str);
-}
-
-template <class Data,class Str,class size_type> inline
-Data string_to(const Str &str,size_type pos,size_type n)
-{
-    Data ret;
-    substring_into(str,pos,n,ret);
-    return ret;
-}
-
-*/
-
-
-
-#endif
diff --git a/utils/stringlib.h b/utils/stringlib.h
index cafbdac3..f457e1e4 100644
--- a/utils/stringlib.h
+++ b/utils/stringlib.h
@@ -125,6 +125,13 @@ inline std::string LowercaseString(const std::string& in) {
   return res;
 }
 
+inline std::string UppercaseString(const std::string& in) {  // returns a same-length copy of `in`, each char mapped through toupper()
+  std::string res(in.size(),' ');
+  for (size_t i = 0; i < in.size(); ++i)  // size_t matches the unsigned type of in.size()
+    res[i] = toupper(static_cast<unsigned char>(in[i]));  // toupper() is undefined for negative char values
+  return res;
+}
+
 inline int CountSubstrings(const std::string& str, const std::string& sub) {
   size_t p = 0;
   int res = 0;
diff --git a/utils/stringlib_test.cc b/utils/stringlib_test.cc
index f66cdbeb..f66cdbeb 100755..100644
--- a/utils/stringlib_test.cc
+++ b/utils/stringlib_test.cc
diff --git a/utils/swap_pod.h b/utils/swap_pod.h
index bb9a830d..bb9a830d 100755..100644
--- a/utils/swap_pod.h
+++ b/utils/swap_pod.h
diff --git a/utils/utoa.h b/utils/utoa.h
index 8b54987b..8b54987b 100755..100644
--- a/utils/utoa.h
+++ b/utils/utoa.h
diff --git a/utils/value_array.h b/utils/value_array.h
index 12fc9d87..12fc9d87 100755..100644
--- a/utils/value_array.h
+++ b/utils/value_array.h