From 7928695272b000de7142b91e05959a8fab6b1d2a Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@allegro.clab.cs.cmu.edu>
Date: Wed, 14 Nov 2012 20:33:51 -0500
Subject: major mert clean up, stuff for simple system demo

---
 compound-split/README       | 51 ---------------------------------------------
 compound-split/README.md    | 51 +++++++++++++++++++++++++++++++++++++++++++++
 compound-split/make-dict.pl | 24 +++++++++++++++++++++
 3 files changed, 75 insertions(+), 51 deletions(-)
 delete mode 100644 compound-split/README
 create mode 100644 compound-split/README.md
 create mode 100755 compound-split/make-dict.pl

(limited to 'compound-split')

diff --git a/compound-split/README b/compound-split/README
deleted file mode 100644
index b7491007..00000000
--- a/compound-split/README
+++ /dev/null
@@ -1,51 +0,0 @@
-Instructions for running the compound splitter, which is a reimplementation
-and extension (more features, larger non-word list) of the model described in
-
-  C. Dyer. (2009)  Using a maximum entropy model to build segmentation
-            lattices for MT. In Proceedings of NAACL HLT 2009,
-            Boulder, Colorado, June 2009
-
-If you use this software, please cite this paper.
-
-
-GENERATING 1-BEST SEGMENTATIONS AND LATTICES
-------------------------------------------------------------------------------
-
-Here are some sample invokations:
-
-  ./compound-split.pl --output 1best < infile.txt > out.1best.txt
-      Segment infile.txt according to the 1-best segmentation file.
-
-  ./compound-split.pl --output plf < infile.txt > out.plf
-
-  ./compound-split.pl --output plf --beam 3.5 < infile.txt > out.plf
-      This generates denser lattices than usual (the default beam threshold
-      is 2.2, higher numbers do less pruning)
-
-
-MODEL TRAINING (only for the adventuresome)
-------------------------------------------------------------------------------
-
-I've included some training data for training a German language lattice
-segmentation model, and if you want to explore, you can or change the data.
-If you're especially adventuresome, you can add features to cdec (the current
-feature functions are found in ff_csplit.cc).  The training/references are
-in the file:
-
-               dev.in-ref
-
-The format is the unsegmented form on the right and the reference lattice on
-the left, separated by a triple pipe ( ||| ).  Note that the segmentation
-model inserts a # as the first word, so your segmentation references must
-include this.
-
-To retrain the model (using MAP estimation of a conditional model), do the
-following:
-
-  cd de
-  ./TRAIN
-
-Note, the optimization objective is supposed to be non-convex, but i haven't
-found much of an effect of where I initialize things.  But I haven't looked
-very hard- this might be something to explore.
-
diff --git a/compound-split/README.md b/compound-split/README.md
new file mode 100644
index 00000000..b7491007
--- /dev/null
+++ b/compound-split/README.md
@@ -0,0 +1,51 @@
+Instructions for running the compound splitter, which is a reimplementation
+and extension (more features, larger non-word list) of the model described in
+
+  C. Dyer. (2009)  Using a maximum entropy model to build segmentation
+            lattices for MT. In Proceedings of NAACL HLT 2009,
+            Boulder, Colorado, June 2009
+
+If you use this software, please cite this paper.
+
+
+GENERATING 1-BEST SEGMENTATIONS AND LATTICES
+------------------------------------------------------------------------------
+
+Here are some sample invocations:
+
+  ./compound-split.pl --output 1best < infile.txt > out.1best.txt
+      Segment infile.txt according to the 1-best segmentation file.
+
+  ./compound-split.pl --output plf < infile.txt > out.plf
+
+  ./compound-split.pl --output plf --beam 3.5 < infile.txt > out.plf
+      This generates denser lattices than usual (the default beam threshold
+      is 2.2, higher numbers do less pruning)
+
+
+MODEL TRAINING (only for the adventuresome)
+------------------------------------------------------------------------------
+
+I've included some training data for training a German language lattice
+segmentation model, and if you want to explore, you can add to or change the data.
+If you're especially adventuresome, you can add features to cdec (the current
+feature functions are found in ff_csplit.cc).  The training/references are
+in the file:
+
+               dev.in-ref
+
+The format is the unsegmented form on the right and the reference lattice on
+the left, separated by a triple pipe ( ||| ).  Note that the segmentation
+model inserts a # as the first word, so your segmentation references must
+include this.
+
+To retrain the model (using MAP estimation of a conditional model), do the
+following:
+
+  cd de
+  ./TRAIN
+
+Note, the optimization objective is supposed to be non-convex, but I haven't
+found much of an effect of where I initialize things.  But I haven't looked
+very hard -- this might be something to explore.
+
diff --git a/compound-split/make-dict.pl b/compound-split/make-dict.pl
new file mode 100755
index 00000000..71f2b928
--- /dev/null
+++ b/compound-split/make-dict.pl
@@ -0,0 +1,24 @@
+#!/usr/bin/perl -w
+# Reads UTF-8 text on STDIN and prints one "word cost" line per distinct
+# token, most frequent first, where cost = log(total tokens) - log(count).
+use strict;
+use utf8;
+my %d;      # token => number of occurrences
+my $z = 0;  # total number of tokens counted
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+while(<STDIN>) {
+  chomp;
+  # Runs of punctuation and digits are treated as token separators.
+  s/[\–":“„!=+*.@«#%&,»\?\/{}\$\(\)\[\];\-0-9]+/ /g;
+  for my $w (grep { length } split /\s+/, lc $_) {
+    $d{$w}++;
+    $z++;
+  }
+}
+# With no tokens %d is empty and nothing is printed; log(0) would die.
+my $lz = $z ? log($z) : 0;
+for my $w (sort { $d{$b} <=> $d{$a} or $a cmp $b } keys %d) {  # ties: by word
+  print "$w ", $lz - log($d{$w}), "\n";
+}
+
-- 
cgit v1.2.3