From 7928695272b000de7142b91e05959a8fab6b1d2a Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Wed, 14 Nov 2012 20:33:51 -0500 Subject: major mert clean up, stuff for simple system demo --- compound-split/README | 51 --------------------------------------------- compound-split/README.md | 51 +++++++++++++++++++++++++++++++++++++++++++++ compound-split/make-dict.pl | 24 +++++++++++++++++++++ 3 files changed, 75 insertions(+), 51 deletions(-) delete mode 100644 compound-split/README create mode 100644 compound-split/README.md create mode 100755 compound-split/make-dict.pl (limited to 'compound-split') diff --git a/compound-split/README b/compound-split/README deleted file mode 100644 index b7491007..00000000 --- a/compound-split/README +++ /dev/null @@ -1,51 +0,0 @@ -Instructions for running the compound splitter, which is a reimplementation -and extension (more features, larger non-word list) of the model described in - - C. Dyer. (2009) Using a maximum entropy model to build segmentation - lattices for MT. In Proceedings of NAACL HLT 2009, - Boulder, Colorado, June 2009 - -If you use this software, please cite this paper. - - -GENERATING 1-BEST SEGMENTATIONS AND LATTICES ------------------------------------------------------------------------------- - -Here are some sample invokations: - - ./compound-split.pl --output 1best < infile.txt > out.1best.txt - Segment infile.txt according to the 1-best segmentation file. - - ./compound-split.pl --output plf < infile.txt > out.plf - - ./compound-split.pl --output plf --beam 3.5 < infile.txt > out.plf - This generates denser lattices than usual (the default beam threshold - is 2.2, higher numbers do less pruning) - - -MODEL TRAINING (only for the adventuresome) ------------------------------------------------------------------------------- - -I've included some training data for training a German language lattice -segmentation model, and if you want to explore, you can or change the data. 
-If you're especially adventuresome, you can add features to cdec (the current -feature functions are found in ff_csplit.cc). The training/references are -in the file: - - dev.in-ref - -The format is the unsegmented form on the right and the reference lattice on -the left, separated by a triple pipe ( ||| ). Note that the segmentation -model inserts a # as the first word, so your segmentation references must -include this. - -To retrain the model (using MAP estimation of a conditional model), do the -following: - - cd de - ./TRAIN - -Note, the optimization objective is supposed to be non-convex, but i haven't -found much of an effect of where I initialize things. But I haven't looked -very hard- this might be something to explore. - diff --git a/compound-split/README.md b/compound-split/README.md new file mode 100644 index 00000000..b7491007 --- /dev/null +++ b/compound-split/README.md @@ -0,0 +1,51 @@ +Instructions for running the compound splitter, which is a reimplementation +and extension (more features, larger non-word list) of the model described in + + C. Dyer. (2009) Using a maximum entropy model to build segmentation + lattices for MT. In Proceedings of NAACL HLT 2009, + Boulder, Colorado, June 2009 + +If you use this software, please cite this paper. + + +GENERATING 1-BEST SEGMENTATIONS AND LATTICES +------------------------------------------------------------------------------ + +Here are some sample invocations: + + ./compound-split.pl --output 1best < infile.txt > out.1best.txt + Segment infile.txt according to the 1-best segmentation file. 
+
+ ./compound-split.pl --output plf < infile.txt > out.plf
+
+ ./compound-split.pl --output plf --beam 3.5 < infile.txt > out.plf
+ This generates denser lattices than usual (the default beam threshold
+ is 2.2, higher numbers do less pruning)
+
+
+MODEL TRAINING (only for the adventuresome)
+------------------------------------------------------------------------------
+
+I've included some training data for training a German language lattice
+segmentation model, and if you want to explore, you can or change the data.
+If you're especially adventuresome, you can add features to cdec (the current
+feature functions are found in ff_csplit.cc). The training/references are
+in the file:
+
+ dev.in-ref
+
+The format is the unsegmented form on the right and the reference lattice on
+the left, separated by a triple pipe ( ||| ). Note that the segmentation
+model inserts a # as the first word, so your segmentation references must
+include this.
+
+To retrain the model (using MAP estimation of a conditional model), do the
+following:
+
+ cd de
+ ./TRAIN
+
+Note, the optimization objective is supposed to be non-convex, but i haven't
+found much of an effect of where I initialize things. But I haven't looked
+very hard- this might be something to explore.
+
diff --git a/compound-split/make-dict.pl b/compound-split/make-dict.pl
new file mode 100755
index 00000000..71f2b928
--- /dev/null
+++ b/compound-split/make-dict.pl
@@ -0,0 +1,42 @@
+#!/usr/bin/perl -w
+#
+# make-dict.pl - build a unigram cost dictionary from raw text.
+#
+# Reads UTF-8 text on STDIN, replaces punctuation and ASCII digits with
+# spaces, lowercases the result and counts the whitespace-separated tokens.
+# Writes one "<token> <cost>" line per distinct token to STDOUT, where
+#     cost = log(total tokens) - log(occurrences of token)
+# using the natural logarithm.  Lines are ordered by descending count, ties
+# broken by token so the output is reproducible.  Empty input prints nothing.
+use strict;
+use utf8;    # the character class below contains literal non-ASCII characters
+
+my %count_of;     # token => number of occurrences
+my $total = 0;    # tokens counted over the whole input
+
+binmode(STDIN,  ":utf8");
+binmode(STDOUT, ":utf8");
+
+while (my $line = <STDIN>) {
+    chomp $line;
+    # Collapse each run of the listed punctuation/digit characters to a space.
+    $line =~ s/[\–":“„!=+*.@«#%&,»\?\/{}\$\(\)\[\];\-0-9]+/ /g;
+    $line = lc $line;
+    for my $word (split /\s+/, $line) {
+        # split yields an empty first field when the line begins with whitespace
+        next if length($word) == 0;
+        $count_of{$word}++;
+        $total++;
+    }
+}
+
+# log(0) is a fatal error in Perl, so stop cleanly when no token was counted.
+exit 0 if $total == 0;
+
+my $log_total = log($total);
+my @by_count = sort { $count_of{$b} <=> $count_of{$a} || $a cmp $b }
+    keys %count_of;
+for my $word (@by_count) {
+    my $cost = $log_total - log($count_of{$word});
+    print "$word $cost\n";
+}
-- cgit v1.2.3