diff options
author | Chris Dyer <cdyer@cs.cmu.edu> | 2011-02-16 18:45:49 -0500 |
---|---|---|
committer | Chris Dyer <cdyer@cs.cmu.edu> | 2011-02-16 18:45:49 -0500 |
commit | 5656232da7d26190bea814a778bdcc5cc36a6df9 (patch) | |
tree | 12294eb52d7c033be844d57142ebbb6a7a576b35 | |
parent | 6a8ba7a4c00ed011798a7b597c1e65cb1d9053ca (diff) |
add case preservation to compound splitter
-rwxr-xr-x | compound-split/compound-split.pl | 19 |
1 files changed, 19 insertions, 0 deletions
diff --git a/compound-split/compound-split.pl b/compound-split/compound-split.pl index 7f45d345..0f359504 100755 --- a/compound-split/compound-split.pl +++ b/compound-split/compound-split.pl @@ -12,12 +12,14 @@ my $BEAM = 2.1; my $OUTPUT = 'plf'; my $HELP; my $VERBOSE; +my $PRESERVE_CASE; GetOptions("decoder=s" => \$CDEC, "language=s" => \$LANG, "beam=f" => \$BEAM, "output=s" => \$OUTPUT, "verbose" => \$VERBOSE, + "preserve_case" => \$PRESERVE_CASE, "help" => \$HELP ) or usage(); @@ -60,15 +62,18 @@ while(<STDIN>) { my @words = split /\s+/; my @res = (); my @todo = (); + my @casings = (); for (my $i=0; $i < scalar @words; $i++) { my $word = lc $words[$i]; if (length($word)<6 || $word =~ /^[,\-0-9\.]+$/ || $word =~ /[@.\-\/:]/) { + push @casings, 0; if ($IS_PLF) { push @res, "(('" . escape($word) . "',0,1),),"; } else { push @res, $word; } } else { + push @casings, guess_casing($words[$i]); push @res, undef; push @todo, $word; } @@ -84,6 +89,7 @@ while(<STDIN>) { unless ($IS_PLF) { $seg =~ s/^# //o; } + if ($PRESERVE_CASE && $casings[$i]) { $seg = recase_words($seg); } $res[$i] = $seg; } } @@ -100,6 +106,12 @@ while(<STDIN>) { close IN; close OUT; +sub recase_words { + my $word = shift; + $word =~ s/\b(\w)/\u$1/g; + return $word; +} + sub escape { $_ = shift; s/\\/\\\\/g; @@ -107,6 +119,11 @@ sub escape { return $_; } +sub guess_casing { + my $word = shift @_; + if (lc($word) eq $word) { return 0; } else { return 1; } +} + sub usage { print <<EOT; @@ -118,6 +135,8 @@ Usage: $0 [OPTIONS] < file.txt --beam NUM Beam threshold, used with PLF output (probably between 1.5 and 5.0) --output plf|1best Output format, 1best or plf (lattice) + --preserve_case Preserve the casing of the input word + (model will be scored lowercase) --verbose Show verbose decoder output EOT |