summary | refs | log | tree | commit | diff
path: root/compound-split
diff options
context:
space:
mode:
author: Chris Dyer <cdyer@cs.cmu.edu> 2011-02-16 18:45:49 -0500
committer: Chris Dyer <cdyer@cs.cmu.edu> 2011-02-16 18:45:49 -0500
commit: 5656232da7d26190bea814a778bdcc5cc36a6df9 (patch)
tree: 12294eb52d7c033be844d57142ebbb6a7a576b35 /compound-split
parent: 6a8ba7a4c00ed011798a7b597c1e65cb1d9053ca (diff)
add case preservation to compound splitter
Diffstat (limited to 'compound-split')
-rwxr-xr-x compound-split/compound-split.pl 19
1 files changed, 19 insertions, 0 deletions
diff --git a/compound-split/compound-split.pl b/compound-split/compound-split.pl
index 7f45d345..0f359504 100755
--- a/compound-split/compound-split.pl
+++ b/compound-split/compound-split.pl
@@ -12,12 +12,14 @@ my $BEAM = 2.1;
my $OUTPUT = 'plf';
my $HELP;
my $VERBOSE;
+my $PRESERVE_CASE;
GetOptions("decoder=s" => \$CDEC,
"language=s" => \$LANG,
"beam=f" => \$BEAM,
"output=s" => \$OUTPUT,
"verbose" => \$VERBOSE,
+ "preserve_case" => \$PRESERVE_CASE,
"help" => \$HELP
) or usage();
@@ -60,15 +62,18 @@ while(<STDIN>) {
my @words = split /\s+/;
my @res = ();
my @todo = ();
+ my @casings = ();
for (my $i=0; $i < scalar @words; $i++) {
my $word = lc $words[$i];
if (length($word)<6 || $word =~ /^[,\-0-9\.]+$/ || $word =~ /[@.\-\/:]/) {
+ push @casings, 0;
if ($IS_PLF) {
push @res, "(('" . escape($word) . "',0,1),),";
} else {
push @res, $word;
}
} else {
+ push @casings, guess_casing($words[$i]);
push @res, undef;
push @todo, $word;
}
@@ -84,6 +89,7 @@ while(<STDIN>) {
unless ($IS_PLF) {
$seg =~ s/^# //o;
}
+ if ($PRESERVE_CASE && $casings[$i]) { $seg = recase_words($seg); }
$res[$i] = $seg;
}
}
@@ -100,6 +106,12 @@ while(<STDIN>) {
close IN;
close OUT;
+sub recase_words { # Return a copy of the argument with the first character of every word upper-cased.
+ my $word = shift; # private copy of the first argument; the caller's string is left untouched
+ $word =~ s/\b(\w)/\u$1/g; # /g hits every word start (\b followed by \w), so each segment of a multi-word string is capitalised
+ return $word;
+}
+
sub escape {
$_ = shift;
s/\\/\\\\/g;
@@ -107,6 +119,11 @@ sub escape {
return $_;
}
+sub guess_casing { # Return 0 if the word is unchanged by lc(), 1 otherwise.
+ my $word = shift @_; # the word in its original input casing
+ if (lc($word) eq $word) { return 0; } else { return 1; } # 1 covers any casing lc() would alter: Title-case and ALL-CAPS are not distinguished
+}
+
sub usage {
print <<EOT;
@@ -118,6 +135,8 @@ Usage: $0 [OPTIONS] < file.txt
--beam NUM Beam threshold, used with PLF output
(probably between 1.5 and 5.0)
--output plf|1best Output format, 1best or plf (lattice)
+ --preserve_case Preserve the casing of the input word
+ (model will be scored lowercase)
--verbose Show verbose decoder output
EOT