From 243c40d06e949cb3d6c9cfe865c307973f2df49e Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Wed, 16 Feb 2011 18:45:49 -0500
Subject: add case preservation to compound splitter

---
 compound-split/compound-split.pl | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'compound-split')

diff --git a/compound-split/compound-split.pl b/compound-split/compound-split.pl
index 7f45d345..0f359504 100755
--- a/compound-split/compound-split.pl
+++ b/compound-split/compound-split.pl
@@ -12,12 +12,14 @@ my $BEAM = 2.1;
 my $OUTPUT = 'plf';
 my $HELP;
 my $VERBOSE;
+my $PRESERVE_CASE;
 
 GetOptions("decoder=s" => \$CDEC,
            "language=s" => \$LANG,
            "beam=f" => \$BEAM,
            "output=s" => \$OUTPUT,
            "verbose" => \$VERBOSE,
+           "preserve_case" => \$PRESERVE_CASE,
            "help" => \$HELP
           ) or usage();
 
@@ -60,15 +62,18 @@ while() {
   my @words = split /\s+/;
   my @res = ();
   my @todo = ();
+  my @casings = ();
   for (my $i=0; $i < scalar @words; $i++) {
     my $word = lc $words[$i];
     if (length($word)<6 || $word =~ /^[,\-0-9\.]+$/ || $word =~ /[@.\-\/:]/) {
+      push @casings, 0;
       if ($IS_PLF) {
         push @res, "(('" . escape($word) . "',0,1),),";
       } else {
         push @res, $word;
       }
     } else {
+      push @casings, guess_casing($words[$i]);
       push @res, undef;
       push @todo, $word;
     }
@@ -84,6 +89,7 @@ while() {
       unless ($IS_PLF) {
         $seg =~ s/^# //o;
       }
+      if ($PRESERVE_CASE && $casings[$i]) { $seg = recase_words($seg); }
       $res[$i] = $seg;
     }
   }
@@ -100,6 +106,12 @@ while() {
 close IN;
 close OUT;
 
+sub recase_words {
+  my $word = shift;
+  $word =~ s/\b(\w)/\u$1/g;
+  return $word;
+}
+
 sub escape {
   $_ = shift;
   s/\\/\\\\/g;
@@ -107,6 +119,11 @@ sub escape {
   return $_;
 }
 
+sub guess_casing {
+  my $word = shift @_;
+  if (lc($word) eq $word) { return 0; } else { return 1; }
+}
+
 sub usage {
   print <