summaryrefslogtreecommitdiff
path: root/training/dep-reorder
diff options
context:
space:
mode:
authorredpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-06-22 05:12:27 +0000
committerredpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-06-22 05:12:27 +0000
commit7cc92b65a3185aa242088d830e166e495674efc9 (patch)
tree681fe5237612a4e96ce36fb9fabef00042c8ee61 /training/dep-reorder
parent37728b8be4d0b3df9da81fdda2198ff55b4b2d91 (diff)
initial checkin
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@2 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'training/dep-reorder')
-rwxr-xr-xtraining/dep-reorder/conll2reordering-forest.pl65
-rw-r--r--training/dep-reorder/george.conll4
-rwxr-xr-xtraining/dep-reorder/scripts/conll2simplecfg.pl57
3 files changed, 126 insertions, 0 deletions
diff --git a/training/dep-reorder/conll2reordering-forest.pl b/training/dep-reorder/conll2reordering-forest.pl
new file mode 100755
index 00000000..3cd226be
--- /dev/null
+++ b/training/dep-reorder/conll2reordering-forest.pl
@@ -0,0 +1,65 @@
+#!/usr/bin/perl -w
+use strict;
+
+my $script_dir; BEGIN { use Cwd qw/ abs_path cwd /; use File::Basename; $script_dir = dirname(abs_path($0)); push @INC, $script_dir; }
+my $FIRST_CONV = "$script_dir/scripts/conll2simplecfg.pl";
+my $CDEC = "$script_dir/../../decoder/cdec";
+
+our $tfile1 = "grammar1.$$";
+our $tfile2 = "text.$$";
+
+die "Usage: $0 parses.conll\n" unless scalar @ARGV == 1;
+open C, "<$ARGV[0]" or die "Can't read $ARGV[0]: $!";
+
+END { unlink $tfile1; unlink "$tfile1.cfg"; unlink $tfile2; }
+
+my $first = 1;
+open T, ">$tfile1" or die "Can't write $tfile1: $!";
+my $lc = 0;
+my $flag = 0;
+my @words = ();
+while(<C>) {
+ print T;
+ chomp;
+ if (/^$/) {
+ if ($first) { $first = undef; } else { if ($flag) { print "\n"; $flag = 0; } }
+ $first = undef;
+ close T;
+ open SO, ">$tfile2" or die "Can't write $tfile2: $!";
+ print SO "@words\n";
+ close SO;
+ @words=();
+ `$FIRST_CONV < $tfile1 > $tfile1.cfg`;
+ if ($? != 0) {
+ die "Error code: $?";
+ }
+ my $cfg = `$CDEC -n -S 10000 -f scfg -g $tfile1.cfg -i $tfile2 --show_cfg_search_space 2>/dev/null`;
+ if ($? != 0) {
+ die "Error code: $?";
+ }
+ my @rules = split /\n/, $cfg;
+ shift @rules; # get rid of output
+ for my $rule (@rules) {
+ my ($lhs, $f, $e, $feats) = split / \|\|\| /, $rule;
+ $f =~ s/,\d\]/\]/g;
+ $feats = 'TOP=1' unless $feats;
+ if ($lhs =~ /\[Goal_\d+\]/) { $lhs = '[S]'; }
+ print "$lhs ||| $f ||| $feats\n";
+ if ($e eq '[1] [2]') {
+ my ($a, $b) = split /\s+/, $f;
+ $feats =~ s/=1$//;
+ my ($x, $y) = split /_/, $feats;
+ print "$lhs ||| $b $a ||| ${y}_$x=1\n";
+ }
+ $flag = 1;
+ }
+ open T, ">$tfile1" or die "Can't write $tfile1: $!";
+ $lc = -1;
+ } else {
+ my ($ind, $word, @dmmy) = split /\s+/;
+ push @words, $word;
+ }
+ $lc++;
+}
+close T;
+
diff --git a/training/dep-reorder/george.conll b/training/dep-reorder/george.conll
new file mode 100644
index 00000000..7eebb360
--- /dev/null
+++ b/training/dep-reorder/george.conll
@@ -0,0 +1,4 @@
+1 George _ GEORGE _ _ 2 X _ _
+2 hates _ HATES _ _ 0 X _ _
+3 broccoli _ BROC _ _ 2 X _ _
+
diff --git a/training/dep-reorder/scripts/conll2simplecfg.pl b/training/dep-reorder/scripts/conll2simplecfg.pl
new file mode 100755
index 00000000..b101347a
--- /dev/null
+++ b/training/dep-reorder/scripts/conll2simplecfg.pl
@@ -0,0 +1,57 @@
+#!/usr/bin/perl -w
+use strict;
+
+# 1 在 _ 10 _ _ 4 X _ _
+# 2 门厅 _ 3 _ _ 1 X _ _
+# 3 下面 _ 23 _ _ 4 X _ _
+# 4 。 _ 45 _ _ 0 X _ _
+
+my @ldeps;
+my @rdeps;
+@ldeps=(); for (my $i =0; $i <1000; $i++) { push @ldeps, []; }
+@rdeps=(); for (my $i =0; $i <1000; $i++) { push @rdeps, []; }
+my $rootcat = 0;
+my @cats = ('S');
+my $len = 0;
+my @noposcats = ('S');
+while(<>) {
+ chomp;
+ if (/^\s*$/) {
+ write_cfg($len);
+ $len = 0;
+ @cats=('S');
+ @noposcats = ('S');
+ @ldeps=(); for (my $i =0; $i <1000; $i++) { push @ldeps, []; }
+ @rdeps=(); for (my $i =0; $i <1000; $i++) { push @rdeps, []; }
+ next;
+ }
+ $len++;
+ my ($pos, $word, $d1, $xcat, $d2, $d3, $headpos, $deptype) = split /\s+/;
+ my $cat = "C$xcat";
+ my $catpos = $cat . "_$pos";
+ push @cats, $catpos;
+ push @noposcats, $cat;
+ print "[$catpos] ||| $word ||| $word ||| Word=1\n";
+ if ($headpos == 0) { $rootcat = $pos; }
+ if ($pos < $headpos) {
+ push @{$ldeps[$headpos]}, $pos;
+ } else {
+ push @{$rdeps[$headpos]}, $pos;
+ }
+}
+
+sub write_cfg {
+ my $len = shift;
+ for (my $i = 1; $i <= $len; $i++) {
+ my @lds = @{$ldeps[$i]};
+ for my $ld (@lds) {
+ print "[$cats[$i]] ||| [$cats[$ld],1] [$cats[$i],2] ||| [1] [2] ||| $noposcats[$ld]_$noposcats[$i]=1\n";
+ }
+ my @rds = @{$rdeps[$i]};
+ for my $rd (@rds) {
+ print "[$cats[$i]] ||| [$cats[$i],1] [$cats[$rd],2] ||| [1] [2] ||| $noposcats[$i]_$noposcats[$rd]=1\n";
+ }
+ }
+ print "[S] ||| [$cats[$rootcat],1] ||| [1] ||| TOP=1\n";
+}
+