From 0732fff632e792ae2268e9ef1c9c230624098eb7 Mon Sep 17 00:00:00 2001 From: "trevor.cohn" Date: Thu, 22 Jul 2010 19:05:54 +0000 Subject: Added option to apply tags to source-side git-svn-id: https://ws10smt.googlecode.com/svn/trunk@367 ec762483-ff6d-05da-a07a-a48fb63a330f --- gi/pipeline/evaluation-pipeline.pl | 2 +- gi/pipeline/local-gi-pipeline.pl | 4 +++- gi/pipeline/scripts/patch-corpus.pl | 20 ++++++++++++++++---- .../prjava/train-PR-cluster.sh | 2 +- 4 files changed, 21 insertions(+), 7 deletions(-) diff --git a/gi/pipeline/evaluation-pipeline.pl b/gi/pipeline/evaluation-pipeline.pl index 16b9a571..b2656985 100755 --- a/gi/pipeline/evaluation-pipeline.pl +++ b/gi/pipeline/evaluation-pipeline.pl @@ -143,7 +143,7 @@ for my $feat (@features) { if (!defined $rs) { die "DON'T KNOW ABOUT FEATURE $feat\n"; } my @xfs = @$rs; @xfeats = (@xfeats, @xfs); - $FEATURIZER_OPTS .= " -f $feat" unless $feat=="BackoffRule"; + $FEATURIZER_OPTS .= " -f $feat" unless $feat eq "BackoffRule"; } print STDERR "X-FEATS: @xfeats\n"; diff --git a/gi/pipeline/local-gi-pipeline.pl b/gi/pipeline/local-gi-pipeline.pl index 96df34ea..f72637af 100755 --- a/gi/pipeline/local-gi-pipeline.pl +++ b/gi/pipeline/local-gi-pipeline.pl @@ -183,7 +183,9 @@ sub setup_data { copy($CORPUS, $CORPUS_LEX); if ($TAGGED_CORPUS) { die "Can't find $TAGGED_CORPUS" unless -f $TAGGED_CORPUS; - my $cmd="$PATCH_CORPUS $TAGGED_CORPUS $CORPUS_LEX > $CORPUS_CLUSTER"; + my $opt=""; + $opt = "-s" if ($LANGUAGE eq "source"); + my $cmd="$PATCH_CORPUS $opt $TAGGED_CORPUS $CORPUS_LEX > $CORPUS_CLUSTER"; safesystem($cmd) or die "Failed to extract contexts."; } else { symlink($LEX_NAME, $CORPUS_CLUSTER); diff --git a/gi/pipeline/scripts/patch-corpus.pl b/gi/pipeline/scripts/patch-corpus.pl index 2b181837..200022bc 100755 --- a/gi/pipeline/scripts/patch-corpus.pl +++ b/gi/pipeline/scripts/patch-corpus.pl @@ -2,7 +2,13 @@ use strict; my $PATCH = shift @ARGV; -die "Usage: $0 tagged.en[_fr] < lexical.en_fr_al[_...]\n" unless 
$PATCH; +my $TGT = 1; +if ($PATCH eq "-s") { + undef $TGT; + $PATCH = shift @ARGV; +} + +die "Usage: $0 [-s] tagged.en[_fr] < lexical.en_fr_al[_...]\n" unless $PATCH; open P, "<$PATCH" or die "Can't read tagged corpus $PATCH: $!"; my $first=<P>; close P; @@ -24,9 +30,15 @@ while(my $pline =

<P>) { chomp $line; @fields = split / \|\|\| /, $line; my @pwords = split /\s+/, $pline; - my @lwords = split /\s+/, $fields[1]; - die "Length mismatch in line $line!\n" unless (scalar @pwords == scalar @lwords); - $fields[1] = $pline; + if ($TGT) { + my @lwords = split /\s+/, $fields[1]; + die "Length mismatch in line $line!\n" unless (scalar @pwords == scalar @lwords); + $fields[1] = $pline; + } else { + my @lwords = split /\s+/, $fields[0]; + die "Length mismatch in line $line!\n" unless (scalar @pwords == scalar @lwords); + $fields[0] = $pline; + } print join ' ||| ', @fields; print "\n"; } diff --git a/gi/posterior-regularisation/prjava/train-PR-cluster.sh b/gi/posterior-regularisation/prjava/train-PR-cluster.sh index 6c2f62cd..aa434fa8 100755 --- a/gi/posterior-regularisation/prjava/train-PR-cluster.sh +++ b/gi/posterior-regularisation/prjava/train-PR-cluster.sh @@ -1,4 +1,4 @@ #!/bin/sh d=`dirname $0` -java -ea -Xmx3g -cp $d/prjava.jar:$d/lib/trove-2.0.2.jar:$d/lib/optimization.jar:$d/lib/jopt-simple-3.2.jar:$d/lib/lib/commons-math-2.1.jar phrase.Trainer $* +java -ea -Xmx30g -cp $d/prjava.jar:$d/lib/trove-2.0.2.jar:$d/lib/optimization.jar:$d/lib/jopt-simple-3.2.jar:$d/lib/lib/commons-math-2.1.jar phrase.Trainer $* -- cgit v1.2.3