summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
authorredpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-07-09 18:54:51 +0000
committerredpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-07-09 18:54:51 +0000
commit48ab19ad0dd6abb50b8cef7a7b91719473fa01f2 (patch)
tree975b2456530045ea2f59acc553077bc9b73c9a3c
parenta71041194a4420164145c01e4c8973e30319b574 (diff)
support doing clustering on tagged corpora
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@208 ec762483-ff6d-05da-a07a-a48fb63a330f
-rwxr-xr-xgi/pipeline/local-gi-pipeline.pl37
-rwxr-xr-xgi/pipeline/scripts/patch-corpus.pl34
2 files changed, 65 insertions, 6 deletions
diff --git a/gi/pipeline/local-gi-pipeline.pl b/gi/pipeline/local-gi-pipeline.pl
index f1551243..817d5c90 100755
--- a/gi/pipeline/local-gi-pipeline.pl
+++ b/gi/pipeline/local-gi-pipeline.pl
@@ -40,10 +40,11 @@ my $PYP_TOPICS_TRAIN="$PYPTOOLS/pyp-contexts-train";
my $PREM_TRAIN="$PRTOOLS/prjava/train-PR-cluster.sh";
my $SORT_KEYS = "$SCRIPT_DIR/scripts/sort-by-key.sh";
+my $PATCH_CORPUS = "$SCRIPT_DIR/scripts/patch-corpus.pl";
my $EXTRACTOR = "$EXTOOLS/extractor";
my $TOPIC_TRAIN = "$PYPTOOLS/pyp-contexts-train";
-assert_exec($SORT_KEYS, $REDUCER, $EXTRACTOR, $PYP_TOPICS_TRAIN, $S2L, $C2D, $TOPIC_TRAIN);
+assert_exec($PATCH_CORPUS, $SORT_KEYS, $REDUCER, $EXTRACTOR, $PYP_TOPICS_TRAIN, $S2L, $C2D, $TOPIC_TRAIN);
my $BACKOFF_GRAMMAR;
my $TAGGED_CORPUS;
@@ -70,11 +71,17 @@ my $CORPUS = $ARGV[0];
open F, "<$CORPUS" or die "Can't read $CORPUS: $!"; close F;
print STDERR " Output: $OUTPUT\n";
+my $DATA_DIR = $OUTPUT . '/corpora';
+my $LEX_NAME = 'corpus.f_e_a.lex';
+my $CORPUS_LEX = $DATA_DIR . '/' . $LEX_NAME; # corpus used to extract rules
+my $CORPUS_CLUSTER = $DATA_DIR . '/corpus.f_e_a.cluster'; # corpus used for clustering (often identical)
+
my $CONTEXT_DIR = $OUTPUT . '/' . context_dir();
my $CLUSTER_DIR = $OUTPUT . '/' . cluster_dir();
my $GRAMMAR_DIR = $OUTPUT . '/' . grammar_dir();
print STDERR " Context: $CONTEXT_DIR\n Cluster: $CLUSTER_DIR\n Grammar: $GRAMMAR_DIR\n";
safemkdir($OUTPUT) or die "Couldn't create output directory $OUTPUT: $!";
+safemkdir($DATA_DIR) or die "Couldn't create output directory $DATA_DIR: $!";
safemkdir($CONTEXT_DIR) or die "Couldn't create output directory $CONTEXT_DIR: $!";
safemkdir($CLUSTER_DIR) or die "Couldn't create output directory $CLUSTER_DIR: $!";
safemkdir($GRAMMAR_DIR) or die "Couldn't create output directory $GRAMMAR_DIR: $!";
@@ -82,6 +89,8 @@ if(-e $TOPICS_CONFIG) {
copy($TOPICS_CONFIG, $CLUSTER_DIR) or die "Copy failed: $!";
}
+setup_data();
+
extract_context();
if (lc($MODEL) eq "pyp") {
topic_train();
@@ -96,9 +105,25 @@ if ($BIDIR) {
$res = grammar_extract();
}
print STDERR "\n!!!COMPLETE!!!\n";
-print STDERR "GRAMMAR: $res\n\nYou should probably run:\n\n zcat $res | $SCRIPT_DIR/../../extools/filter_score_grammar -c $CORPUS -t TESTSET.TXT > filtered-grammar.scfg\n\n";
+print STDERR "GRAMMAR: $res\nYou should probably run: $SCRIPT_DIR/evaluation-pipeline.pl LANGPAIR giwork/ct1s0.L10.PYP.t4.s20.grammar/grammar.gz -f FEAT1 -f FEAT2\n\n";
exit 0;
+sub setup_data {
+  print STDERR "\n!!!PREPARE CORPORA!!!\n";
+  if (-f $CORPUS_LEX && -f $CORPUS_CLUSTER) {
+    print STDERR "$CORPUS_LEX and $CORPUS_CLUSTER exist, reusing...\n";
+    return;
+  }
+  copy($CORPUS, $CORPUS_LEX) or die "Copy of $CORPUS to $CORPUS_LEX failed: $!";
+  if ($TAGGED_CORPUS) {
+    die "Can't find $TAGGED_CORPUS" unless -f $TAGGED_CORPUS;
+    my $cmd="$PATCH_CORPUS $TAGGED_CORPUS $CORPUS_LEX > $CORPUS_CLUSTER";
+    safesystem($cmd) or die "Failed to patch corpus with tags.";
+  } else {
+    symlink($LEX_NAME, $CORPUS_CLUSTER) or die "Can't symlink $LEX_NAME to $CORPUS_CLUSTER: $!";
+  }
+}
+
sub context_dir {
return "ct${CONTEXT_SIZE}s0.L$BASE_PHRASE_MAX_SIZE";
}
@@ -153,10 +178,10 @@ sub extract_context {
if (-e $OUT_CONTEXTS) {
print STDERR "$OUT_CONTEXTS exists, reusing...\n";
} else {
- my $cmd = "$EXTRACTOR -i $CORPUS -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE | $SORT_KEYS | $REDUCER | $GZIP > $OUT_CONTEXTS";
+ my $cmd = "$EXTRACTOR -i $CORPUS_CLUSTER -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE | $SORT_KEYS | $REDUCER | $GZIP > $OUT_CONTEXTS";
if ($COMPLETE_CACHE) {
print STDERR "COMPLETE_CACHE is set: removing memory limits on cache.\n";
- $cmd = "$EXTRACTOR -i $CORPUS -c 0 -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE | $SORT_KEYS | $GZIP > $OUT_CONTEXTS";
+ $cmd = "$EXTRACTOR -i $CORPUS_CLUSTER -c 0 -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE | $SORT_KEYS | $GZIP > $OUT_CONTEXTS";
}
safesystem($cmd) or die "Failed to extract contexts.";
}
@@ -193,9 +218,9 @@ sub label_spans_with_topics {
print STDERR "$OUT_SPANS exists, reusing...\n";
} else {
safesystem("$ZCAT $IN_CLUSTERS > $CLUSTER_DIR/clusters.txt") or die "Failed to unzip";
- safesystem("$EXTRACTOR --base_phrase_spans -i $CORPUS -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -S $CONTEXT_SIZE | $S2L $CLUSTER_DIR/clusters.txt $CONTEXT_SIZE > $OUT_SPANS") or die "Failed to label spans";
+ safesystem("$EXTRACTOR --base_phrase_spans -i $CORPUS_CLUSTER -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -S $CONTEXT_SIZE | $S2L $CLUSTER_DIR/clusters.txt $CONTEXT_SIZE > $OUT_SPANS") or die "Failed to label spans";
unlink("$CLUSTER_DIR/clusters.txt") or warn "Failed to remove $CLUSTER_DIR/clusters.txt";
- safesystem("paste -d ' ' $CORPUS $OUT_SPANS > $CLUSTER_DIR/corpus.src_trg_al_label") or die "Couldn't paste";
+ safesystem("paste -d ' ' $CORPUS_LEX $OUT_SPANS > $CLUSTER_DIR/corpus.src_trg_al_label") or die "Couldn't paste";
}
}
diff --git a/gi/pipeline/scripts/patch-corpus.pl b/gi/pipeline/scripts/patch-corpus.pl
new file mode 100755
index 00000000..2b181837
--- /dev/null
+++ b/gi/pipeline/scripts/patch-corpus.pl
@@ -0,0 +1,34 @@
+#!/usr/bin/perl -w
+use strict;
+
+my $PATCH = shift @ARGV;
+die "Usage: $0 tagged.en[_fr] < lexical.en_fr_al[_...]\n" unless $PATCH;
+
+open my $pfh, '<', $PATCH or die "Can't read tagged corpus $PATCH: $!";
+my $first=<$pfh>; close $pfh;
+my @fields = split / \|\|\| /, $first;
+die "Bad format!" if (scalar @fields > 2);
+
+if (scalar @fields != 1) {
+  # TODO support this
+  die "Patching source and target not supported yet!";
+}
+
+my $lineno = 0;
+open $pfh, '<', $PATCH or die "Can't read tagged corpus $PATCH: $!";
+while(my $pline = <$pfh>) {
+  chomp $pline;
+  $lineno++;
+  my $line = <>;
+  die "Too few lines in lexical corpus!" unless defined $line;
+  chomp $line;
+  @fields = split / \|\|\| /, $line;
+  my @pwords = split /\s+/, $pline;
+  my @lwords = split /\s+/, $fields[1];
+  die "Length mismatch in line $lineno!\n" unless (scalar @pwords == scalar @lwords);
+  $fields[1] = $pline;
+  print join ' ||| ', @fields;
+  print "\n";
+}
+die "Too many lines in lexical corpus!\n" if defined(my $extra = <>);
+