From 8d222e20d8f253aa2c73d139d8ae6cc69483d071 Mon Sep 17 00:00:00 2001 From: bothameister Date: Fri, 23 Jul 2010 18:03:47 +0000 Subject: Adding morphology-segmentation stuff. Changes include: local-gi-pipeline (--morf arg), eval-pipeline (--oov-grammar, --lmorder) git-svn-id: https://ws10smt.googlecode.com/svn/trunk@382 ec762483-ff6d-05da-a07a-a48fb63a330f --- gi/pipeline/evaluation-pipeline.pl | 10 +++++++--- gi/pipeline/local-gi-pipeline.pl | 16 ++++++++++++---- 2 files changed, 19 insertions(+), 7 deletions(-) (limited to 'gi/pipeline') diff --git a/gi/pipeline/evaluation-pipeline.pl b/gi/pipeline/evaluation-pipeline.pl index 13fe07cf..e940a5b9 100755 --- a/gi/pipeline/evaluation-pipeline.pl +++ b/gi/pipeline/evaluation-pipeline.pl @@ -123,16 +123,20 @@ my $dataDir = '/export/ws10smt/data'; my @features; my $bkoffgram; my $gluegram; +my $oovgram; my $usefork; +my $lmorder = 3; if (GetOptions( "backoff-grammar=s" => \$bkoffgram, "glue-grammar=s" => \$gluegram, + "oov-grammar=s" => \$oovgram, "data=s" => \$dataDir, "pmem=s" => \$PMEM, "features=s@" => \@features, "use-fork" => \$usefork, "jobs=i" => \$JOBS, "out-dir=s" => \$outdir, + "lmorder=i" => \$lmorder, ) == 0 || @ARGV!=2 || $help) { print_help(); exit; @@ -214,7 +218,6 @@ my $testini = mydircat($outdir, "cdec-test.ini"); write_cdec_ini($testini, $testgrammar); - # VEST print STDERR "\nMINIMUM ERROR TRAINING\n"; my $tuned_weights = mydircat($outdir, 'weights.tuned'); @@ -294,17 +297,18 @@ sub write_cdec_ini { my ($filename, $grammar_path) = (@_); open CDECINI, ">$filename" or die "Can't write $filename: $!"; my $glue = ($gluegram ? "$glue_grmr" : "$datadir/glue/glue.scfg.gz"); + my $oov = ($oovgram ? "$oovgram" : "$datadir/oov.scfg.gz"); print CDECINI < \$BASE_PHRASE_MAX_SIZE, 'language=s' => \$LANGUAGE, 'get_name_only' => \$NAME_SHORTCUT, 'preserve_phrases' => \$PRESERVE_PHRASES, + 'morf=s' => \$MORFMARK, ); if ($NAME_SHORTCUT) { $NUM_TOPICS = $NUM_TOPICS_FINE; @@ -216,7 +219,7 @@ sub cluster_dir { } sub labeled_dir { - if (lc($MODEL) eq "pyp" && $LABEL_THRESHOLD != 0) { + if (lc($MODEL) eq "pyp" && $LABEL_THRESHOLD ne "0") { return cluster_dir() . "_lt$LABEL_THRESHOLD"; } else { return cluster_dir(); @@ -272,8 +275,13 @@ sub extract_context { $ccopt = "-c 0"; $postsort = "" unless ($PRESERVE_PHRASES); } + my $presort = ($PRESERVE_PHRASES ? "| $REMOVE_TAGS_CONTEXT --phrase=tok --context=tag " : ""); + if ($MORFMARK ne "") { + $presort = $presort . "| $MORF_DOC_FILTER \"$MORFMARK\" "; + } + my $cmd = "$EXTRACTOR -i $CORPUS_CLUSTER $ccopt -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE --phrase_language $LANGUAGE --context_language $LANGUAGE $presort | $SORT_KEYS $postsort | $GZIP > $OUT_CONTEXTS"; safesystem($cmd) or die "Failed to extract contexts."; } @@ -351,7 +359,7 @@ sub label_spans_with_topics { safesystem("$ZCAT $IN_CLUSTERS > $CLUSTER_DIR/clusters.txt") or die "Failed to unzip"; safesystem("$EXTRACTOR --base_phrase_spans -i $CORPUS_CLUSTER -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -S $CONTEXT_SIZE | $S2L $CLUSTER_DIR/clusters.txt $CONTEXT_SIZE $LABEL_THRESHOLD $extra > $OUT_SPANS") or die "Failed to label spans"; unlink("$CLUSTER_DIR/clusters.txt") or warn "Failed to remove $CLUSTER_DIR/clusters.txt"; - safesystem("paste -d ' ' $CORPUS_LEX $OUT_SPANS > $LABELED_DIR/corpus.src_trg_al_label") or die "Couldn't paste"; + safesystem("paste -d ' ' $CORPUS_LEX $OUT_SPANS | sed 's/ *||| *\$//' > $LABELED_DIR/corpus.src_trg_al_label") or die "Couldn't paste"; } } -- cgit v1.2.3