diff options
Diffstat (limited to 'gi')
| -rwxr-xr-x | gi/pipeline/local-gi-pipeline.pl | 48 | ||||
| -rw-r--r-- | gi/posterior-regularisation/prjava/src/phrase/Agree2Sides.java | 17 | ||||
| -rw-r--r-- | gi/posterior-regularisation/prjava/src/phrase/Trainer.java | 24 | 
3 files changed, 68 insertions, 21 deletions
| diff --git a/gi/pipeline/local-gi-pipeline.pl b/gi/pipeline/local-gi-pipeline.pl index 8353a242..96df34ea 100755 --- a/gi/pipeline/local-gi-pipeline.pl +++ b/gi/pipeline/local-gi-pipeline.pl @@ -39,8 +39,8 @@ die "Can't find posterior-regularisation: $PRTOOLS" unless -e $PRTOOLS && -d $PR  my $REDUCER = "$EXTOOLS/mr_stripe_rule_reduce";  my $C2D = "$PYPSCRIPTS/contexts2documents.py";  my $S2L = "$PYPSCRIPTS/spans2labels.py"; +my $SPLIT = "$SCRIPT_DIR/../posterior-regularisation/split-languages.py"; -my $PYP_TOPICS_TRAIN="$PYPTOOLS/pyp-contexts-train";  my $PREM_TRAIN="$PRTOOLS/prjava/train-PR-cluster.sh";  my $SORT_KEYS = "$SCRIPT_DIR/scripts/sort-by-key.sh"; @@ -48,9 +48,8 @@ my $PATCH_CORPUS = "$SCRIPT_DIR/scripts/patch-corpus.pl";  my $EXTRACTOR = "$EXTOOLS/extractor";  my $TOPIC_TRAIN = "$PYPTOOLS/pyp-contexts-train"; -assert_exec($PATCH_CORPUS, $SORT_KEYS, $REDUCER, $EXTRACTOR, #$PYP_TOPICS_TRAIN,  -            $S2L, $C2D #, $TOPIC_TRAIN -           ); +assert_exec($PATCH_CORPUS, $SORT_KEYS, $REDUCER, $EXTRACTOR, +            $S2L, $C2D, $TOPIC_TRAIN, $SPLIT);  my $BACKOFF_GRAMMAR;  my $DEFAULT_CAT; @@ -78,7 +77,7 @@ usage() unless &GetOptions('base_phrase_max_size=i' => \$BASE_PHRASE_MAX_SIZE,                             'pr-flags=s' => \$PR_FLAGS,                             'tagged_corpus=s' => \$TAGGED_CORPUS,                             'language=s' => \$LANGUAGE, -                           'get_name_only' => \$NAME_SHORTCUT, +                           'get_name_only' => \$NAME_SHORTCUT                            );  if ($NAME_SHORTCUT) {    $NUM_TOPICS = $NUM_TOPICS_FINE; @@ -132,7 +131,12 @@ if(-e $TOPICS_CONFIG) {  setup_data(); -extract_context(); +if (lc($MODEL) eq "blagree") { +    extract_bilingual_context(); +} else { +    extract_context(); +} +  if (lc($MODEL) eq "pyp") {      if($HIER_CAT) {          $NUM_TOPICS = $NUM_TOPICS_COARSE; @@ -199,6 +203,8 @@ sub cluster_dir {          return context_dir() . ".PR.t$NUM_TOPICS.i$NUM_ITERS.sp$PR_SCALE_P.sc$PR_SCALE_C";      } elsif (lc($MODEL) eq "agree") {          return context_dir() . ".AGREE.t$NUM_TOPICS.i$NUM_ITERS"; +    } elsif (lc($MODEL) eq "blagree") { +        return context_dir() . ".BLAGREE.t$NUM_TOPICS.i$NUM_ITERS";      }  } @@ -261,6 +267,29 @@ sub extract_context {    }  } +sub extract_bilingual_context { + print STDERR "\n!!!CONTEXT EXTRACTION\n";  + my $OUT_SRC_CONTEXTS = "$CONTEXT_DIR/context.source"; + my $OUT_TGT_CONTEXTS = "$CONTEXT_DIR/context.target"; + + if (-e $OUT_SRC_CONTEXTS . ".gz" and -e $OUT_TGT_CONTEXTS . ".gz") { +   print STDERR "$OUT_SRC_CONTEXTS.gz and $OUT_TGT_CONTEXTS.gz exist, reusing...\n"; + } else { +   my $OUT_BI_CONTEXTS = "$CONTEXT_DIR/context.bilingual.txt.gz"; +   my $cmd = "$EXTRACTOR -i $CORPUS_CLUSTER -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE --phrase_language both --context_language both | $SORT_KEYS | $REDUCER | $GZIP > $OUT_BI_CONTEXTS"; +   if ($COMPLETE_CACHE) { +     print STDERR "COMPLETE_CACHE is set: removing memory limits on cache.\n"; +     $cmd = "$EXTRACTOR -i $CORPUS_CLUSTER -c 0 -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE  --phrase_language both --context_language both  | $SORT_KEYS | $GZIP > $OUT_BI_CONTEXTS"; +   } +   safesystem($cmd) or die "Failed to extract contexts."; + +   safesystem("$ZCAT $OUT_BI_CONTEXTS | $SPLIT $OUT_SRC_CONTEXTS $OUT_TGT_CONTEXTS") or die "Failed to split contexts.\n"; +   safesystem("$GZIP -f $OUT_SRC_CONTEXTS") or die "Failed to zip output contexts.\n"; +   safesystem("$GZIP -f $OUT_TGT_CONTEXTS") or die "Failed to zip output contexts.\n"; + } +} + +  sub topic_train {    print STDERR "\n!!!TRAIN PYP TOPICS\n";    my $IN_CONTEXTS = "$CONTEXT_DIR/context.txt.gz"; @@ -274,18 +303,21 @@ sub topic_train {  sub prem_train {    print STDERR "\n!!!TRAIN PR/EM model\n"; -  my $IN_CONTEXTS = "$CONTEXT_DIR/context.txt.gz";    my $OUT_CLUSTERS = "$CLUSTER_DIR/docs.txt.gz";    if (-e $OUT_CLUSTERS) {      print STDERR "$OUT_CLUSTERS exists, reusing...\n";    } else { +    my $in = "--in $CONTEXT_DIR/context.txt.gz";      my $opts = "";      if (lc($MODEL) eq "pr") {          $opts = "--scale-phrase $PR_SCALE_P --scale-context $PR_SCALE_C";      } elsif (lc($MODEL) eq "agree") {          $opts = "--agree-direction"; +    } elsif (lc($MODEL) eq "blagree") { +        $in = "--in $CONTEXT_DIR/context.source.gz --in1 $CONTEXT_DIR/context.target.gz"; +        $opts = "--agree-language";      } -    safesystem("$PREM_TRAIN --in $IN_CONTEXTS --topics $NUM_TOPICS --out $OUT_CLUSTERS --iterations $NUM_ITERS $opts $PR_FLAGS") or die "Topic training failed.\n"; +    safesystem("$PREM_TRAIN $in --topics $NUM_TOPICS --out $OUT_CLUSTERS --iterations $NUM_ITERS $opts $PR_FLAGS") or die "Topic training failed.\n";    }  } diff --git a/gi/posterior-regularisation/prjava/src/phrase/Agree2Sides.java b/gi/posterior-regularisation/prjava/src/phrase/Agree2Sides.java index 8bf0b93e..031f887f 100644 --- a/gi/posterior-regularisation/prjava/src/phrase/Agree2Sides.java +++ b/gi/posterior-regularisation/prjava/src/phrase/Agree2Sides.java @@ -157,13 +157,16 @@ public class Agree2Sides {  	public double[] posterior(int edgeIdx) 
  	{
 -		Edge edge1=c1.getEdges().get(edgeIdx);
 -		Edge edge2=c2.getEdges().get(edgeIdx);
 -		double[] prob1=model1.posterior(edge1);
 -		double[] prob2=model2.posterior(edge2);
 +		return posterior(c1.getEdges().get(edgeIdx), c2.getEdges().get(edgeIdx));
 +	}
 +	
 +	public double[] posterior(Edge e1, Edge e2) 
 +	{
 +		double[] prob1=model1.posterior(e1);
 +		double[] prob2=model2.posterior(e2);
 -		llh+=edge1.getCount()*Math.log(arr.F.l1norm(prob1));
 -		llh+=edge2.getCount()*Math.log(arr.F.l1norm(prob2));
 +		llh+=e1.getCount()*Math.log(arr.F.l1norm(prob1));
 +		llh+=e2.getCount()*Math.log(arr.F.l1norm(prob2));
  		arr.F.l1normalize(prob1);
  		arr.F.l1normalize(prob2);
 @@ -177,7 +180,6 @@ public class Agree2Sides {  	public void displayPosterior(PrintStream ps)
  	{	
 -		
  		for (int i=0;i<c1.getEdges().size();i++)
  		{
  			Edge edge=c1.getEdges().get(i);
 @@ -192,5 +194,4 @@ public class Agree2Sides {  			ps.println(" ||| C=" + t);
  		}
  	}
 -	
  }
 diff --git a/gi/posterior-regularisation/prjava/src/phrase/Trainer.java b/gi/posterior-regularisation/prjava/src/phrase/Trainer.java index 202930f5..c1d4775e 100644 --- a/gi/posterior-regularisation/prjava/src/phrase/Trainer.java +++ b/gi/posterior-regularisation/prjava/src/phrase/Trainer.java @@ -166,8 +166,10 @@ public class Trainer  			last = o;  		} -		if (cluster == null) +		if (cluster == null && agree != null)  			cluster = agree.model1; +		else if (cluster == null && agree2sides != null) +			cluster = agree2sides.model1;  		double pl1lmax = cluster.phrase_l1lmax();  		double cl1lmax = cluster.context_l1lmax(); @@ -183,11 +185,23 @@ public class Trainer  					test = corpus.getEdges();  				else  				{	// if --test supplied, load up the file -					infile = (File) options.valueOf("test"); -					System.out.println("Reading testing concordance from " + infile); -					test = corpus.readEdges(FileUtil.reader(infile)); +					if (agree == null && agree2sides == null) +					{ +						infile = (File) options.valueOf("test"); +						System.out.println("Reading testing concordance from " + infile); +						test = corpus.readEdges(FileUtil.reader(infile)); +					} +					else +						System.err.println("Can't run agreement models on different test data cf training (yet); --test ignored.");  				} -				cluster.displayPosterior(ps, test); +				 +				if (agree != null) +					agree.displayPosterior(ps); +				else if (agree2sides != null) +					agree2sides.displayPosterior(ps); +				else +					cluster.displayPosterior(ps, test); +					  				ps.close();  			} catch (IOException e) {  				System.err.println("Failed to open either testing file or output file"); | 
