From 241a0317966f00546aaf3b5c02c137097cfaccda Mon Sep 17 00:00:00 2001
From: "trevor.cohn" <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f>
Date: Sun, 18 Jul 2010 22:43:24 +0000
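Subject: spans2labels: log progress to stderr and leave unseen spans unlabelled; use zcat in simple-extract.sh; raise PR trainer heap to 60g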

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@312 ec762483-ff6d-05da-a07a-a48fb63a330f
---
 extools/extractor.cc                                   |  1 +
 extools/simple-extract.sh                              |  2 +-
 gi/posterior-regularisation/prjava/train-PR-cluster.sh |  2 +-
 gi/pyp-topics/scripts/spans2labels.py                  | 11 ++++++++---
 4 files changed, 11 insertions(+), 5 deletions(-)
diff --git a/extools/extractor.cc b/extools/extractor.cc
index 7279f745..71778d49 100644
--- a/extools/extractor.cc
+++ b/extools/extractor.cc
@@ -396,6 +396,7 @@ int main(int argc, char** argv) {
     ++line;
     in.getline(buf, MAX_LINE_LENGTH);
     if (buf[0] == 0) continue;
+    //cerr << "line #" << line << " = " << buf << endl;
     if (!silent) {
       if (line % 200 == 0) cerr << '.';
       if (line % 8000 == 0) cerr << " [" << line << "]\n" << flush;
diff --git a/extools/simple-extract.sh b/extools/simple-extract.sh
index 7d9f439d..ec5c5276 100755
--- a/extools/simple-extract.sh
+++ b/extools/simple-extract.sh
@@ -6,6 +6,6 @@ date
 date
 # -p = compute phrase marginals
 # -b = bidirectional rules (starting with F or E) were extracted
-gzcat ex.output.gz | ./mr_stripe_rule_reduce -p -b | sort -t $'\t' -k 1 | ./mr_stripe_rule_reduce | gzip > phrase-table.gz
+zcat ex.output.gz | ./mr_stripe_rule_reduce -p -b | sort -t $'\t' -k 1 | ./mr_stripe_rule_reduce | gzip > phrase-table.gz
 date
 
diff --git a/gi/posterior-regularisation/prjava/train-PR-cluster.sh b/gi/posterior-regularisation/prjava/train-PR-cluster.sh
index 4d4c68d0..8298aa14 100755
--- a/gi/posterior-regularisation/prjava/train-PR-cluster.sh
+++ b/gi/posterior-regularisation/prjava/train-PR-cluster.sh
@@ -1,4 +1,4 @@
 #!/bin/sh
 
 d=`dirname $0`
-java -ea -Xmx8g -cp $d/prjava.jar:$d/lib/trove-2.0.2.jar:$d/lib/optimization.jar:$d/lib/jopt-simple-3.2.jar:$d/lib/lib/commons-math-2.1.jar phrase.Trainer $*
+java -ea -Xmx60g -cp $d/prjava.jar:$d/lib/trove-2.0.2.jar:$d/lib/optimization.jar:$d/lib/jopt-simple-3.2.jar:$d/lib/lib/commons-math-2.1.jar phrase.Trainer $*
diff --git a/gi/pyp-topics/scripts/spans2labels.py b/gi/pyp-topics/scripts/spans2labels.py
index f3968616..73ea20f2 100755
--- a/gi/pyp-topics/scripts/spans2labels.py
+++ b/gi/pyp-topics/scripts/spans2labels.py
@@ -20,6 +20,7 @@ if len(sys.argv) > 4:
   assert phr in 'stb'
   assert ctx in 'stb'
 
+print >>sys.stderr, "Loading phrase index"
 phrase_context_index = {}
 for line in file(sys.argv[1], 'r'):
   phrase,tail= line.split('\t')
@@ -37,17 +38,20 @@ for line in file(sys.argv[1], 'r'):
     features=dict([ keyval.split('=') for keyval in contexts[i+1].split()])
     category = features['C']    
     if features.has_key('P') and float(features['P']) < threshold:
-        category = cutoff_cat
+	category = cutoff_cat
     
     phrase_context_index[(phrase,contexts[i])] = category 
-#   print (phrase,contexts[i]), category, prob
+    #print (phrase,contexts[i]), category
 
+print >>sys.stderr, "Labelling spans"
 for line in sys.stdin:
   line_segments = line.split('|||')
   source = ['<s>' for x in range(order)] + line_segments[0].split() + ['</s>' for x in range(order)]
   target = ['<s>' for x in range(order)] + line_segments[1].split() + ['</s>' for x in range(order)]
   phrases = [ [int(i) for i in x.split('-')] for x in line_segments[2].split()]
 
+  #print >>sys.stderr, "line", source, '---', target, 'phrases', phrases
+
   print "|||",
 
   for s1,s2,t1,t2 in phrases:
@@ -85,7 +89,8 @@ for line in sys.stdin:
     else:
         context = contextt
 
-    label = phrase_context_index.get((phrase,context), "<UNK>")
+    #print "%d-%d-%d-%d looking up" % (s1-order,s2-order,t1-order,t2-order), (phrase, context)
+    label = phrase_context_index.get((phrase,context), cutoff_cat)
     if label != cutoff_cat: #cutoff'd spans are left unlabelled
       print "%d-%d-%d-%d:X%s" % (s1-order,s2-order,t1-order,t2-order,label),
   print
-- 
cgit v1.2.3