diff options
-rw-r--r-- | extools/extractor.cc | 1 | ||||
-rwxr-xr-x | extools/simple-extract.sh | 2 | ||||
-rwxr-xr-x | gi/posterior-regularisation/prjava/train-PR-cluster.sh | 2 | ||||
-rwxr-xr-x | gi/pyp-topics/scripts/spans2labels.py | 11 |
4 files changed, 11 insertions, 5 deletions
diff --git a/extools/extractor.cc b/extools/extractor.cc index 7279f745..71778d49 100644 --- a/extools/extractor.cc +++ b/extools/extractor.cc @@ -396,6 +396,7 @@ int main(int argc, char** argv) { ++line; in.getline(buf, MAX_LINE_LENGTH); if (buf[0] == 0) continue; + //cerr << "line #" << line << " = " << buf << endl; if (!silent) { if (line % 200 == 0) cerr << '.'; if (line % 8000 == 0) cerr << " [" << line << "]\n" << flush; diff --git a/extools/simple-extract.sh b/extools/simple-extract.sh index 7d9f439d..ec5c5276 100755 --- a/extools/simple-extract.sh +++ b/extools/simple-extract.sh @@ -6,6 +6,6 @@ date date # -p = compute phrase marginals # -b = bidirectional rules (starting with F or E) were extracted -gzcat ex.output.gz | ./mr_stripe_rule_reduce -p -b | sort -t $'\t' -k 1 | ./mr_stripe_rule_reduce | gzip > phrase-table.gz +zcat ex.output.gz | ./mr_stripe_rule_reduce -p -b | sort -t $'\t' -k 1 | ./mr_stripe_rule_reduce | gzip > phrase-table.gz date diff --git a/gi/posterior-regularisation/prjava/train-PR-cluster.sh b/gi/posterior-regularisation/prjava/train-PR-cluster.sh index 4d4c68d0..8298aa14 100755 --- a/gi/posterior-regularisation/prjava/train-PR-cluster.sh +++ b/gi/posterior-regularisation/prjava/train-PR-cluster.sh @@ -1,4 +1,4 @@ #!/bin/sh d=`dirname $0` -java -ea -Xmx8g -cp $d/prjava.jar:$d/lib/trove-2.0.2.jar:$d/lib/optimization.jar:$d/lib/jopt-simple-3.2.jar:$d/lib/lib/commons-math-2.1.jar phrase.Trainer $* +java -ea -Xmx60g -cp $d/prjava.jar:$d/lib/trove-2.0.2.jar:$d/lib/optimization.jar:$d/lib/jopt-simple-3.2.jar:$d/lib/lib/commons-math-2.1.jar phrase.Trainer $* diff --git a/gi/pyp-topics/scripts/spans2labels.py b/gi/pyp-topics/scripts/spans2labels.py index f3968616..73ea20f2 100755 --- a/gi/pyp-topics/scripts/spans2labels.py +++ b/gi/pyp-topics/scripts/spans2labels.py @@ -20,6 +20,7 @@ if len(sys.argv) > 4: assert phr in 'stb' assert ctx in 'stb' +print >>sys.stderr, "Loading phrase index" phrase_context_index = {} for line in file(sys.argv[1], 'r'): phrase,tail= line.split('\t') @@ -37,17 +38,20 @@ for line in file(sys.argv[1], 'r'): features=dict([ keyval.split('=') for keyval in contexts[i+1].split()]) category = features['C'] if features.has_key('P') and float(features['P']) < threshold: - category = cutoff_cat + category = cutoff_cat phrase_context_index[(phrase,contexts[i])] = category -# print (phrase,contexts[i]), category, prob + #print (phrase,contexts[i]), category +print >>sys.stderr, "Labelling spans" for line in sys.stdin: line_segments = line.split('|||') source = ['<s>' for x in range(order)] + line_segments[0].split() + ['</s>' for x in range(order)] target = ['<s>' for x in range(order)] + line_segments[1].split() + ['</s>' for x in range(order)] phrases = [ [int(i) for i in x.split('-')] for x in line_segments[2].split()] + #print >>sys.stderr, "line", source, '---', target, 'phrases', phrases + print "|||", for s1,s2,t1,t2 in phrases: @@ -85,7 +89,8 @@ for line in sys.stdin: else: context = contextt - label = phrase_context_index.get((phrase,context), "<UNK>") + #print "%d-%d-%d-%d looking up" % (s1-order,s2-order,t1-order,t2-order), (phrase, context) + label = phrase_context_index.get((phrase,context), cutoff_cat) if label != cutoff_cat: #cutoff'd spans are left unlabelled print "%d-%d-%d-%d:X%s" % (s1-order,s2-order,t1-order,t2-order,label), print |