summary | refs | log | tree | commit | diff
path: root/dtrain/job
diff options
context:
space:
mode:
author Patrick Simianer <p@simianer.de> 2011-09-23 20:53:15 +0200
committer Patrick Simianer <p@simianer.de> 2011-09-23 20:53:15 +0200
commit 9bde56ed23b4b97f8193f9f8f582f18086ff17c1 (patch)
tree 83bd5687f2069405537f7f8fbdfbe208a634ca54 /dtrain/job
parent 4433886ac335e6db7ded081b5ef673490ee27718 (diff)
begin refactoring
Diffstat (limited to 'dtrain/job')
-rwxr-xr-x dtrain/job/avgweights.rb 30
-rw-r--r-- dtrain/job/cdec.ini 8
-rw-r--r-- dtrain/job/dtrain.ini 10
-rwxr-xr-x dtrain/job/dtrain.sh 6
-rwxr-xr-x dtrain/job/hadoop-streaming-job.sh 23
5 files changed, 0 insertions, 77 deletions
diff --git a/dtrain/job/avgweights.rb b/dtrain/job/avgweights.rb
deleted file mode 100755
index e635aab4..00000000
--- a/dtrain/job/avgweights.rb
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/usr/bin/env ruby1.9.1
-
-
-STDIN.set_encoding 'utf-8'
-
-#shard_count_key = "__SHARD_COUNT__"
-
-w = {}
-#c = {}
-w.default = 0
-#c.default = 0
-while line = STDIN.gets
- key, val = line.split /\t/
- w[key] += val.to_f
- #c[key] += 1.0
-end
-
-#shard_count = w["__SHARD_COUNT__"]
-
-num_map = 104.0
-
-w.each_key { |k|
- #if k == shard_count_key then next end
- #if k == "__bias" then next end
- puts "#{k}\t#{w[k]/num_map}"
- #/c[k]}" #{w[k]/shard_count}"
-}
-
-#puts "#{shard_count_key}\t#{w[shard_count_key]}"
-
diff --git a/dtrain/job/cdec.ini b/dtrain/job/cdec.ini
deleted file mode 100644
index 0d32f0b7..00000000
--- a/dtrain/job/cdec.ini
+++ /dev/null
@@ -1,8 +0,0 @@
-formalism=scfg
-add_pass_through_rules=true
-feature_function=WordPenalty
-cubepruning_pop_limit=30
-feature_function=KLanguageModel nc-wmt11.en.srilm.3.gz
-feature_function=RuleIdentityFeatures
-scfg_max_span_limit=15
-
diff --git a/dtrain/job/dtrain.ini b/dtrain/job/dtrain.ini
deleted file mode 100644
index 079d7d69..00000000
--- a/dtrain/job/dtrain.ini
+++ /dev/null
@@ -1,10 +0,0 @@
-decoder_config=cdec.ini
-kbest=100
-ngrams=4
-epochs=10
-input=-
-scorer=stupid_bleu
-output=-
-#stop_after=100
-#wprint=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PassThrough
-
diff --git a/dtrain/job/dtrain.sh b/dtrain/job/dtrain.sh
deleted file mode 100755
index 75ec29ea..00000000
--- a/dtrain/job/dtrain.sh
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/bin/sh
-
-./dtrain -q -c dtrain.ini
-
-exit 0
-
diff --git a/dtrain/job/hadoop-streaming-job.sh b/dtrain/job/hadoop-streaming-job.sh
deleted file mode 100755
index 2cf3f50a..00000000
--- a/dtrain/job/hadoop-streaming-job.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/bin/bash
-
-HADOOP_HOME=/usr/lib/hadoop-0.20
-JAR=contrib/streaming/hadoop-streaming-0.20.2-cdh3u1.jar
-HSTREAMING="$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/$JAR"
-
-IN=in/nc-wmt11-de-en-dyer-cs-joshua.tok.lc.fixamp1.loo.psg.dtrain.1400m
-OUT=out/nc-wmt11-de-en-dyer-cs-joshua.tok.lc.fixamp1.loo.psg.dtrain-weights-1400m
-
-$HSTREAMING \
- -mapper "dtrain.sh" \
- -reducer "avgweights.rb" \
- -input $IN \
- -output $OUT \
- -file avgweights.rb \
- -file dtrain.sh \
- -file dtrain \
- -file dtrain.ini \
- -file cdec.ini \
- -file nc-wmt11.en.srilm.3.gz \
- -jobconf mapred.reduce.tasks=1 \
- -jobconf mapred.max.map.failures.percent=100
-