summary refs log tree commit diff
path: root/phrase2_extraction
diff options
context:
space:
mode:
author Patrick Simianer <p@simianer.de> 2016-05-10 10:49:09 +0200
committer Patrick Simianer <p@simianer.de> 2016-05-10 10:49:09 +0200
commit 6bd7135e6039b0682f49234e42451077413f0bd9 (patch)
tree e30c76fbbea82f89dd10e56c35e70ec357bb24b4 /phrase2_extraction
parent 96d6f8a3fc83e075f8054d48ea8c6973ca534b65 (diff)
count clicks and keystrokes, fix bug in rule addition, good params and improvement for phrase2 extraction
Diffstat (limited to 'phrase2_extraction')
-rwxr-xr-x phrase2_extraction/phrase2_extraction.rb 38
1 files changed, 34 insertions, 4 deletions
diff --git a/phrase2_extraction/phrase2_extraction.rb b/phrase2_extraction/phrase2_extraction.rb
index 253df1b..547e0be 100755
--- a/phrase2_extraction/phrase2_extraction.rb
+++ b/phrase2_extraction/phrase2_extraction.rb
@@ -5,9 +5,9 @@ require 'zipf'
module PhrasePhraseExtraction
DEBUG = false
-MAX_NT = 1 # Chiang: 2
-MAX_SEED_NUM_WORDS = 10 # Chiang: 10 words
-MAX_SRC_SZ = 5 # Chiang: 5 words
+MAX_NT = 2 # Chiang: 2
+MAX_SEED_NUM_WORDS = 4 # Chiang: 10 words, -> phrases!
+MAX_SRC_SZ = 10 # Chiang: 5 words, -> words!
FORBID_SRC_ADJACENT_SRC_NT = true # Chiang:true
class Rule
@@ -51,6 +51,21 @@ class Rule
return src_len
end
+ def len_src_w # Returns the size of @source counted in whitespace-separated tokens.
+ src_len = 0
+ @source.each { |i|
+ if i.is_a? String # String element: add the number of tokens String#split yields
+ src_len += i.split.size #1
+ else # non-String element: must respond to each; every j is looked up in source_context
+ i.each { |j|
+ src_len += source_context[j].split.size # source_context is defined outside this hunk -- presumably an accessor on Rule; confirm
+ }
+ end
+ }
+
+ return src_len
+ end
+
def len_tgt
tgt_len = 0
@target.each { |i|
@@ -64,6 +79,21 @@ class Rule
return tgt_len
end
+ def len_tgt_w # Returns the size of @target counted in whitespace-separated tokens.
+ tgt_len = 0
+ @target.each { |i|
+ if i.is_a? String # String element: add the number of tokens String#split yields
+ tgt_len += i.split.size
+ else # non-String element: must respond to each; every j is looked up in target_context
+ i.each { |j|
+ tgt_len += target_context[j].split.size # target_context is defined outside this hunk -- presumably an accessor on Rule; confirm
+ }
+ end
+ }
+
+ return tgt_len
+ end
+
def to_s
source_string = ""
@source.each { |i|
@@ -625,7 +655,7 @@ end
def PhrasePhraseExtraction.remove_too_long_src_sides rules
return rules.reject { |r|
- r.len_src > PhrasePhraseExtraction::MAX_SRC_SZ
+ r.len_src_w > PhrasePhraseExtraction::MAX_SRC_SZ # drop rules whose token-counted source length (len_src_w) exceeds the limit
}
end