diff options
author | Patrick Simianer <p@simianer.de> | 2016-05-10 10:49:09 +0200 |
---|---|---|
committer | Patrick Simianer <p@simianer.de> | 2016-05-10 10:49:09 +0200 |
commit | 6bd7135e6039b0682f49234e42451077413f0bd9 (patch) | |
tree | e30c76fbbea82f89dd10e56c35e70ec357bb24b4 /phrase2_extraction | |
parent | 96d6f8a3fc83e075f8054d48ea8c6973ca534b65 (diff) |
count clicks and keystrokes, fix bug in rule addition, good params and improvement for phrase2 extraction
Diffstat (limited to 'phrase2_extraction')
-rwxr-xr-x | phrase2_extraction/phrase2_extraction.rb | 38 |
1 file changed, 34 insertions, 4 deletions
```diff
diff --git a/phrase2_extraction/phrase2_extraction.rb b/phrase2_extraction/phrase2_extraction.rb
index 253df1b..547e0be 100755
--- a/phrase2_extraction/phrase2_extraction.rb
+++ b/phrase2_extraction/phrase2_extraction.rb
@@ -5,9 +5,9 @@ require 'zipf'
 module PhrasePhraseExtraction
 
 DEBUG = false
-MAX_NT = 1 # Chiang: 2
-MAX_SEED_NUM_WORDS = 10 # Chiang: 10 words
-MAX_SRC_SZ = 5 # Chiang: 5 words
+MAX_NT = 2 # Chiang: 2
+MAX_SEED_NUM_WORDS = 4 # Chiang: 10 words, -> phrases!
+MAX_SRC_SZ = 10 # Chiang: 5 words, -> words!
 FORBID_SRC_ADJACENT_SRC_NT = true # Chiang:true
 
 class Rule
@@ -51,6 +51,21 @@ class Rule
     return src_len
   end
 
+  def len_src_w
+    src_len = 0
+    @source.each { |i|
+      if i.is_a? String
+        src_len += i.split.size #1
+      else
+        i.each { |j|
+          src_len += source_context[j].split.size
+        }
+      end
+    }
+
+    return src_len
+  end
+
   def len_tgt
     tgt_len = 0
     @target.each { |i|
@@ -64,6 +79,21 @@ class Rule
     return tgt_len
   end
 
+  def len_tgt_w
+    tgt_len = 0
+    @target.each { |i|
+      if i.is_a? String
+        tgt_len += i.split.size
+      else
+        i.each { |j|
+          tgt_len += target_context[j].split.size
+        }
+      end
+    }
+
+    return tgt_len
+  end
+
   def to_s
     source_string = ""
     @source.each { |i|
@@ -625,7 +655,7 @@ end
 
 def PhrasePhraseExtraction.remove_too_long_src_sides rules
   return rules.reject { |r|
-    r.len_src > PhrasePhraseExtraction::MAX_SRC_SZ
+    r.len_src_w > PhrasePhraseExtraction::MAX_SRC_SZ
   }
 end
 
```