summary refs log tree commit diff
path: root/phrase2_extraction/phrase2_extraction.rb
diff options
context:
space:
mode:
Diffstat (limited to 'phrase2_extraction/phrase2_extraction.rb')
-rwxr-xr-x phrase2_extraction/phrase2_extraction.rb 36
1 files changed, 33 insertions, 3 deletions
diff --git a/phrase2_extraction/phrase2_extraction.rb b/phrase2_extraction/phrase2_extraction.rb
index 48dfd73..547e0be 100755
--- a/phrase2_extraction/phrase2_extraction.rb
+++ b/phrase2_extraction/phrase2_extraction.rb
@@ -6,8 +6,8 @@ module PhrasePhraseExtraction
DEBUG = false
MAX_NT = 2 # Chiang: 2
-MAX_SEED_NUM_WORDS = 3 # Chiang: 10 words
-MAX_SRC_SZ = 3 # Chiang: 5 words
+MAX_SEED_NUM_WORDS = 4 # Chiang: 10 words, -> phrases!
+MAX_SRC_SZ = 10 # Chiang: 5 words, -> words!
FORBID_SRC_ADJACENT_SRC_NT = true # Chiang:true
class Rule
@@ -51,6 +51,21 @@ class Rule
return src_len
end
+ def len_src_w # Sums whitespace-separated token counts over every item of @source and returns the total.
+ src_len = 0
+ @source.each { |i|
+ if i.is_a? String # String item: count the tokens of the string itself.
+ src_len += i.split.size #1
+ else # Non-String item: iterated with each; every element j is used to look up an entry in source_context.
+ i.each { |j|
+ src_len += source_context[j].split.size # source_context is defined outside this view; the looked-up entry is split and its tokens counted.
+ }
+ end
+ }
+
+ return src_len # Integer total accumulated above (0 when @source is empty).
+ end
+
def len_tgt
tgt_len = 0
@target.each { |i|
@@ -64,6 +79,21 @@ class Rule
return tgt_len
end
+ def len_tgt_w # Sums whitespace-separated token counts over every item of @target and returns the total.
+ tgt_len = 0
+ @target.each { |i|
+ if i.is_a? String # String item: count the tokens of the string itself.
+ tgt_len += i.split.size
+ else # Non-String item: iterated with each; every element j is used to look up an entry in target_context.
+ i.each { |j|
+ tgt_len += target_context[j].split.size # target_context is defined outside this view; the looked-up entry is split and its tokens counted.
+ }
+ end
+ }
+
+ return tgt_len # Integer total accumulated above (0 when @target is empty).
+ end
+
def to_s
source_string = ""
@source.each { |i|
@@ -625,7 +655,7 @@ end
def PhrasePhraseExtraction.remove_too_long_src_sides rules # Returns rules without the entries whose source-side length exceeds MAX_SRC_SZ.
return rules.reject { |r| # Non-destructive reject (not reject!): the result is returned rather than rules being edited in place.
- r.len_src > PhrasePhraseExtraction::MAX_SRC_SZ
+ r.len_src_w > PhrasePhraseExtraction::MAX_SRC_SZ # This diff switches the check from len_src to len_src_w; entries equal to the limit are kept (strict >).
}
end