From ad309390887b947d997e4040dac98126ee9a356c Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Fri, 11 Dec 2015 16:09:13 +0100 Subject: phrase2 extraction: cosmetics --- phrase2_extraction/example.txt | 3 + phrase2_extraction/phrase2_extraction.rb | 718 ++++++++++++++++++++++++++++++ phrase_alignment/example.txt | 3 - phrase_alignment/phrase_alignment.rb | 735 ------------------------------- server.rb | 2 +- 5 files changed, 722 insertions(+), 739 deletions(-) create mode 100644 phrase2_extraction/example.txt create mode 100755 phrase2_extraction/phrase2_extraction.rb delete mode 100644 phrase_alignment/example.txt delete mode 100755 phrase_alignment/phrase_alignment.rb diff --git a/phrase2_extraction/example.txt b/phrase2_extraction/example.txt new file mode 100644 index 0000000..89bca35 --- /dev/null +++ b/phrase2_extraction/example.txt @@ -0,0 +1,3 @@ +a b c d +w x y z +0-1 1-0 2-2 3-3 diff --git a/phrase2_extraction/phrase2_extraction.rb b/phrase2_extraction/phrase2_extraction.rb new file mode 100755 index 0000000..be17940 --- /dev/null +++ b/phrase2_extraction/phrase2_extraction.rb @@ -0,0 +1,718 @@ +#!/usr/bin/env ruby + +require 'zipf' + +module PhrasePhraseExtraction + +DEBUG = false +MAX_NT = 2 # Chiang: 2 +MAX_SEED_NUM_WORDS = 3 # Chiang: 10 words +MAX_SRC_SZ = 3 # Chiang: 5 words +FORBID_SRC_ADJACENT_SRC_NT = true # Chiang:true + +class Rule + attr_accessor :source, :target, :arity, :source_context, :target_context, :alignment + + def initialize source_range=nil, target_range=nil, source_context=nil, target_context=nil, alignment=[] + if source_context && target_range && source_context && target_context + @source = [source_range] + @target = [target_range] + @source_context = source_context + @target_context = target_context + @alignment = alignment + else + @source = [] + @target = [] + @source_context = [] + @target_context = [] + @alignment = [] + end + @arity = 0 + end + + def hash + self.as_trule_string.hash + end + + def eql? other + self.as_trule_string == other.as_trule_string + end + + def len_src + src_len = 0 + @source.each { |i| + if i.is_a? String + src_len += 1 + else + src_len += i.last-i.first+1 + end + } + + return src_len + end + + def len_tgt + tgt_len = 0 + @target.each { |i| + if i.is_a? String + tgt_len += 1 + else + tgt_len += i.last-i.first+1 + end + } + + return tgt_len + end + + def to_s + source_string = "" + @source.each { |i| + if i.is_a? Range + source_string += @source_context[i].to_s + else + source_string += " #{i} " + end + } + target_string = "" + @target.each { |i| + if i.is_a? Range + target_string += @target_context[i].to_s + else + target_string += " #{i} " + end + } + + astr = "" + @alignment.each { |p| + astr += " #{p.first}-#{p.last}" + } + astr.strip! + + return "#{source_string.gsub(/\s+/, " ").strip} -> #{target_string.gsub(/\s+/, " ").strip} | #{astr}" + end + + def rebase_alignment + min_src = @alignment.map{|p| p.first }.min + min_tgt = @alignment.map{|p| p.last }.min + @alignment.each_with_index { |p,j| + @alignment[j] = [p.first-min_src, p.last-min_tgt] + } + end + + def rebase_alignment1 correct_src, correct_tgt, start_source, start_target + @alignment.each_with_index { |p,j| + if p[0] > start_source + @alignment[j][0] = [0,p.first-correct_src].max + end + if p[1] > start_target + @alignment[j][1] = [0,p.last-correct_tgt].max + end + } + end + + def as_trule_string + source_string = "" + @source.each { |i| + if i.is_a? 
Range + source_string += @source_context[i].join(" ").strip + else + source_string += " #{i} " + end + } + target_string = "" + @target.each { |i| + if i.is_a? Range + target_string += @target_context[i].join(" ").strip + else + target_string += " #{i} " + end + } + source_string = source_string.lstrip.strip + target_string = target_string.lstrip.strip + + astr = "" + @alignment.each { |p| + astr += " #{p.first}-#{p.last}" + } + astr.strip! + + #source_string.gsub!(/\[X,\d+\]/, "[X]") + return "[X] ||| #{source_string} ||| #{target_string} ||| NewRule=1 ||| #{astr}" + end + + def is_terminal? + #return false if @source.size>1 + #return false if @target.size>1 + @source.each { |i| return false if !i.is_a? Range } + @target.each { |i| return false if !i.is_a? Range } + return true + end + + # check if other_rule is a part of self + def mergeable_with? other_rule + return false if !other_rule.is_terminal? + other_source_begin = other_rule.source.first.first + other_source_end = other_rule.source.first.last + other_target_begin = other_rule.target.first.first + other_target_end = other_rule.target.first.last + b = false + @source.each { |i| + next if !i.is_a? Range + if ( other_source_begin >= i.first \ + && other_source_end <= i.last \ + && (!(other_source_begin==i.first && other_source_end==i.last))) + b = true + break + end + } + return false if !b + @target.each { |i| + next if !i.is_a? Range + if ( other_target_begin >= i.first \ + && other_target_end <= i.last \ + && (!(other_target_begin==i.first && other_target_end==i.last))) + b = true + break + end + } + + return b + end + + def self.split a, b, index=0, p="target" + return "[NEWX,#{index}]"if (a==b) + + aa = a.to_a + begin_split = b.first + end_split = b.last + + p1 = aa[0..aa.index([begin_split-1,aa.first].max)] + p2 = aa[aa.index([end_split+1, aa.last].min)..aa.last] + + nt = "[NEWX,#{index}]" + + ret = nil + if begin_split > a.first && end_split < a.last + ret = [(p1.first..p1.last), nt, (p2.first..p2.last)] + elsif begin_split == a.first + ret = [nt, (p2.first..p2.last)] + elsif end_split == a.last + ret = [(p1.first..p1.last), nt] + end + + return ret + end + + def self.merge r, s + return nil if !r.mergeable_with? s + return nil if !s.is_terminal? + + other_source_begin = s.source.first.first + other_source_end = s.source.first.last + other_target_begin = s.target.first.first + other_target_end = s.target.first.last + + new_rule = Rule.new + new_rule.source_context = r.source_context + new_rule.target_context = r.target_context + new_rule.arity = r.arity+1 + new_rule.alignment = Array.new + r.alignment.each { |p| new_rule.alignment << Array.new(p) } # deep copy + + c = new_rule.arity + done = false + correct_src = 0 + r.source.each_with_index { |i,j| + if i.is_a? Range + if ( !done \ + && other_source_begin >= i.first \ + && other_source_end <= i.last) + new_rule.source << Rule.split(i, (other_source_begin..other_source_end), c, "source") + new_rule.source.flatten! + done = true + else + new_rule.source << i + end + else + new_rule.source << i + end + } + # relabel Xs (linear on source side) + switch = false + k = 1 + new_rule.source.each_with_index { |i,j| + if i.is_a? String + m = i.match(/\[(X|NEWX),(\d+)\]/) + n = m[1] + l = m[2].to_i + if k != l + switch = true + end + new_rule.source[j] = "[#{n},#{k}]" + k += 1 + end + } + STDERR.write "switch #{switch}\n" if DEBUG + done = false + correct_tgt = 0 + r.target.each_with_index { |i,j| + if i.is_a? 
Range + if ( !done \ + && other_target_begin >= i.first \ + && other_target_end <= i.last) + new_rule.target << Rule.split(i, (other_target_begin..other_target_end), c) + new_rule.target.flatten! + done = true + else + new_rule.target << i + end + else + new_rule.target << i + reorder = true + end + } + + correct_src = r.len_src-new_rule.len_src + correct_tgt = r.len_tgt-new_rule.len_tgt + STDERR.write "correct_src #{correct_src}\n" + STDERR.write "correct_tgt #{correct_tgt}\n" + + start_correct_source = nil + j = 0 + fl = [] + new_rule.source.each { |i| + if i.is_a? Range + fl << new_rule.source_context[i] + else + if i.match(/\[NEWX,\d+\]/) + STDERR.write "j = #{j}\n" + start_correct_source = j + end + fl << i + end + j += 1 + } + fl.flatten! + + start_correct_target = nil + j = 0 + fl.each { |i| + if i.match(/\[NEWX,\d+\]/) + STDERR.write "j = #{j}\n" + start_correct_source = j + break + end + j += 1 + } + + el = [] + new_rule.target.each { |i| + if i.is_a? Range + el << new_rule.target_context[i] + else + el << i + end + j += 1 + } + el.flatten! + + start_correct_target = nil + j = 0 + el.each { |i| + if i.match(/\[NEWX,\d+\]/) + STDERR.write "j = #{j}\n" + start_correct_target = j + break + end + j += 1 + } + + if DEBUG + STDERR.write "start_correct_source = #{start_correct_source}\n" + STDERR.write "start_correct_target = #{start_correct_target}\n" + end + + new_rule.rebase_alignment1 correct_src, correct_tgt, start_correct_source, start_correct_target + STDERR.write "not uniq'ed #{new_rule.alignment.to_s}\n" if DEBUG + new_rule.alignment.uniq! + + if DEBUG + STDERR.write "a before: #{new_rule.alignment.to_s}\n" + STDERR.write "#{fl.to_s}\n" + end + new_rule.alignment.reject! { |p| + !fl[p.first] || !el[p.last] || fl[p.first].match(/\[(NEWX|X),\d+\]/) || el[p.last].match(/\[(NEWX|X),\d+\]/) + } + if DEBUG + STDERR.write "a after: #{new_rule.alignment.to_s}\n" + STDERR.write "old len_src #{r.len_src}\n" + STDERR.write "new len_src #{new_rule.len_src}\n" + STDERR.write "old len_tgt #{r.len_tgt}\n" + STDERR.write "new len_tgt #{new_rule.len_tgt}\n" + end + + if switch + new_rule.target.each_with_index { |i,j| + if i.is_a? String + m = i.match(/\[(X|NEWX),(\d+)\]/) + n = m[1] + k = m[2].to_i + l = nil + if k == 1 + l = 2 + else # 2 + l = 1 + end + new_rule.target[j] = "[#{n},#{l}]" + end + } + end + + new_rule.source.each_with_index { |i,j| + if i.is_a?(String) && i.match(/\[NEWX,\d\]/) + i.gsub!(/NEWX/, "X") + end + } + new_rule.target.each_with_index { |i,j| + if i.is_a?(String) && i.match(/\[NEWX,\d\]/) + i.gsub!(/NEWX/, "X") + end + } + + return new_rule + end + + def expand_fake_alignment + new_alignment = [] + if DEBUG + STDERR.write "#{@alignment.to_s}\n" + STDERR.write "#{@source.to_s}\n" + STDERR.write "#{@target.to_s}\n" + end + fl = @source.map { |i| + if i.is_a? Range + @source_context[i].map{|x|x.split} + else + i + end + }.flatten 1 + el = @target.map { |i| + if i.is_a? Range + @target_context[i].map{|x|x.split} + else + i + end + }.flatten 1 + if DEBUG + STDERR.write "#{fl.to_s}\n" + STDERR.write "#{el.to_s}\n" + STDERR.write "->\n" + end + + offsets_src = {} + #offsets_src.default = 0 + o = 0 + fl.each_with_index { |i,j| + if i.is_a? Array + o += i.size-1 + end + offsets_src[j] = o + } + offsets_tgt = {} + o = 0 + el.each_with_index { |i,j| + if i.is_a? 
Array + o += i.size-1 + end + offsets_tgt[j] = o + } + + @alignment.each { |p| + if DEBUG + STDERR.write "#{p.to_s}\n" + STDERR.write "#{offsets_src[p.first]} -- #{offsets_tgt[p.last]}\n" + end + new_alignment << [ p.first+offsets_src[p.first], p.last+offsets_tgt[p.last] ] + if DEBUG + STDERR.write "#{new_alignment.last.to_s}\n" + STDERR.write "---\n" + STDERR.write "\n" + end + } + @alignment = new_alignment + end + +end + +def PhrasePhraseExtraction.has_alignment a, i, dir="src" + index = 0 + index = 1 if dir=="tgt" + a.each { |p| + return true if p[index]==i + } + return false +end + +def PhrasePhraseExtraction.extract fstart, fend, estart, eend, f, e, a, flen, elen + a.each { |p| + fi=p[0]; ei=p[1] + if (fstart..fend).include? fi + if ei<estart || ei>eend + return [] + end + end + if (estart..eend).include? ei + if fi<fstart || fi>fend + return [] + end + end + + } + rules = [] + fs = fstart + loop do + fe = fend + loop do + rules << Rule.new(fs..fe, estart..eend, f, e) + a.each { |p| + if (fs..fe).include?(p.first) + rules.last.alignment << p + end + } + rules.last.rebase_alignment + fe += 1 + break if has_alignment(a, fe, "tgt")||fe>=elen + end + fs -= 1 + break has_alignment(a, fs, "src")||fs<0 + end + + return rules +end + +def PhrasePhraseExtraction.make_gappy_rules rules, seed_rules + MAX_NT.times { + new_rules = [] + rules.each { |r| + seed_rules.each { |s| + if r.mergeable_with? s + new = Rule.merge r, s + new_rules << new + STDERR.write "#{r.to_s} <<< #{s.to_s}\n" if DEBUG + STDERR.write " = #{new.to_s}\n\n" if DEBUG + end + } + } + rules += new_rules + } + + return rules +end + +def PhrasePhraseExtraction.make_seed_rules a, e, f + rules = [] + (0..e.size-1).each { |estart| + (estart..e.size-1).each { |eend| + + fstart = f.size-1 + fend = 0 + a.each { |p| + fi=p[0]; ei=p[1] + if estart<=ei && ei<=eend + fstart = [fi, fstart].min + fend = [fi, fend].max + end + } + next if fstart>fend + STDERR.write "fstart #{fstart}, fend #{fend}, estart #{estart}, eend #{eend}\n" if DEBUG + new_rules = extract fstart, fend, estart, eend, f, e, a, f.size, e.size + new_rules.each { |r| + STDERR.write "#{r.to_s}\n" if DEBUG + } + rules += new_rules + } + } + + return rules +end + +def PhrasePhraseExtraction.extract_rules f, e, as, expand=false + a = [] + as.each { |p| + x,y = p.split "-" + x = x.to_i; y = y.to_i + a << [x,y] + } + rules = PhrasePhraseExtraction.make_seed_rules a, e,f + seed_rules = PhrasePhraseExtraction.remove_too_large_seed_phrases rules + rules = PhrasePhraseExtraction.make_gappy_rules rules, seed_rules + + if PhrasePhraseExtraction::FORBID_SRC_ADJACENT_SRC_NT + rules = PhrasePhraseExtraction.remove_adjacent_nt rules + end + + rules = PhrasePhraseExtraction.remove_too_long_src_sides rules + + if expand + rules.each { |r| r.expand_fake_alignment } + end + + return rules.uniq +end + +def PhrasePhraseExtraction.remove_too_large_seed_phrases rules + return rules.reject { |r| + STDERR.write "#{r}\n" + src_len = r.len_src + tgt_len = r.len_tgt + src_len>PhrasePhraseExtraction::MAX_SEED_NUM_WORDS \ + || tgt_len>PhrasePhraseExtraction::MAX_SEED_NUM_WORDS } +end + +def PhrasePhraseExtraction.remove_adjacent_nt rules + return rules.reject { |r| + b = false + prev = false + r.source.each { |i| + if i.is_a?
String + if prev + b = true + break + end + prev = true + else + prev = false + end + } + b + } +end + +def PhrasePhraseExtraction.remove_too_long_src_sides rules + return rules.reject { |r| + r.len_src > PhrasePhraseExtraction::MAX_SRC_SZ + } +end + +def PhrasePhraseExtraction.test + # 0 1 2 3 + # a b c d + # w x y z + # 0-0 + # 1-3 + # 2-2 + # 3-1 + ra = Rule.new + rb = Rule.new + ra.source = [(0..2), "[X,1]"] + ra.target = [(0..0), "[X,1]", (2..3)] + ra.source_context = ["a", "b", "c", "d"] + ra.target_context = ["w", "x", "y", "z"] + ra.alignment = [[0,0],[1,3],[2,2]] + ra.arity = 1 + rb.source = [(1..1)] + rb.target = [(3..3)] + rb.source_context = ["a", "b", "c", "d"] + rb.target_context = ["w", "x", "y", "z"] + rb.alignment = [[0,0]] + rb.arity = 0 + + puts ra.mergeable_with? rb + nr = Rule.merge ra, rb + puts ra.to_s + puts rb.to_s + puts nr.to_s +end + +def PhrasePhraseExtraction.test_phrase + ra = Rule.new + rb = Rule.new + ra.source = [(0..2), "[X,1]"] + ra.target = [(0..0), "[X,1]", (2..3)] + ra.source_context = ["a a", "b b", "c c", "d d"] + ra.target_context = ["w w", "x x", "y y", "z z"] + ra.alignment = [[0,0],[1,3],[2,2]] + #ra.expand_fake_alignment + ra.arity = 1 + rb.source = [(1..1)] + rb.target = [(3..3)] + rb.source_context = ra.source_context + rb.target_context = rb.source_context + rb.alignment = [[0,0]] + #rb.expand_fake_alignment + rb.arity = 0 + + puts ra.mergeable_with? rb + nr = Rule.merge ra, rb + puts ra.to_s + puts rb.to_s + nr.expand_fake_alignment + puts nr.to_s +end + +def PhrasePhraseExtraction.test_phrase1 + source_context = ["a", "b", "c", "Blechbänder", ", besteht", "der Spreizdorn im wesentlichen", "aus", "x"] + target_context = ["w", "x", "y", "the expansion", "mandrel consists", "essentially of expansion mandrel", "z"] + + ra = Rule.new + ra.source = ["[X,1]", (3..6)] + ra.target = ["[X,1]", (3..5)] + ra.source_context = source_context + ra.target_context = target_context + ra.alignment = [[1,1],[2,2],[3,3],[4,2]] + ra.arity = 1 + + rb = Rule.new + rb.source = [(4..6)] + rb.target = [(4..5)] + rb.source_context = source_context + rb.target_context = target_context + rb.alignment = [[0,0],[1,1],[2,0]] + rb.arity = 0 + + puts ra.mergeable_with? rb + nr = Rule.merge ra, rb + puts ra.to_s + puts rb.to_s + nr.expand_fake_alignment + puts nr.to_s +end + +end # module + +def main + file = ReadFile.new ARGV[0] + + f = file.gets.split + e = file.gets.split + a = [] + file.gets.split.each { |p| + x,y = p.split "-" + x = x.to_i; y = y.to_i + a << [x,y] + } + rules = PhrasePhraseExtraction.make_seed_rules a, e, f + seed_rules = PhrasePhraseExtraction.remove_too_large_seed_phrases rules + rules = PhrasePhraseExtraction.make_gappy_rules rules, seed_rules + + if PhrasePhraseExtraction::FORBID_SRC_ADJACENT_SRC_NT + rules = PhrasePhraseExtraction.remove_adjacent_nt rules + end + + rules = PhrasePhraseExtraction.remove_too_long_src_sides rules + + rules.uniq! 
+ + rules.each { |r| + puts r.as_trule_string + } +end +main + +def test + PhrasePhraseExtraction.test + PhrasePhraseExtraction.test_phrase + PhrasePhraseExtraction.test_phrase1 +end +#test + diff --git a/phrase_alignment/example.txt b/phrase_alignment/example.txt deleted file mode 100644 index 89bca35..0000000 --- a/phrase_alignment/example.txt +++ /dev/null @@ -1,3 +0,0 @@ -a b c d -w x y z -0-1 1-0 2-2 3-3 diff --git a/phrase_alignment/phrase_alignment.rb b/phrase_alignment/phrase_alignment.rb deleted file mode 100755 index 1c1a0ed..0000000 --- a/phrase_alignment/phrase_alignment.rb +++ /dev/null @@ -1,735 +0,0 @@ -#!/usr/bin/env ruby - -require 'zipf' - - -module PhrasePhraseExtraction - -DEBUG=true -MAX_NT=2 # chiang:2 -MAX_SEED_NUM_WORDS=3 # chiang:10 words phrases! -MAX_SRC_SZ=3 # chiang:5 words phrases! -FORBID_SRC_ADJACENT_SRC_NT=true # chiang:true - -class Rule - attr_accessor :source, :target, :arity, :source_context, :target_context, :alignment - - def initialize source_range=nil, target_range=nil, source_context=nil, target_context=nil, alignment=[] - if source_context && target_range && source_context && target_context - @source = [source_range] - @target = [target_range] - @source_context = source_context - @target_context = target_context - @alignment = alignment - else - @source = [] - @target = [] - @source_context = [] - @target_context = [] - @alignment = [] - end - @arity = 0 - end - - def <=> other_rule - end - - def hash - self.as_trule_string.hash - end - - def eql? other - self.as_trule_string == other.as_trule_string - end - - def len_src - src_len = 0 - @source.each { |i| - if i.is_a? String - src_len += 1 - else - src_len += i.last-i.first+1 - end - } - - return src_len - end - - def len_tgt - tgt_len = 0 - @target.each { |i| - if i.is_a? String - tgt_len += 1 - else - tgt_len += i.last-i.first+1 - end - } - - return tgt_len - end - - def len - src_len = 0 - @source.each { |i| - if i.is_a? String - src_len += 1 - else - src_len += i.last-i.first+1 - end - } - tgt_len = 0 - @target.each { |i| - if i.is_a? String - tgt_len += 1 - else - tgt_len += i.last-i.first+1 - end - } - return [src_len, tgt_len] - end - - def to_s - source_string = "" - @source.each { |i| - if i.is_a? Range - source_string += @source_context[i].to_s - else - source_string += " #{i} " - end - } - target_string = "" - @target.each { |i| - if i.is_a? Range - target_string += @target_context[i].to_s - else - target_string += " #{i} " - end - } - - astr = "" - @alignment.each { |p| - astr += " #{p.first}-#{p.last}" - } - astr.strip! - - return "#{source_string.gsub(/\s+/, " ").strip} -> #{target_string.gsub(/\s+/, " ").strip} | #{astr}" - end - - def base_alignment - min_src = @alignment.map{|p| p.first }.min - min_tgt = @alignment.map{|p| p.last }.min - @alignment.each_with_index { |p,j| - @alignment[j] = [p.first-min_src, p.last-min_tgt] - } - end - - def base_alignment2 correct_src, correct_tgt, start_source, start_target - @alignment.each_with_index { |p,j| - if p[0] > start_source - @alignment[j][0] = [0,p.first-correct_src].max - end - if p[1] > start_target - @alignment[j][1] = [0,p.last-correct_tgt].max - end - } - end - - def as_trule_string - source_string = "" - @source.each { |i| - if i.is_a? Range - source_string += @source_context[i].join(" ").strip - else - source_string += " #{i} " - end - } - target_string = "" - @target.each { |i| - if i.is_a? 
Range - target_string += @target_context[i].join(" ").strip - else - target_string += " #{i} " - end - } - source_string = source_string.lstrip.strip - target_string = target_string.lstrip.strip - - astr = "" - @alignment.each { |p| - astr += " #{p.first}-#{p.last}" - } - astr.strip! - - #source_string.gsub!(/\[X,\d+\]/, "[X]") - return "[X] ||| #{source_string} ||| #{target_string} ||| NewRule=1 ||| #{astr}" - end - - def is_terminal? - #return false if @source.size>1 - #return false if @target.size>1 - @source.each { |i| return false if !i.is_a? Range } - @target.each { |i| return false if !i.is_a? Range } - return true - end - - # check if other_rule is a part of self - def mergeable_with? other_rule - return false if !other_rule.is_terminal? - other_source_begin = other_rule.source.first.first - other_source_end = other_rule.source.first.last - other_target_begin = other_rule.target.first.first - other_target_end = other_rule.target.first.last - b = false - @source.each { |i| - next if !i.is_a? Range - if ( other_source_begin >= i.first \ - && other_source_end <= i.last \ - && (!(other_source_begin==i.first && other_source_end==i.last))) - b = true - break - end - } - return false if !b - @target.each { |i| - next if !i.is_a? Range - if ( other_target_begin >= i.first \ - && other_target_end <= i.last \ - && (!(other_target_begin==i.first && other_target_end==i.last))) - b = true - break - end - } - - return b - end - - def self.split a, b, index=0, p="target" - return "[NEWX,#{index}]"if (a==b) - - aa = a.to_a - begin_split = b.first - end_split = b.last - - p1 = aa[0..aa.index([begin_split-1,aa.first].max)] - p2 = aa[aa.index([end_split+1, aa.last].min)..aa.last] - - nt = "[NEWX,#{index}]" - - ret = nil - if begin_split > a.first && end_split < a.last - ret = [(p1.first..p1.last), nt, (p2.first..p2.last)] - elsif begin_split == a.first - ret = [nt, (p2.first..p2.last)] - elsif end_split == a.last - ret = [(p1.first..p1.last), nt] - end - - return ret - end - - def self.merge r, s - return nil if !r.mergeable_with? s - return nil if !s.is_terminal? - - other_source_begin = s.source.first.first - other_source_end = s.source.first.last - other_target_begin = s.target.first.first - other_target_end = s.target.first.last - - new_rule = Rule.new - new_rule.source_context = r.source_context - new_rule.target_context = r.target_context - new_rule.arity = r.arity+1 - new_rule.alignment = Array.new - r.alignment.each { |p| new_rule.alignment << Array.new(p) } # deep copy - - c = new_rule.arity - done = false - correct_src = 0 - r.source.each_with_index { |i,j| - if i.is_a? Range - if ( !done \ - && other_source_begin >= i.first \ - && other_source_end <= i.last) - new_rule.source << Rule.split(i, (other_source_begin..other_source_end), c, "source") - new_rule.source.flatten! - done = true - else - new_rule.source << i - end - else - new_rule.source << i - end - } - # relabel Xs (linear) - switch = false - k = 1 - new_rule.source.each_with_index { |i,j| - if i.is_a? String - m = i.match(/\[(X|NEWX),(\d+)\]/) - n = m[1] - l = m[2].to_i - if k != l - switch = true - end - new_rule.source[j] = "[#{n},#{k}]" - k += 1 - end - } - puts "switch #{switch}" if DEBUG - done = false - correct_tgt = 0 - r.target.each_with_index { |i,j| - if i.is_a? Range - if ( !done \ - && other_target_begin >= i.first \ - && other_target_end <= i.last) - new_rule.target << Rule.split(i, (other_target_begin..other_target_end), c) - new_rule.target.flatten! 
- done = true - else - new_rule.target << i - end - else - new_rule.target << i - reorder = true - end - } - - correct_src = r.len_src-new_rule.len_src - correct_tgt = r.len_tgt-new_rule.len_tgt - puts "correct_src #{correct_src}" - puts "correct_tgt #{correct_tgt}" - - start_correct_source = nil - j = 0 - fl = [] - new_rule.source.each { |i| - if i.is_a? Range - fl << new_rule.source_context[i] - else - if i.match(/\[NEWX,\d+\]/) - puts "j = #{j}" - start_correct_source = j - end - fl << i - end - j += 1 - } - fl.flatten! - - start_correct_target = nil - j = 0 - fl.each { |i| - if i.match(/\[NEWX,\d+\]/) - puts "j = #{j}" - start_correct_source = j - break - end - j += 1 - } - - el = [] - new_rule.target.each { |i| - if i.is_a? Range - el << new_rule.target_context[i] - else - el << i - end - j += 1 - } - el.flatten! - - start_correct_target = nil - j = 0 - el.each { |i| - if i.match(/\[NEWX,\d+\]/) - puts "j = #{j}" - start_correct_target = j - break - end - j += 1 - } - - puts "start_correct_source = #{start_correct_source}" - puts "start_correct_target = #{start_correct_target}" - - new_rule.base_alignment2 correct_src, correct_tgt, start_correct_source, start_correct_target - puts "not uniq #{new_rule.alignment.to_s}" - new_rule.alignment.uniq! - - puts "a before: #{new_rule.alignment.to_s}" - puts fl.to_s - new_rule.alignment.reject! { |p| - !fl[p.first] || !el[p.last] || fl[p.first].match(/\[(NEWX|X),\d+\]/) || el[p.last].match(/\[(NEWX|X),\d+\]/) - } - puts "a after: #{new_rule.alignment.to_s}" - puts "old len_src #{r.len_src}" - puts "new len_src #{new_rule.len_src}" - puts "old len_tgt #{r.len_tgt}" - puts "new len_tgt #{new_rule.len_tgt}" - - if switch - new_rule.target.each_with_index { |i,j| - if i.is_a? String - m = i.match(/\[(X|NEWX),(\d+)\]/) - n = m[1] - k = m[2].to_i - l = nil - if k == 1 - l = 2 - else # 2 - l = 1 - end - new_rule.target[j] = "[#{n},#{l}]" - end - } - end - - new_rule.source.each_with_index { |i,j| - if i.is_a?(String) && i.match(/\[NEWX,\d\]/) - i.gsub!(/NEWX/, "X") - end - } - new_rule.target.each_with_index { |i,j| - if i.is_a?(String) && i.match(/\[NEWX,\d\]/) - i.gsub!(/NEWX/, "X") - end - } - - return new_rule - end - - def expand_fake_alignment - new_alignment = [] - if DEBUG - puts @alignment.to_s - puts @source.to_s - puts @target.to_s - end - fl = @source.map { |i| - if i.is_a? Range - @source_context[i].map{|x|x.split} - else - i - end - }.flatten 1 - el = @target.map { |i| - if i.is_a? Range - @target_context[i].map{|x|x.split} - else - i - end - }.flatten 1 - if DEBUG - puts fl.to_s - puts el.to_s - puts "->" - end - - offsets_src = {} - #offsets_src.default = 0 - o = 0 - fl.each_with_index { |i,j| - if i.is_a? Array - o += i.size-1 - end - offsets_src[j] = o - } - offsets_tgt = {} - #offsets_tgt.default = 0 - o = 0 - el.each_with_index { |i,j| - if i.is_a? Array - o += i.size-1 - end - offsets_tgt[j] = o - } - - @alignment.each { |p| - if DEBUG - puts p.to_s - puts "#{offsets_src[p.first]} -- #{offsets_tgt[p.last]}" - end - new_alignment << [ p.first+offsets_src[p.first], p.last+offsets_tgt[p.last] ] - if DEBUG - puts new_alignment.last.to_s - puts "---" - puts - end - } - @alignment = new_alignment - end - -end - -def PhrasePhraseExtraction.has_alignment a, i, dir="src" - index = 0 - index = 1 if dir=="tgt" - a.each { |p| - return true if p[index]==i - } - return false -end - -def PhrasePhraseExtraction.extract fstart, fend, estart, eend, f, e, a, flen, elen - a.each { |p| - fi=p[0]; ei=p[1] - if (fstart..fend).include? 
fi - if eieend - return [] - end - end - if (estart..eend).include? ei - if fifend - return [] - end - end - - } - rules = [] - fs = fstart - loop do - fe = fend - loop do - rules << Rule.new(fs..fe, estart..eend, f, e) - a.each { |p| - if (fs..fe).include?(p.first) - rules.last.alignment << p - end - } - rules.last.base_alignment - fe += 1 - break if has_alignment(a, fe, "tgt")||fe>=elen - end - fs -= 1 - break has_alignment(a, fs, "src")||fs<0 - end - - return rules -end - -def PhrasePhraseExtraction.make_gappy_rules rules, seed_rules - MAX_NT.times { - new_rules = [] - rules.each { |r| - seed_rules.each { |s| - if r.mergeable_with? s - new = Rule.merge r, s - new_rules << new - puts "#{r.to_s} <<< #{s.to_s}" if DEBUG - puts " = #{new.to_s}\n\n" if DEBUG - end - } - } - rules += new_rules - } - - return rules -end - -def PhrasePhraseExtraction.make_seed_rules a, e, f - rules = [] - (0..e.size-1).each { |estart| - (estart..e.size-1).each { |eend| - - fstart = f.size-1 - fend = 0 - a.each { |p| - fi=p[0]; ei=p[1] - if estart<=ei && ei<=eend - fstart = [fi, fstart].min - fend = [fi, fend].max - end - } - next if fstart>fend - puts "fstart #{fstart}, fend #{fend}, estart #{estart}, eend #{eend}" if DEBUG - new_rules = extract fstart, fend, estart, eend, f, e, a, f.size, e.size - new_rules.each { |r| - puts r.to_s if DEBUG - } - rules += new_rules - } - } - - return rules -end - -def PhrasePhraseExtraction.test - # 0 1 2 3 - # a b c d - # w x y z - # 0-0 - # 1-3 - # 2-2 - # 3-1 - ra = Rule.new - rb = Rule.new - ra.source = [(0..2), "[X,1]"] - ra.target = [(0..0), "[X,1]", (2..3)] - ra.source_context = ["a", "b", "c", "d"] - ra.target_context = ["w", "x", "y", "z"] - ra.alignment = [[0,0],[1,3],[2,2]] - ra.arity = 1 - rb.source = [(1..1)] - rb.target = [(3..3)] - rb.source_context = ["a", "b", "c", "d"] - rb.target_context = ["w", "x", "y", "z"] - rb.alignment = [[0,0]] - rb.arity = 0 - - puts ra.mergeable_with? rb - nr = Rule.merge ra, rb - puts ra.to_s - puts rb.to_s - puts nr.to_s -end - -def PhrasePhraseExtraction.test_phrase - ra = Rule.new - rb = Rule.new - ra.source = [(0..2), "[X,1]"] - ra.target = [(0..0), "[X,1]", (2..3)] - ra.source_context = ["a a", "b b", "c c", "d d"] - ra.target_context = ["w w", "x x", "y y", "z z"] - ra.alignment = [[0,0],[1,3],[2,2]] - #ra.expand_fake_alignment - ra.arity = 1 - rb.source = [(1..1)] - rb.target = [(3..3)] - rb.source_context = ra.source_context - rb.target_context = rb.source_context - rb.alignment = [[0,0]] - #rb.expand_fake_alignment - rb.arity = 0 - - puts ra.mergeable_with? rb - nr = Rule.merge ra, rb - puts ra.to_s - puts rb.to_s - nr.expand_fake_alignment - puts nr.to_s -end - -def PhrasePhraseExtraction.test_phrase2 - source_context = ["a", "b", "c", "Blechbänder", ", besteht", "der Spreizdorn im wesentlichen", "aus", "x"] - target_context = ["w", "x", "y", "the expansion", "mandrel consists", "essentially of expansion mandrel", "z"] - - ra = Rule.new - ra.source = ["[X,1]", (3..6)] - ra.target = ["[X,1]", (3..5)] - ra.source_context = source_context - ra.target_context = target_context - ra.alignment = [[1,1],[2,2],[3,3],[4,2]] - ra.arity = 1 - - rb = Rule.new - rb.source = [(4..6)] - rb.target = [(4..5)] - rb.source_context = source_context - rb.target_context = target_context - rb.alignment = [[0,0],[1,1],[2,0]] - rb.arity = 0 - - puts ra.mergeable_with? 
rb - nr = Rule.merge ra, rb - puts ra.to_s - puts rb.to_s - nr.expand_fake_alignment - puts nr.to_s -end - -def PhrasePhraseExtraction.extract_rules f, e, as, expand=false - a = [] - as.each { |p| - x,y = p.split "-" - x = x.to_i; y = y.to_i - a << [x,y] - } - rules = PhrasePhraseExtraction.make_seed_rules a, e,f - seed_rules = PhrasePhraseExtraction.remove_too_large_seed_phrases rules - rules = PhrasePhraseExtraction.make_gappy_rules rules, seed_rules - - if PhrasePhraseExtraction::FORBID_SRC_ADJACENT_SRC_NT - rules = PhrasePhraseExtraction.remove_adj_nt rules - end - - rules = PhrasePhraseExtraction.remove_too_long_src_sides rules - - if expand - rules.each { |r| r.expand_fake_alignment } - end - - return rules.uniq -end - -def PhrasePhraseExtraction.remove_too_large_seed_phrases rules - return rules.reject { |r| - src_len, tgt_len = r.len - src_len>PhrasePhraseExtraction::MAX_SEED_NUM_WORDS \ - || tgt_len>PhrasePhraseExtraction::MAX_SEED_NUM_WORDS } -end - -def PhrasePhraseExtraction.remove_adj_nt rules - return rules.reject { |r| - b = false - prev = false - r.source.each { |i| - if i.is_a? String - if prev - b = true - break - end - prev = true - else - prev = false - end - } - b - } -end - -def PhrasePhraseExtraction.remove_too_long_src_sides rules - return rules.reject { |r| - r.len.first > PhrasePhraseExtraction::MAX_SRC_SZ - } -end - -end # module - -def main - file = ReadFile.new ARGV[0] - - f = file.gets.split - e = file.gets.split - a = [] - file.gets.split.each { |p| - x,y = p.split "-" - x = x.to_i; y = y.to_i - a << [x,y] - } - rules = PhrasePhraseExtraction.make_seed_rules a, e, f - seed_rules = PhrasePhraseExtraction.remove_too_large_seed_phrases rules - rules = PhrasePhraseExtraction.make_gappy_rules rules, seed_rules - - if PhrasePhraseExtraction::FORBID_SRC_ADJACENT_SRC_NT - rules = PhrasePhraseExtraction.remove_adj_nt rules - end - - rules = PhrasePhraseExtraction.remove_too_long_src_sides rules - - rules.uniq! - - rules.each { |r| - puts r.as_trule_string - } -end -#main - -def test - #PhrasePhraseExtraction.test - #PhrasePhraseExtraction.test_phrase - PhrasePhraseExtraction.test_phrase2 -end -test - diff --git a/server.rb b/server.rb index 9ad49f3..f976ee6 100755 --- a/server.rb +++ b/server.rb @@ -9,7 +9,7 @@ require 'json' require 'haml' require 'uri' require_relative './derivation_to_json/derivation_to_json' -require_relative './phrase_alignment/phrase_alignment' +require_relative './phrase2_extraction/phrase2_extraction' # ############################################################################# # Load configuration file and setup global variables -- cgit v1.2.3
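
A quick usage sketch for the relocated extraction module, mirroring the bundled phrase2_extraction/example.txt. This is not part of the patch itself; it assumes the zipf gem is installed and that phrase2_extraction/phrase2_extraction.rb has already been loaded (e.g. via the require_relative call added to server.rb above):

    # Rebuild the example.txt input: source tokens, target tokens, and
    # word alignments given as "source-target" index pairs.
    f = %w[a b c d]
    e = %w[w x y z]
    a = %w[0-1 1-0 2-2 3-3]
    # extract_rules returns the unique Rule objects; as_trule_string prints
    # each in the "[X] ||| src ||| tgt ||| NewRule=1 ||| alignment" format.
    rules = PhrasePhraseExtraction.extract_rules f, e, a
    rules.each { |r| puts r.as_trule_string }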