diff options
author | trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-06-29 17:34:16 +0000 |
---|---|---|
committer | trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-06-29 17:34:16 +0000 |
commit | 3799d17514ed645c185a9cd3c14535ae4d152ac8 (patch) | |
tree | 65b3848f64b87154e1dd66a280d589c84dafb9ea /gi/posterior-regularisation | |
parent | 00758559aeb9277a4b6929f890ccd8a009deeab6 (diff) |
New files
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@59 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/posterior-regularisation')
-rw-r--r-- | gi/posterior-regularisation/Corpus.java | 183 | ||||
-rw-r--r-- | gi/posterior-regularisation/Lexicon.java | 32 |
2 files changed, 215 insertions, 0 deletions
// ==== gi/posterior-regularisation/Corpus.java ====

import gnu.trove.TIntArrayList;

import java.io.*;
import java.util.*;
import java.util.regex.Pattern;

/**
 * In-memory phrase/context co-occurrence corpus.
 *
 * Each input line has the form
 *   phrase \t context ||| C=count ||| context ||| C=count ...
 * Tokens are interned through a string lexicon, token-id sequences through an
 * ngram lexicon, and every (phrase, context, count) triple becomes an Edge
 * indexed both by phrase and by context for fast lookup in either direction.
 */
public class Corpus
{
    private Lexicon<String> tokenLexicon = new Lexicon<String>();
    private Lexicon<TIntArrayList> ngramLexicon = new Lexicon<TIntArrayList>();
    private List<Edge> edges = new ArrayList<Edge>();
    private Map<Ngram, List<Edge>> phraseToContext = new HashMap<Ngram, List<Edge>>();
    private Map<Ngram, List<Edge>> contextToPhrase = new HashMap<Ngram, List<Edge>>();

    /**
     * A token sequence interned in the enclosing corpus's ngram lexicon.
     * Identity is the lexicon id, so equal ids mean equal token sequences;
     * this makes Ngram safe to use as a HashMap key.
     */
    public class Ngram
    {
        private Ngram(int id)
        {
            ngramId = id;
        }

        /** Lexicon id of this ngram. */
        public int getId()
        {
            return ngramId;
        }

        /** Token ids of this ngram, resolved through the corpus's ngram lexicon. */
        public TIntArrayList getTokenIds()
        {
            return ngramLexicon.lookup(ngramId);
        }

        /** Space-separated surface form, resolved through the token lexicon. */
        @Override
        public String toString()
        {
            // StringBuilder: no synchronization needed for a method-local buffer
            // (the original used StringBuffer).
            StringBuilder b = new StringBuilder();
            for (int tid : getTokenIds().toNativeArray())
            {
                if (b.length() > 0)
                    b.append(" ");
                b.append(tokenLexicon.lookup(tid));
            }
            return b.toString();
        }

        @Override
        public int hashCode()
        {
            // The lexicon id uniquely identifies the token sequence.
            return ngramId;
        }

        @Override
        public boolean equals(Object other)
        {
            return other instanceof Ngram && ngramId == ((Ngram) other).ngramId;
        }

        private int ngramId;
    }

    /** A (phrase, context) co-occurrence with its observed count. */
    public class Edge
    {
        Edge(Ngram phrase, Ngram context, int count)
        {
            this.phrase = phrase;
            this.context = context;
            this.count = count;
        }

        public Ngram getPhrase()
        {
            return phrase;
        }

        public Ngram getContext()
        {
            return context;
        }

        public int getCount()
        {
            return count;
        }

        private Ngram phrase;
        private Ngram context;
        private int count;
    }

    /** All edges in file order. */
    List<Edge> getEdges()
    {
        return edges;
    }

    int numEdges()
    {
        return edges.size();
    }

    /** Distinct phrases seen in the corpus. */
    Set<Ngram> getPhrases()
    {
        return phraseToContext.keySet();
    }

    /** Edges whose phrase equals the argument, or null if the phrase is unknown. */
    List<Edge> getEdgesForPhrase(Ngram phrase)
    {
        return phraseToContext.get(phrase);
    }

    /** Distinct contexts seen in the corpus. */
    Set<Ngram> getContexts()
    {
        return contextToPhrase.keySet();
    }

    /** Edges whose context equals the argument, or null if the context is unknown. */
    List<Edge> getEdgesForContext(Ngram context)
    {
        return contextToPhrase.get(context);
    }

    /** Number of distinct surface tokens interned so far. */
    int numTokens()
    {
        return tokenLexicon.size();
    }

    /**
     * Parses a corpus from the given reader.
     *
     * @param in source of lines in the phrase \t context ||| C=count format
     * @return the populated corpus
     * @throws IOException on a read error or a malformed input line
     *         (the original used assert for validation, which is disabled
     *         unless the JVM runs with -ea; malformed input must fail loudly)
     */
    static Corpus readFromFile(Reader in) throws IOException
    {
        Corpus c = new Corpus();

        // read in line-by-line
        BufferedReader bin = new BufferedReader(in);
        String line;
        Pattern separator = Pattern.compile(" \\|\\|\\| ");

        while ((line = bin.readLine()) != null)
        {
            // split into phrase and contexts
            StringTokenizer st = new StringTokenizer(line, "\t");
            if (!st.hasMoreTokens())
                throw new IOException("Missing phrase field in line: " + line);
            String phraseToks = st.nextToken();
            if (!st.hasMoreTokens())
                throw new IOException("Missing context field in line: " + line);
            String rest = st.nextToken();
            if (st.hasMoreTokens())
                throw new IOException("Unexpected extra tab-separated field in line: " + line);

            // process phrase: intern each token, then intern the id sequence
            st = new StringTokenizer(phraseToks, " ");
            TIntArrayList ptoks = new TIntArrayList();
            while (st.hasMoreTokens())
                ptoks.add(c.tokenLexicon.insert(st.nextToken()));
            int phraseId = c.ngramLexicon.insert(ptoks);
            Ngram phrase = c.new Ngram(phraseId);

            // process contexts: alternating context / count fields
            String[] parts = separator.split(rest);
            if (parts.length % 2 != 0)
                throw new IOException("Expected (context, count) pairs in line: " + line);
            for (int i = 0; i < parts.length; i += 2)
            {
                // process pairs of strings - context and count
                TIntArrayList ctx = new TIntArrayList();
                String ctxString = parts[i];
                String countString = parts[i + 1];
                StringTokenizer ctxStrtok = new StringTokenizer(ctxString, " ");
                while (ctxStrtok.hasMoreTokens())
                {
                    String token = ctxStrtok.nextToken();
                    // the <PHRASE> placeholder marks the phrase's slot in the
                    // context and is not itself a context token
                    if (!token.equals("<PHRASE>"))
                        ctx.add(c.tokenLexicon.insert(token));
                }
                int contextId = c.ngramLexicon.insert(ctx);
                Ngram context = c.new Ngram(contextId);

                if (!countString.startsWith("C="))
                    throw new IOException("Malformed count field '" + countString + "' in line: " + line);
                Edge e = c.new Edge(phrase, context, Integer.parseInt(countString.substring(2).trim()));
                c.edges.add(e);

                // index the edge for fast phrase and context lookup
                addToIndex(c.phraseToContext, phrase, e);
                addToIndex(c.contextToPhrase, context, e);
            }
        }

        return c;
    }

    /** Appends e to index[key], creating the edge list on first use. */
    private static void addToIndex(Map<Ngram, List<Edge>> index, Ngram key, Edge e)
    {
        List<Edge> edges = index.get(key);
        if (edges == null)
        {
            edges = new ArrayList<Edge>();
            index.put(key, edges);
        }
        edges.add(e);
    }
}

// ==== gi/posterior-regularisation/Lexicon.java ====

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Bidirectional interner: maps values of type T to dense integer ids
 * (0, 1, 2, ...) in insertion order and back again.
 *
 * Not thread-safe. T must implement equals/hashCode consistently, since
 * values are deduplicated through a HashMap.
 */
public class Lexicon<T>
{
    /**
     * Returns the id for word, assigning the next free id on first sight.
     *
     * @param word value to intern
     * @return its stable integer id
     */
    public int insert(T word)
    {
        Integer i = wordToIndex.get(word);
        if (i == null)
        {
            i = indexToWord.size();
            wordToIndex.put(word, i);
            indexToWord.add(word);
        }
        return i;
    }

    /**
     * Returns the value with the given id.
     *
     * @throws IndexOutOfBoundsException if the id was never assigned
     */
    public T lookup(int index)
    {
        return indexToWord.get(index);
    }

    /** Number of distinct values interned so far. */
    public int size()
    {
        return indexToWord.size();
    }

    private Map<T, Integer> wordToIndex = new HashMap<T, Integer>();
    private List<T> indexToWord = new ArrayList<T>();
}
\ No newline at end of file |