From 9f34a708e6eb14362788199fdb736aae557aca38 Mon Sep 17 00:00:00 2001 From: "trevor.cohn" Date: Tue, 29 Jun 2010 17:34:16 +0000 Subject: New files git-svn-id: https://ws10smt.googlecode.com/svn/trunk@59 ec762483-ff6d-05da-a07a-a48fb63a330f --- gi/posterior-regularisation/Corpus.java | 183 +++++++++++++++++++++++++++++++ gi/posterior-regularisation/Lexicon.java | 32 ++++++ 2 files changed, 215 insertions(+) create mode 100644 gi/posterior-regularisation/Corpus.java create mode 100644 gi/posterior-regularisation/Lexicon.java (limited to 'gi/posterior-regularisation') diff --git a/gi/posterior-regularisation/Corpus.java b/gi/posterior-regularisation/Corpus.java new file mode 100644 index 00000000..047e6ee8 --- /dev/null +++ b/gi/posterior-regularisation/Corpus.java @@ -0,0 +1,183 @@ +import gnu.trove.TIntArrayList; + +import java.io.*; +import java.util.*; +import java.util.regex.Pattern; + +public class Corpus +{ + private Lexicon tokenLexicon = new Lexicon(); + private Lexicon ngramLexicon = new Lexicon(); + private List edges = new ArrayList(); + private Map> phraseToContext = new HashMap>(); + private Map> contextToPhrase = new HashMap>(); + + public class Ngram + { + private Ngram(int id) + { + ngramId = id; + } + public int getId() + { + return ngramId; + } + public TIntArrayList getTokenIds() + { + return ngramLexicon.lookup(ngramId); + } + public String toString() + { + StringBuffer b = new StringBuffer(); + for (int tid: getTokenIds().toNativeArray()) + { + if (b.length() > 0) + b.append(" "); + b.append(tokenLexicon.lookup(tid)); + } + return b.toString(); + } + public int hashCode() + { + return ngramId; + } + public boolean equals(Object other) + { + return other instanceof Ngram && ngramId == ((Ngram) other).ngramId; + } + private int ngramId; + } + + public class Edge + { + Edge(Ngram phrase, Ngram context, int count) + { + this.phrase = phrase; + this.context = context; + this.count = count; + } + public Ngram getPhrase() + { + return phrase; + } + public Ngram getContext() + { + return context; + } + public int getCount() + { + return count; + } + private Ngram phrase; + private Ngram context; + private int count; + } + + List getEdges() + { + return edges; + } + + int numEdges() + { + return edges.size(); + } + + Set getPhrases() + { + return phraseToContext.keySet(); + } + + List getEdgesForPhrase(Ngram phrase) + { + return phraseToContext.get(phrase); + } + + Set getContexts() + { + return contextToPhrase.keySet(); + } + + List getEdgesForContext(Ngram context) + { + return contextToPhrase.get(context); + } + + int numTokens() + { + return tokenLexicon.size(); + } + + static Corpus readFromFile(Reader in) throws IOException + { + Corpus c = new Corpus(); + + // read in line-by-line + BufferedReader bin = new BufferedReader(in); + String line; + Pattern separator = Pattern.compile(" \\|\\|\\| "); + + while ((line = bin.readLine()) != null) + { + // split into phrase and contexts + StringTokenizer st = new StringTokenizer(line, "\t"); + assert (st.hasMoreTokens()); + String phraseToks = st.nextToken(); + assert (st.hasMoreTokens()); + String rest = st.nextToken(); + assert (!st.hasMoreTokens()); + + // process phrase + st = new StringTokenizer(phraseToks, " "); + TIntArrayList ptoks = new TIntArrayList(); + while (st.hasMoreTokens()) + ptoks.add(c.tokenLexicon.insert(st.nextToken())); + int phraseId = c.ngramLexicon.insert(ptoks); + Ngram phrase = c.new Ngram(phraseId); + + // process contexts + String[] parts = separator.split(rest); + assert (parts.length % 2 == 0); + for (int i = 0; i < parts.length; i += 2) + { + // process pairs of strings - context and count + TIntArrayList ctx = new TIntArrayList(); + String ctxString = parts[i]; + String countString = parts[i + 1]; + StringTokenizer ctxStrtok = new StringTokenizer(ctxString, " "); + while (ctxStrtok.hasMoreTokens()) + { + String token = ctxStrtok.nextToken(); + if (!token.equals("")) + ctx.add(c.tokenLexicon.insert(token)); + } + int contextId = c.ngramLexicon.insert(ctx); + Ngram context = c.new Ngram(contextId); + + assert (countString.startsWith("C=")); + Edge e = c.new Edge(phrase, context, Integer.parseInt(countString.substring(2).trim())); + c.edges.add(e); + + // index the edge for fast phrase lookup + List edges = c.phraseToContext.get(phrase); + if (edges == null) + { + edges = new ArrayList(); + c.phraseToContext.put(phrase, edges); + } + edges.add(e); + + // index the edge for fast context lookup + edges = c.contextToPhrase.get(context); + if (edges == null) + { + edges = new ArrayList(); + c.contextToPhrase.put(context, edges); + } + edges.add(e); + } + } + + return c; + } +} diff --git a/gi/posterior-regularisation/Lexicon.java b/gi/posterior-regularisation/Lexicon.java new file mode 100644 index 00000000..9f0245ee --- /dev/null +++ b/gi/posterior-regularisation/Lexicon.java @@ -0,0 +1,32 @@ +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +public class Lexicon +{ + public int insert(T word) + { + Integer i = wordToIndex.get(word); + if (i == null) + { + i = indexToWord.size(); + wordToIndex.put(word, i); + indexToWord.add(word); + } + return i; + } + + public T lookup(int index) + { + return indexToWord.get(index); + } + + public int size() + { + return indexToWord.size(); + } + + private Map wordToIndex = new HashMap(); + private List indexToWord = new ArrayList(); +} \ No newline at end of file -- cgit v1.2.3