summary | refs | log | tree | commit | diff
path: root/gi/posterior-regularisation/Corpus.java
diff options
context:
space:
mode:
Diffstat (limited to 'gi/posterior-regularisation/Corpus.java')
-rw-r--r-- gi/posterior-regularisation/Corpus.java 183
1 files changed, 183 insertions, 0 deletions
diff --git a/gi/posterior-regularisation/Corpus.java b/gi/posterior-regularisation/Corpus.java
new file mode 100644
index 00000000..047e6ee8
--- /dev/null
+++ b/gi/posterior-regularisation/Corpus.java
@@ -0,0 +1,183 @@
+import gnu.trove.TIntArrayList;
+
+import java.io.*;
+import java.util.*;
+import java.util.regex.Pattern;
+
+public class Corpus
+{
+ private Lexicon<String> tokenLexicon = new Lexicon<String>();
+ private Lexicon<TIntArrayList> ngramLexicon = new Lexicon<TIntArrayList>();
+ private List<Edge> edges = new ArrayList<Edge>();
+ private Map<Ngram,List<Edge>> phraseToContext = new HashMap<Ngram,List<Edge>>();
+ private Map<Ngram,List<Edge>> contextToPhrase = new HashMap<Ngram,List<Edge>>();
+
+ public class Ngram
+ {
+ private Ngram(int id)
+ {
+ ngramId = id;
+ }
+ public int getId()
+ {
+ return ngramId;
+ }
+ public TIntArrayList getTokenIds()
+ {
+ return ngramLexicon.lookup(ngramId);
+ }
+ public String toString()
+ {
+ StringBuffer b = new StringBuffer();
+ for (int tid: getTokenIds().toNativeArray())
+ {
+ if (b.length() > 0)
+ b.append(" ");
+ b.append(tokenLexicon.lookup(tid));
+ }
+ return b.toString();
+ }
+ public int hashCode()
+ {
+ return ngramId;
+ }
+ public boolean equals(Object other)
+ {
+ return other instanceof Ngram && ngramId == ((Ngram) other).ngramId;
+ }
+ private int ngramId;
+ }
+
+ public class Edge
+ {
+ Edge(Ngram phrase, Ngram context, int count)
+ {
+ this.phrase = phrase;
+ this.context = context;
+ this.count = count;
+ }
+ public Ngram getPhrase()
+ {
+ return phrase;
+ }
+ public Ngram getContext()
+ {
+ return context;
+ }
+ public int getCount()
+ {
+ return count;
+ }
+ private Ngram phrase;
+ private Ngram context;
+ private int count;
+ }
+
+ List<Edge> getEdges()
+ {
+ return edges;
+ }
+
+ int numEdges()
+ {
+ return edges.size();
+ }
+
+ Set<Ngram> getPhrases()
+ {
+ return phraseToContext.keySet();
+ }
+
+ List<Edge> getEdgesForPhrase(Ngram phrase)
+ {
+ return phraseToContext.get(phrase);
+ }
+
+ Set<Ngram> getContexts()
+ {
+ return contextToPhrase.keySet();
+ }
+
+ List<Edge> getEdgesForContext(Ngram context)
+ {
+ return contextToPhrase.get(context);
+ }
+
+ int numTokens()
+ {
+ return tokenLexicon.size();
+ }
+
+ static Corpus readFromFile(Reader in) throws IOException
+ {
+ Corpus c = new Corpus();
+
+ // read in line-by-line
+ BufferedReader bin = new BufferedReader(in);
+ String line;
+ Pattern separator = Pattern.compile(" \\|\\|\\| ");
+
+ while ((line = bin.readLine()) != null)
+ {
+ // split into phrase and contexts
+ StringTokenizer st = new StringTokenizer(line, "\t");
+ assert (st.hasMoreTokens());
+ String phraseToks = st.nextToken();
+ assert (st.hasMoreTokens());
+ String rest = st.nextToken();
+ assert (!st.hasMoreTokens());
+
+ // process phrase
+ st = new StringTokenizer(phraseToks, " ");
+ TIntArrayList ptoks = new TIntArrayList();
+ while (st.hasMoreTokens())
+ ptoks.add(c.tokenLexicon.insert(st.nextToken()));
+ int phraseId = c.ngramLexicon.insert(ptoks);
+ Ngram phrase = c.new Ngram(phraseId);
+
+ // process contexts
+ String[] parts = separator.split(rest);
+ assert (parts.length % 2 == 0);
+ for (int i = 0; i < parts.length; i += 2)
+ {
+ // process pairs of strings - context and count
+ TIntArrayList ctx = new TIntArrayList();
+ String ctxString = parts[i];
+ String countString = parts[i + 1];
+ StringTokenizer ctxStrtok = new StringTokenizer(ctxString, " ");
+ while (ctxStrtok.hasMoreTokens())
+ {
+ String token = ctxStrtok.nextToken();
+ if (!token.equals("<PHRASE>"))
+ ctx.add(c.tokenLexicon.insert(token));
+ }
+ int contextId = c.ngramLexicon.insert(ctx);
+ Ngram context = c.new Ngram(contextId);
+
+ assert (countString.startsWith("C="));
+ Edge e = c.new Edge(phrase, context, Integer.parseInt(countString.substring(2).trim()));
+ c.edges.add(e);
+
+ // index the edge for fast phrase lookup
+ List<Edge> edges = c.phraseToContext.get(phrase);
+ if (edges == null)
+ {
+ edges = new ArrayList<Edge>();
+ c.phraseToContext.put(phrase, edges);
+ }
+ edges.add(e);
+
+ // index the edge for fast context lookup
+ edges = c.contextToPhrase.get(context);
+ if (edges == null)
+ {
+ edges = new ArrayList<Edge>();
+ c.contextToPhrase.put(context, edges);
+ }
+ edges.add(e);
+ }
+ }
+
+ return c;
+ }
+}