summaryrefslogtreecommitdiff
path: root/gi/posterior-regularisation/Corpus.java
diff options
context:
space:
mode:
authorChris Dyer <cdyer@cab.ark.cs.cmu.edu>2012-10-02 00:19:43 -0400
committerChris Dyer <cdyer@cab.ark.cs.cmu.edu>2012-10-02 00:19:43 -0400
commite26434979adc33bd949566ba7bf02dff64e80a3e (patch)
treed1c72495e3af6301bd28e7e66c42de0c7a944d1f /gi/posterior-regularisation/Corpus.java
parent0870d4a1f5e14cc7daf553b180d599f09f6614a2 (diff)
cdec cleanup, remove bayesian stuff, parsing stuff
Diffstat (limited to 'gi/posterior-regularisation/Corpus.java')
-rw-r--r--gi/posterior-regularisation/Corpus.java167
1 files changed, 0 insertions, 167 deletions
diff --git a/gi/posterior-regularisation/Corpus.java b/gi/posterior-regularisation/Corpus.java
deleted file mode 100644
index 07b27387..00000000
--- a/gi/posterior-regularisation/Corpus.java
+++ /dev/null
@@ -1,167 +0,0 @@
-import gnu.trove.TIntArrayList;
-
-import java.io.*;
-import java.util.*;
-import java.util.regex.Pattern;
-
-public class Corpus
-{
- private Lexicon<String> tokenLexicon = new Lexicon<String>();
- private Lexicon<TIntArrayList> phraseLexicon = new Lexicon<TIntArrayList>();
- private Lexicon<TIntArrayList> contextLexicon = new Lexicon<TIntArrayList>();
- private List<Edge> edges = new ArrayList<Edge>();
- private List<List<Edge>> phraseToContext = new ArrayList<List<Edge>>();
- private List<List<Edge>> contextToPhrase = new ArrayList<List<Edge>>();
-
- public class Edge
- {
- Edge(int phraseId, int contextId, int count)
- {
- this.phraseId = phraseId;
- this.contextId = contextId;
- this.count = count;
- }
- public int getPhraseId()
- {
- return phraseId;
- }
- public TIntArrayList getPhrase()
- {
- return phraseLexicon.lookup(phraseId);
- }
- public String getPhraseString()
- {
- StringBuffer b = new StringBuffer();
- for (int tid: getPhrase().toNativeArray())
- {
- if (b.length() > 0)
- b.append(" ");
- b.append(tokenLexicon.lookup(tid));
- }
- return b.toString();
- }
- public int getContextId()
- {
- return contextId;
- }
- public TIntArrayList getContext()
- {
- return contextLexicon.lookup(contextId);
- }
- public String getContextString()
- {
- StringBuffer b = new StringBuffer();
- for (int tid: getContext().toNativeArray())
- {
- if (b.length() > 0)
- b.append(" ");
- b.append(tokenLexicon.lookup(tid));
- }
- return b.toString();
- }
- public int getCount()
- {
- return count;
- }
- private int phraseId;
- private int contextId;
- private int count;
- }
-
- List<Edge> getEdges()
- {
- return edges;
- }
-
- int getNumEdges()
- {
- return edges.size();
- }
-
- int getNumPhrases()
- {
- return phraseLexicon.size();
- }
-
- List<Edge> getEdgesForPhrase(int phraseId)
- {
- return phraseToContext.get(phraseId);
- }
-
- int getNumContexts()
- {
- return contextLexicon.size();
- }
-
- List<Edge> getEdgesForContext(int contextId)
- {
- return contextToPhrase.get(contextId);
- }
-
- int getNumTokens()
- {
- return tokenLexicon.size();
- }
-
- static Corpus readFromFile(Reader in) throws IOException
- {
- Corpus c = new Corpus();
-
- // read in line-by-line
- BufferedReader bin = new BufferedReader(in);
- String line;
- Pattern separator = Pattern.compile(" \\|\\|\\| ");
-
- while ((line = bin.readLine()) != null)
- {
- // split into phrase and contexts
- StringTokenizer st = new StringTokenizer(line, "\t");
- assert (st.hasMoreTokens());
- String phraseToks = st.nextToken();
- assert (st.hasMoreTokens());
- String rest = st.nextToken();
- assert (!st.hasMoreTokens());
-
- // process phrase
- st = new StringTokenizer(phraseToks, " ");
- TIntArrayList ptoks = new TIntArrayList();
- while (st.hasMoreTokens())
- ptoks.add(c.tokenLexicon.insert(st.nextToken()));
- int phraseId = c.phraseLexicon.insert(ptoks);
- if (phraseId == c.phraseToContext.size())
- c.phraseToContext.add(new ArrayList<Edge>());
-
- // process contexts
- String[] parts = separator.split(rest);
- assert (parts.length % 2 == 0);
- for (int i = 0; i < parts.length; i += 2)
- {
- // process pairs of strings - context and count
- TIntArrayList ctx = new TIntArrayList();
- String ctxString = parts[i];
- String countString = parts[i + 1];
- StringTokenizer ctxStrtok = new StringTokenizer(ctxString, " ");
- while (ctxStrtok.hasMoreTokens())
- {
- String token = ctxStrtok.nextToken();
- if (!token.equals("<PHRASE>"))
- ctx.add(c.tokenLexicon.insert(token));
- }
- int contextId = c.contextLexicon.insert(ctx);
- if (contextId == c.contextToPhrase.size())
- c.contextToPhrase.add(new ArrayList<Edge>());
-
- assert (countString.startsWith("C="));
- Edge e = c.new Edge(phraseId, contextId,
- Integer.parseInt(countString.substring(2).trim()));
- c.edges.add(e);
-
- // index the edge for fast phrase, context lookup
- c.phraseToContext.get(phraseId).add(e);
- c.contextToPhrase.get(contextId).add(e);
- }
- }
-
- return c;
- }
-}