From e26434979adc33bd949566ba7bf02dff64e80a3e Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 2 Oct 2012 00:19:43 -0400 Subject: cdec cleanup, remove bayesian stuff, parsing stuff --- gi/posterior-regularisation/Corpus.java | 167 -------------------------------- 1 file changed, 167 deletions(-) delete mode 100644 gi/posterior-regularisation/Corpus.java (limited to 'gi/posterior-regularisation/Corpus.java') diff --git a/gi/posterior-regularisation/Corpus.java b/gi/posterior-regularisation/Corpus.java deleted file mode 100644 index 07b27387..00000000 --- a/gi/posterior-regularisation/Corpus.java +++ /dev/null @@ -1,167 +0,0 @@ -import gnu.trove.TIntArrayList; - -import java.io.*; -import java.util.*; -import java.util.regex.Pattern; - -public class Corpus -{ - private Lexicon tokenLexicon = new Lexicon(); - private Lexicon phraseLexicon = new Lexicon(); - private Lexicon contextLexicon = new Lexicon(); - private List edges = new ArrayList(); - private List> phraseToContext = new ArrayList>(); - private List> contextToPhrase = new ArrayList>(); - - public class Edge - { - Edge(int phraseId, int contextId, int count) - { - this.phraseId = phraseId; - this.contextId = contextId; - this.count = count; - } - public int getPhraseId() - { - return phraseId; - } - public TIntArrayList getPhrase() - { - return phraseLexicon.lookup(phraseId); - } - public String getPhraseString() - { - StringBuffer b = new StringBuffer(); - for (int tid: getPhrase().toNativeArray()) - { - if (b.length() > 0) - b.append(" "); - b.append(tokenLexicon.lookup(tid)); - } - return b.toString(); - } - public int getContextId() - { - return contextId; - } - public TIntArrayList getContext() - { - return contextLexicon.lookup(contextId); - } - public String getContextString() - { - StringBuffer b = new StringBuffer(); - for (int tid: getContext().toNativeArray()) - { - if (b.length() > 0) - b.append(" "); - b.append(tokenLexicon.lookup(tid)); - } - return b.toString(); - } - public int getCount() - { - return count; - } - private int phraseId; - private int contextId; - private int count; - } - - List getEdges() - { - return edges; - } - - int getNumEdges() - { - return edges.size(); - } - - int getNumPhrases() - { - return phraseLexicon.size(); - } - - List getEdgesForPhrase(int phraseId) - { - return phraseToContext.get(phraseId); - } - - int getNumContexts() - { - return contextLexicon.size(); - } - - List getEdgesForContext(int contextId) - { - return contextToPhrase.get(contextId); - } - - int getNumTokens() - { - return tokenLexicon.size(); - } - - static Corpus readFromFile(Reader in) throws IOException - { - Corpus c = new Corpus(); - - // read in line-by-line - BufferedReader bin = new BufferedReader(in); - String line; - Pattern separator = Pattern.compile(" \\|\\|\\| "); - - while ((line = bin.readLine()) != null) - { - // split into phrase and contexts - StringTokenizer st = new StringTokenizer(line, "\t"); - assert (st.hasMoreTokens()); - String phraseToks = st.nextToken(); - assert (st.hasMoreTokens()); - String rest = st.nextToken(); - assert (!st.hasMoreTokens()); - - // process phrase - st = new StringTokenizer(phraseToks, " "); - TIntArrayList ptoks = new TIntArrayList(); - while (st.hasMoreTokens()) - ptoks.add(c.tokenLexicon.insert(st.nextToken())); - int phraseId = c.phraseLexicon.insert(ptoks); - if (phraseId == c.phraseToContext.size()) - c.phraseToContext.add(new ArrayList()); - - // process contexts - String[] parts = separator.split(rest); - assert (parts.length % 2 == 0); - for (int i = 0; i < parts.length; i += 2) - { - // process pairs of strings - context and count - TIntArrayList ctx = new TIntArrayList(); - String ctxString = parts[i]; - String countString = parts[i + 1]; - StringTokenizer ctxStrtok = new StringTokenizer(ctxString, " "); - while (ctxStrtok.hasMoreTokens()) - { - String token = ctxStrtok.nextToken(); - if (!token.equals("")) - ctx.add(c.tokenLexicon.insert(token)); - } - int contextId = c.contextLexicon.insert(ctx); - if (contextId == c.contextToPhrase.size()) - c.contextToPhrase.add(new ArrayList()); - - assert (countString.startsWith("C=")); - Edge e = c.new Edge(phraseId, contextId, - Integer.parseInt(countString.substring(2).trim())); - c.edges.add(e); - - // index the edge for fast phrase, context lookup - c.phraseToContext.get(phraseId).add(e); - c.contextToPhrase.get(contextId).add(e); - } - } - - return c; - } -} -- cgit v1.2.3