cdec cleanup, remove bayesian stuff, parsing stuff

author: Chris Dyer <cdyer@cab.ark.cs.cmu.edu> 2012-10-02 00:19:43 -0400
committer: Chris Dyer <cdyer@cab.ark.cs.cmu.edu> 2012-10-02 00:19:43 -0400
commit: e26434979adc33bd949566ba7bf02dff64e80a3e (patch)
tree: d1c72495e3af6301bd28e7e66c42de0c7a944d1f /gi/posterior-regularisation/prjava/src/phrase/Corpus.java
parent: 0870d4a1f5e14cc7daf553b180d599f09f6614a2 (diff)
1 files changed, 0 insertions, 288 deletions
diff --git a/gi/posterior-regularisation/prjava/src/phrase/Corpus.java b/gi/posterior-regularisation/prjava/src/phrase/Corpus.java
deleted file mode 100644
index 4b1939cd..00000000
--- a/gi/posterior-regularisation/prjava/src/phrase/Corpus.java
+++ /dev/null
@@ -1,288 +0,0 @@
-package phrase;
-
-import gnu.trove.TIntArrayList;
-
-import java.io.*;
-import java.util.*;
-import java.util.regex.Pattern;
-
-
-public class Corpus
-{
-	private Lexicon<String> wordLexicon = new Lexicon<String>();
-	private Lexicon<TIntArrayList> phraseLexicon = new Lexicon<TIntArrayList>();
-	private Lexicon<TIntArrayList> contextLexicon = new Lexicon<TIntArrayList>();
-	private List<Edge> edges = new ArrayList<Edge>();
-	private List<List<Edge>> phraseToContext = new ArrayList<List<Edge>>();
-	private List<List<Edge>> contextToPhrase = new ArrayList<List<Edge>>();
-	public int splitSentinel;
-	public int phraseSentinel;
-	public int rareSentinel;
-
-	public Corpus()
-	{
-		splitSentinel = wordLexicon.insert("<SPLIT>");
-		phraseSentinel = wordLexicon.insert("<PHRASE>");		
-		rareSentinel = wordLexicon.insert("<RARE>");
-	}
-	
-	public class Edge
-	{
-		
-		Edge(int phraseId, int contextId, double count,int tag)
-		{
-			this.phraseId = phraseId;
-			this.contextId = contextId;
-			this.count = count;
-			fixTag=tag;
-		}
-		
-		Edge(int phraseId, int contextId, double count)
-		{
-			this.phraseId = phraseId;
-			this.contextId = contextId;
-			this.count = count;
-			fixTag=-1;
-		}
-		public int getTag(){
-			return fixTag;
-		}
-		
-		public int getPhraseId()
-		{
-			return phraseId;
-		}
-		public TIntArrayList getPhrase()
-		{
-			return Corpus.this.getPhrase(phraseId);
-		}
-		public String getPhraseString()
-		{
-			return Corpus.this.getPhraseString(phraseId);
-		}		
-		public int getContextId()
-		{
-			return contextId;
-		}
-		public TIntArrayList getContext()
-		{
-			return Corpus.this.getContext(contextId);
-		}
-		public String getContextString(boolean insertPhraseSentinel)
-		{
-			return Corpus.this.getContextString(contextId, insertPhraseSentinel);
-		}
-		public double getCount()
-		{
-			return count;
-		}
-		public boolean equals(Object other)
-		{
-			if (other instanceof Edge) 
-			{
-				Edge oe = (Edge) other;
-				return oe.phraseId == phraseId && oe.contextId == contextId; 
-			}
-			else return false;
-		}
-		public int hashCode()
-		{   // this is how boost's hash_combine does it
-			int seed = phraseId;
-			seed ^= contextId + 0x9e3779b9 + (seed << 6) + (seed >> 2);
-			return seed;
-		}
-		public String toString()
-		{
-			return getPhraseString() + "\t" + getContextString(true);
-		}
-		
-		private int phraseId;
-		private int contextId;
-		private double count;
-		private int fixTag;
-	}
-
-	List<Edge> getEdges()
-	{
-		return edges;
-	}
-	
-	int getNumEdges()
-	{
-		return edges.size();
-	}
-
-	int getNumPhrases()
-	{
-		return phraseLexicon.size();
-	}
-	
-	int getNumContextPositions()
-	{
-		return contextLexicon.lookup(0).size();
-	}
-	
-	List<Edge> getEdgesForPhrase(int phraseId)
-	{
-		return phraseToContext.get(phraseId);
-	}
-	
-	int getNumContexts()
-	{
-		return contextLexicon.size();
-	}
-	
-	List<Edge> getEdgesForContext(int contextId)
-	{
-		return contextToPhrase.get(contextId);
-	}
-	
-	int getNumWords()
-	{
-		return wordLexicon.size();
-	}
-	
-	String getWord(int wordId)
-	{
-		return wordLexicon.lookup(wordId);
-	}
-	
-	public TIntArrayList getPhrase(int phraseId)
-	{
-		return phraseLexicon.lookup(phraseId);
-	}
-	
-	public String getPhraseString(int phraseId)
-	{
-		StringBuffer b = new StringBuffer();
-		for (int tid: getPhrase(phraseId).toNativeArray())
-		{
-			if (b.length() > 0)
-				b.append(" ");
-			b.append(wordLexicon.lookup(tid));
-		}
-		return b.toString();
-	}		
-	
-	public TIntArrayList getContext(int contextId)
-	{
-		return contextLexicon.lookup(contextId);
-	}
-	
-	public String getContextString(int contextId, boolean insertPhraseSentinel)
-	{
-		StringBuffer b = new StringBuffer();
-		TIntArrayList c = getContext(contextId);
-		for (int i = 0; i < c.size(); ++i)
-		{
-			if (i > 0) b.append(" ");
-			//if (i == c.size() / 2) b.append("<PHRASE> ");
-			b.append(wordLexicon.lookup(c.get(i)));
-		}
-		return b.toString();
-	}
-	
-	public boolean isSentinel(int wordId)
-	{
-		return wordId == splitSentinel || wordId == phraseSentinel;
-	}
-	
-	List<Edge> readEdges(Reader in) throws IOException
-	{	
-		// read in line-by-line
-		BufferedReader bin = new BufferedReader(in);
-		String line;
-		Pattern separator = Pattern.compile(" \\|\\|\\| ");
-		
-		List<Edge> edges = new ArrayList<Edge>();
-		while ((line = bin.readLine()) != null)
-		{
-			// split into phrase and contexts
-			StringTokenizer st = new StringTokenizer(line, "\t");
-			assert (st.hasMoreTokens());
-			String phraseToks = st.nextToken();
-			assert (st.hasMoreTokens());
-			String rest = st.nextToken();
-			assert (!st.hasMoreTokens());
-
-			// process phrase	
-			st = new StringTokenizer(phraseToks, " ");
-			TIntArrayList ptoks = new TIntArrayList();
-			while (st.hasMoreTokens())
-				ptoks.add(wordLexicon.insert(st.nextToken()));
-			int phraseId = phraseLexicon.insert(ptoks);
-			
-			// process contexts
-			String[] parts = separator.split(rest);
-			assert (parts.length % 2 == 0);
-			for (int i = 0; i < parts.length; i += 2)
-			{
-				// process pairs of strings - context and count
-				String ctxString = parts[i];
-				String countString = parts[i + 1];
-
-				assert (countString.startsWith("C="));
-
-				String []countToks=countString.split(" ");
-				
-				double count = Double.parseDouble(countToks[0].substring(2).trim());
-				
-				TIntArrayList ctx = new TIntArrayList();
-				StringTokenizer ctxStrtok = new StringTokenizer(ctxString, " ");
-				while (ctxStrtok.hasMoreTokens())
-				{
-					String token = ctxStrtok.nextToken();
-					ctx.add(wordLexicon.insert(token));
-				}
-				int contextId = contextLexicon.insert(ctx);
-
-
-				if(countToks.length<2){
-					edges.add(new Edge(phraseId, contextId, count));
-				}
-				else{
-					int tag=Integer.parseInt(countToks[1].substring(2));
-					edges.add(new Edge(phraseId, contextId, count,tag));
-				}
-			}
-		}
-		return edges;
-	}
-	
-	static Corpus readFromFile(Reader in) throws IOException
-	{	
-		Corpus c = new Corpus();
-		c.edges = c.readEdges(in);
-		for (Edge edge: c.edges)
-		{
-			while (edge.getPhraseId() >= c.phraseToContext.size())
-				c.phraseToContext.add(new ArrayList<Edge>());
-			while (edge.getContextId() >= c.contextToPhrase.size())
-				c.contextToPhrase.add(new ArrayList<Edge>());
-			
-			// index the edge for fast phrase, context lookup
-			c.phraseToContext.get(edge.getPhraseId()).add(edge);
-			c.contextToPhrase.get(edge.getContextId()).add(edge);
-		}
-		return c;
-	}
-		
-	TIntArrayList phraseEdges(TIntArrayList phrase)
-	{
-		TIntArrayList r = new TIntArrayList(4);
-		for (int p = 0; p < phrase.size(); ++p)
-		{
-			if (p == 0 || phrase.get(p-1) == splitSentinel) 				
-				r.add(p);
-			if (p == phrase.size() - 1 || phrase.get(p+1) == splitSentinel) 
-				r.add(p);
-		}
-		return r;
-	}
-
-	public void printStats(PrintStream out) 
-	{
-		out.println("Corpus has " + edges.size() + " edges " + phraseLexicon.size() + " phrases " 
-				+ contextLexicon.size() + " contexts and " + wordLexicon.size() + " word types");
-	}
-}
-\ No newline at end of file
author	Chris Dyer <cdyer@cab.ark.cs.cmu.edu>	2012-10-02 00:19:43 -0400
committer	Chris Dyer <cdyer@cab.ark.cs.cmu.edu>	2012-10-02 00:19:43 -0400
commit	e26434979adc33bd949566ba7bf02dff64e80a3e (patch)
tree	d1c72495e3af6301bd28e7e66c42de0c7a944d1f /gi/posterior-regularisation/prjava/src/phrase/Corpus.java
parent	0870d4a1f5e14cc7daf553b180d599f09f6614a2 (diff)