From e26434979adc33bd949566ba7bf02dff64e80a3e Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 2 Oct 2012 00:19:43 -0400 Subject: cdec cleanup, remove bayesian stuff, parsing stuff --- .../prjava/src/data/Corpus.java | 233 --------------------- 1 file changed, 233 deletions(-) delete mode 100644 gi/posterior-regularisation/prjava/src/data/Corpus.java (limited to 'gi/posterior-regularisation/prjava/src/data/Corpus.java') diff --git a/gi/posterior-regularisation/prjava/src/data/Corpus.java b/gi/posterior-regularisation/prjava/src/data/Corpus.java deleted file mode 100644 index 425ede11..00000000 --- a/gi/posterior-regularisation/prjava/src/data/Corpus.java +++ /dev/null @@ -1,233 +0,0 @@ -package data; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashMap; -import java.util.Scanner; - -public class Corpus { - - public static final String alphaFilename="../posdata/corpus.alphabet"; - public static final String tagalphaFilename="../posdata/corpus.tag.alphabet"; - -// public static final String START_SYM=""; - public static final String END_SYM=""; - public static final String NUM_TOK=""; - - public static final String UNK_TOK=""; - - private ArrayListsent; - private ArrayListdata; - - public ArrayListtag; - public ArrayListtagData; - - public static boolean convertNumTok=true; - - private HashMapfreq; - public HashMapvocab; - - public HashMaptagVocab; - private int tagV; - - private int V; - - public static void main(String[] args) { - Corpus c=new Corpus("../posdata/en_test.conll"); - System.out.println( - Arrays.toString(c.get(0)) - ); - System.out.println( - Arrays.toString(c.getInt(0)) - ); - - System.out.println( - Arrays.toString(c.get(1)) - ); - System.out.println( - Arrays.toString(c.getInt(1)) - ); - } - - public Corpus(String filename,HashMapdict){ - V=0; - tagV=0; - freq=new HashMap(); - tagVocab=new HashMap(); - vocab=dict; - - sent=new ArrayList(); - tag=new ArrayList(); - - Scanner sc=io.FileUtil.openInFile(filename); - ArrayLists=new ArrayList(); - // s.add(START_SYM); - while(sc.hasNextLine()){ - String line=sc.nextLine(); - String toks[]=line.split("\t"); - if(toks.length<2){ - s.add(END_SYM); - sent.add(s.toArray(new String[0])); - s=new ArrayList(); - // s.add(START_SYM); - continue; - } - String tok=toks[1].toLowerCase(); - s.add(tok); - } - sc.close(); - - buildData(); - } - - public Corpus(String filename){ - V=0; - freq=new HashMap(); - vocab=new HashMap(); - tagVocab=new HashMap(); - - sent=new ArrayList(); - tag=new ArrayList(); - - System.out.println("Reading:"+filename); - - Scanner sc=io.FileUtil.openInFile(filename); - ArrayLists=new ArrayList(); - ArrayListtags=new ArrayList(); - //s.add(START_SYM); - while(sc.hasNextLine()){ - String line=sc.nextLine(); - String toks[]=line.split("\t"); - if(toks.length<2){ - s.add(END_SYM); - tags.add(END_SYM); - if(s.size()>2){ - sent.add(s.toArray(new String[0])); - tag.add(tags.toArray(new String [0])); - } - s=new ArrayList(); - tags=new ArrayList(); - // s.add(START_SYM); - continue; - } - - String tok=toks[1].toLowerCase(); - if(convertNumTok && tok.matches(".*\\d.*")){ - tok=NUM_TOK; - } - s.add(tok); - - if(toks.length>3){ - tok=toks[3].toLowerCase(); - }else{ - tok="_"; - } - tags.add(tok); - - } - sc.close(); - - for(int i=0;i(); - for(int i=0;i(); - for(int i=0;i2){ - vocab.put(key, V); - V++; - } - } - io.SerializedObjects.writeSerializedObject(vocab, alphaFilename); - io.SerializedObjects.writeSerializedObject(tagVocab,tagalphaFilename); - } - - private void addTag(String tag){ - Integer i=tagVocab.get(tag); - if(i==null){ - tagVocab.put(tag, tagV); - tagV++; - } - } - -} -- cgit v1.2.3