package data; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.Scanner; public class Corpus { public static final String alphaFilename="../posdata/corpus.alphabet"; public static final String tagalphaFilename="../posdata/corpus.tag.alphabet"; // public static final String START_SYM=""; public static final String END_SYM=""; public static final String NUM_TOK=""; public static final String UNK_TOK=""; private ArrayListsent; private ArrayListdata; public ArrayListtag; public ArrayListtagData; public static boolean convertNumTok=true; private HashMapfreq; public HashMapvocab; public HashMaptagVocab; private int tagV; private int V; public static void main(String[] args) { Corpus c=new Corpus("../posdata/en_test.conll"); System.out.println( Arrays.toString(c.get(0)) ); System.out.println( Arrays.toString(c.getInt(0)) ); System.out.println( Arrays.toString(c.get(1)) ); System.out.println( Arrays.toString(c.getInt(1)) ); } public Corpus(String filename,HashMapdict){ V=0; tagV=0; freq=new HashMap(); tagVocab=new HashMap(); vocab=dict; sent=new ArrayList(); tag=new ArrayList(); Scanner sc=io.FileUtil.openInFile(filename); ArrayLists=new ArrayList(); // s.add(START_SYM); while(sc.hasNextLine()){ String line=sc.nextLine(); String toks[]=line.split("\t"); if(toks.length<2){ s.add(END_SYM); sent.add(s.toArray(new String[0])); s=new ArrayList(); // s.add(START_SYM); continue; } String tok=toks[1].toLowerCase(); s.add(tok); } sc.close(); buildData(); } public Corpus(String filename){ V=0; freq=new HashMap(); vocab=new HashMap(); tagVocab=new HashMap(); sent=new ArrayList(); tag=new ArrayList(); System.out.println("Reading:"+filename); Scanner sc=io.FileUtil.openInFile(filename); ArrayLists=new ArrayList(); ArrayListtags=new ArrayList(); //s.add(START_SYM); while(sc.hasNextLine()){ String line=sc.nextLine(); String toks[]=line.split("\t"); if(toks.length<2){ s.add(END_SYM); tags.add(END_SYM); if(s.size()>2){ sent.add(s.toArray(new String[0])); tag.add(tags.toArray(new String [0])); } s=new ArrayList(); tags=new ArrayList(); // s.add(START_SYM); continue; } String tok=toks[1].toLowerCase(); if(convertNumTok && tok.matches(".*\\d.*")){ tok=NUM_TOK; } s.add(tok); if(toks.length>3){ tok=toks[3].toLowerCase(); }else{ tok="_"; } tags.add(tok); } sc.close(); for(int i=0;i(); for(int i=0;i(); for(int i=0;i2){ vocab.put(key, V); V++; } } io.SerializedObjects.writeSerializedObject(vocab, alphaFilename); io.SerializedObjects.writeSerializedObject(tagVocab,tagalphaFilename); } private void addTag(String tag){ Integer i=tagVocab.get(tag); if(i==null){ tagVocab.put(tag, tagV); tagV++; } } }