From ad418214fe3b3fcd33d81225eb3d3fb08b67f88a Mon Sep 17 00:00:00 2001 From: desaicwtf Date: Mon, 28 Jun 2010 23:14:21 +0000 Subject: add draft version of POS induction with HMM and L1 Linf constraints git-svn-id: https://ws10smt.googlecode.com/svn/trunk@47 ec762483-ff6d-05da-a07a-a48fb63a330f --- .../prjava/src/data/Corpus.java | 230 +++++++++++++++++++++ 1 file changed, 230 insertions(+) create mode 100644 gi/posterior-regularisation/prjava/src/data/Corpus.java (limited to 'gi/posterior-regularisation/prjava/src/data') diff --git a/gi/posterior-regularisation/prjava/src/data/Corpus.java b/gi/posterior-regularisation/prjava/src/data/Corpus.java new file mode 100644 index 00000000..f0da0b33 --- /dev/null +++ b/gi/posterior-regularisation/prjava/src/data/Corpus.java @@ -0,0 +1,230 @@ +package data; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Scanner; + +public class Corpus { + + public static final String alphaFilename="../posdata/corpus.alphabet"; + public static final String tagalphaFilename="../posdata/corpus.tag.alphabet"; + +// public static final String START_SYM=""; + public static final String END_SYM=""; + public static final String NUM_TOK=""; + + public static final String UNK_TOK=""; + + private ArrayListsent; + private ArrayListdata; + + public ArrayListtag; + public ArrayListtagData; + + public static boolean convertNumTok=true; + + private HashMapfreq; + public HashMapvocab; + + public HashMaptagVocab; + private int tagV; + + private int V; + + public static void main(String[] args) { + Corpus c=new Corpus("../posdata/en_test.conll"); + System.out.println( + Arrays.toString(c.get(0)) + ); + System.out.println( + Arrays.toString(c.getInt(0)) + ); + + System.out.println( + Arrays.toString(c.get(1)) + ); + System.out.println( + Arrays.toString(c.getInt(1)) + ); + } + + public Corpus(String filename,HashMapdict){ + V=0; + tagV=0; + freq=new HashMap(); + tagVocab=new HashMap(); + vocab=dict; + + sent=new ArrayList(); + tag=new ArrayList(); + + Scanner sc=io.FileUtil.openInFile(filename); + ArrayLists=new ArrayList(); + // s.add(START_SYM); + while(sc.hasNextLine()){ + String line=sc.nextLine(); + String toks[]=line.split("\t"); + if(toks.length<2){ + s.add(END_SYM); + sent.add(s.toArray(new String[0])); + s=new ArrayList(); + // s.add(START_SYM); + continue; + } + String tok=toks[1].toLowerCase(); + s.add(tok); + } + sc.close(); + + buildData(); + } + + public Corpus(String filename){ + V=0; + freq=new HashMap(); + vocab=new HashMap(); + tagVocab=new HashMap(); + + sent=new ArrayList(); + tag=new ArrayList(); + + System.out.println("Reading:"+filename); + + Scanner sc=io.FileUtil.openInFile(filename); + ArrayLists=new ArrayList(); + ArrayListtags=new ArrayList(); + //s.add(START_SYM); + while(sc.hasNextLine()){ + String line=sc.nextLine(); + String toks[]=line.split("\t"); + if(toks.length<2){ + s.add(END_SYM); + tags.add(END_SYM); + if(s.size()>2){ + sent.add(s.toArray(new String[0])); + tag.add(tags.toArray(new String [0])); + } + s=new ArrayList(); + tags=new ArrayList(); + // s.add(START_SYM); + continue; + } + + String tok=toks[1].toLowerCase(); + if(convertNumTok && tok.matches(".*\\d.*")){ + tok=NUM_TOK; + } + s.add(tok); + + if(toks.length>3){ + tok=toks[3].toLowerCase(); + }else{ + tok="_"; + } + tags.add(tok); + + } + sc.close(); + + for(int i=0;i(); + for(int i=0;i(); + for(int i=0;i2){ + vocab.put(key, V); + V++; + } + } + io.SerializedObjects.writeSerializedObject(vocab, alphaFilename); + io.SerializedObjects.writeSerializedObject(tagVocab,tagalphaFilename); + } + + private void addTag(String tag){ + Integer i=tagVocab.get(tag); + if(i==null){ + tagVocab.put(tag, tagV); + tagV++; + } + } + +} -- cgit v1.2.3