diff options
author | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-27 16:13:19 +0000 |
---|---|---|
committer | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-27 16:13:19 +0000 |
commit | fd519b0e45c857b266814994ba8c1421f508e522 (patch) | |
tree | 6d50c9b954e3c13e9df627c1ecc25c53544a5f58 /report/pyp_clustering/acl09-short/code/pygibbs3.c | |
parent | 4c5df460c9da5c935438850ef7993463a9113286 (diff) |
preso
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@435 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'report/pyp_clustering/acl09-short/code/pygibbs3.c')
-rw-r--r-- | report/pyp_clustering/acl09-short/code/pygibbs3.c | 198 |
1 files changed, 198 insertions, 0 deletions
diff --git a/report/pyp_clustering/acl09-short/code/pygibbs3.c b/report/pyp_clustering/acl09-short/code/pygibbs3.c new file mode 100644 index 00000000..3c2240a1 --- /dev/null +++ b/report/pyp_clustering/acl09-short/code/pygibbs3.c @@ -0,0 +1,198 @@ +#include <stdio.h> +#include <math.h> + +#define myrand() (double) (((unsigned long) randomMT()) / 4294967296.) + +#define W 30114 +#define N 831190 +#define KWMAX 1000 + +#define NLOOPS 1000 +#define BURNIN 0 +#define SAMPLEFREQ 1 + +#define ALPHA 0.0 // PYB a +//#define GAMMA 1000000000.0 +#define GAMMA .01 // Dirichlet over multinomial P0 + +double BETA; // CRP alpha (PYB b) +int w[N], z[N]; // words, table assignments +int typecount[W], typetot; //# of tables of each type, total # tables +int usedcount[W]; +double ztot[W][KWMAX]; +double k; // total # tables +int nactive; + +void initialise(void); +void anderson(void); +void fileread(void); + +void initialise(void) +{ + int i,j; + + for (i = 1; i < W; i++) { + typecount[i] = 0; + usedcount[i] = 0; + for (j = 0; j < KWMAX; j++) { + ztot[i][j] = 0; + } + } + +} + +void anderson(void) //stochastic Anderson-style initialisation +{ + int i,j, tag; + double max, totprob, r, runtot; + double probs[KWMAX]; + int ind, temp; + + ztot[w[0]][0] = 1; + z[0] = 0; + typecount[w[0]] = 1; + usedcount[w[0]] = 1; + k = 1; + typetot = 1; + + for (i = 1; i < N; i++) { + // printf("%5d\n", w[i]); + max = 0; tag = 0; totprob = 0; + for (j = 0; j < usedcount[w[i]]; j++) { + probs[j] = ztot[w[i]][j] - ALPHA; + totprob += probs[j]; + } + probs[usedcount[w[i]]] = (ALPHA*k+BETA)*((double) typecount[w[i]]+GAMMA)/((double) typetot+W*GAMMA); + totprob += probs[usedcount[w[i]]]; + // printf("%10.6lf\n",totprob); + r = myrand()*totprob; + max = probs[0]; + j = 0; + while (r>max) { + j++; + max += probs[j]; + } + // printf("%5d\n",j); + z[i] = j; + ztot[w[i]][j]++; + if (ztot[w[i]][j]==1) { + typecount[w[i]]++; + usedcount[w[i]]++; + if (usedcount[w[i]]==KWMAX) { + printf("Maximum number of tables exceeded!!!\n"); + } + typetot++; + k++; + } + } +} + +void fileread(void) +{ + int i,j, wt; + FILE *fileptr; + + fileptr = fopen("wsj.dat", "r"); + + for (i = 1; i < N; i++) { + fscanf(fileptr, "%d", &wt); + w[i] = wt-1; + z[i] = 0; + } + printf("Total cases: %10d\n", N); + fclose(fileptr); +} + +main(int argc, char* argv[]) +{ + int i,j,loop,run; + int temp,ind, tag; + double newprob, WBETA; + double probs[KWMAX]; + double max, totprob, r; + int sampcount; + FILE *fileptr; + char filename[30]; + double score; + + if (argc < 2) { + printf("Please provide a value of b\n"); + exit(0); + } + BETA = strtol(argv[1]); + printf("Basic initialising...\n"); + + // you can seed with any uint32, but the best are odds in 0..(2^32 - 1) + seedMT(4157U); + + sprintf(filename,"typecountrecordwsjpeak%0.1f.%0.1f.dat",ALPHA,BETA); + fileptr = fopen(filename, "w"); + + printf("Reading from file...\n"); + fileread(); + + printf("Initialising...\n"); + initialise(); + printf("k = %1.0f, typetot = %d\n",k,typetot); + + printf("Finding start state...\n"); + anderson(); + printf("Beginning burnin...\n"); + for (loop = 0; loop < NLOOPS; loop++) { + for (i = 0; i < N; i++) { + j = z[i]; + ztot[w[i]][j]--; + if (ztot[w[i]][j] == 0) { + if (j==usedcount[w[i]]) { + usedcount[w[i]]--; + } + typecount[w[i]]--; + typetot--; + k--; + } + max = 0; tag = 0; totprob = 0; + for (j = 0; j <= usedcount[w[i]]; j++) { + if (ztot[w[i]][j] > 0) { + probs[j] = ztot[w[i]][j] - ALPHA; + } else { + probs[j] = 0; + if (tag == 0) { + probs[j] = (ALPHA*k+BETA)*(((double) typecount[w[i]])+GAMMA)/(((double) typetot)+((double) W)*GAMMA); + tag = 1; + } + } + totprob += probs[j]; + } + r = myrand()*totprob; + max = probs[0]; + j = 0; + while (r>max) { + j++; + max += probs[j]; + } + z[i] = j; + ztot[w[i]][j]++; + if (ztot[w[i]][j]==1) { + if (j == usedcount[w[i]]) { + usedcount[w[i]]++; + if (usedcount[w[i]]==KWMAX) { + printf("Maximum number of tables exceeded!!!\n"); + } + } + typecount[w[i]]++; + typetot++; + k++; + } + } + printf("Completed sample # %5d\n", loop); + if (k != typetot) printf("k = %1.0f, typetot = %d\n",k,typetot); + if (loop >= BURNIN && loop % SAMPLEFREQ == 0) { + for (i = 0; i < W; i++) { + fprintf(fileptr," %d", typecount[i]); //print (table?) count for each word type + } + fprintf(fileptr,"\n"); + } + } + fclose(fileptr); +} + |