summaryrefslogtreecommitdiff
path: root/report/pyp_clustering/acl09-short/code/pygibbs_geom.c
diff options
context:
space:
mode:
Diffstat (limited to 'report/pyp_clustering/acl09-short/code/pygibbs_geom.c')
-rw-r--r--report/pyp_clustering/acl09-short/code/pygibbs_geom.c212
1 files changed, 212 insertions, 0 deletions
diff --git a/report/pyp_clustering/acl09-short/code/pygibbs_geom.c b/report/pyp_clustering/acl09-short/code/pygibbs_geom.c
new file mode 100644
index 00000000..bafa0416
--- /dev/null
+++ b/report/pyp_clustering/acl09-short/code/pygibbs_geom.c
@@ -0,0 +1,212 @@
+#include <stdio.h>
+#include <math.h>
+
+#define myrand() (double) (((unsigned long) randomMT()) / 4294967296.)
+
+#define W 30114
+#define N 831190
+#define KWMAX 5000
+
+#define NLOOPS 11000
+#define BURNIN 1000
+#define SAMPLEFREQ 10
+
+#define ALPHA 0.0 // PYB a
+//#define GAMMA 1000000000.0
+#define GAMMA .01 // Dirichlet over multinomial P0
+
+double BETA; // CRP alpha (PYB b)
+int w[N], z[N]; // words, table assignments
+double base[N]; // base prob of word under geometric
+int typecount[W], typetot; //# of tables of each type, total # tables
+int usedcount[W];
+double ztot[W][KWMAX];
+double k; // total # tables
+int nactive;
+
+void initialise(void);
+void anderson(void);
+void fileread(void);
+
+void initialise(void)
+{
+ int i,j;
+
+ for (i = 1; i < W; i++) {
+ typecount[i] = 0;
+ usedcount[i] = 0;
+ for (j = 0; j < KWMAX; j++) {
+ ztot[i][j] = 0;
+ }
+ }
+
+}
+
+double base_p(int len) {
+ double p = 1.0/26;
+ return pow(p,len)*pow(.5,len); //assume p_# = .5
+}
+
+void anderson(void) //stochastic Anderson-style initialisation
+{
+ int i,j, tag;
+ double max, totprob, r, runtot;
+ double probs[KWMAX];
+ int ind, temp;
+
+ ztot[w[0]][0] = 1;
+ z[0] = 0;
+ typecount[w[0]] = 1;
+ usedcount[w[0]] = 1;
+ k = 1;
+ typetot = 1;
+
+ for (i = 1; i < N; i++) {
+ // printf("%5d\n", w[i]);
+ max = 0; tag = 0; totprob = 0;
+ for (j = 0; j < usedcount[w[i]]; j++) {
+ probs[j] = ztot[w[i]][j] - ALPHA;
+ totprob += probs[j];
+ }
+ probs[usedcount[w[i]]] = (ALPHA*k+BETA)*base[i];
+ totprob += probs[usedcount[w[i]]];
+ // printf("%10.6lf\n",totprob);
+ r = myrand()*totprob;
+ max = probs[0];
+ j = 0;
+ while (r>max) {
+ j++;
+ max += probs[j];
+ }
+ // printf("%5d\n",j);
+ z[i] = j;
+ ztot[w[i]][j]++;
+ if (ztot[w[i]][j]==1) {
+ typecount[w[i]]++;
+ usedcount[w[i]]++;
+ if (usedcount[w[i]]==KWMAX) {
+ printf("Maximum number of tables exceeded!!!\n");
+ }
+ typetot++;
+ k++;
+ }
+ }
+}
+
+void fileread(void)
+{
+ int i,j, wt, len;
+ FILE *fileptr;
+
+ fileptr = fopen("wsj.dat", "r");
+
+ for (i = 1; i < N; i++) {
+ fscanf(fileptr, "%d", &wt);
+ w[i] = wt-1;
+ z[i] = 0;
+ }
+ printf("Total cases: %10d\n", N);
+ fclose(fileptr);
+
+ fileptr = fopen("wsj_lengths.dat", "r");
+
+ for (i = 1; i < N; i++) {
+ fscanf(fileptr, "%d", &len);
+ base[i] = base_p(len);
+ }
+ fclose(fileptr);
+}
+
+main(int argc, char* argv[])
+{
+ int i,j,loop,run;
+ int temp,ind, tag;
+ double newprob, WBETA;
+ double probs[KWMAX];
+ double max, totprob, r;
+ int sampcount;
+ FILE *fileptr;
+ char filename[30];
+ double score;
+
+ if (argc < 2) {
+ printf("Please provide a value of b\n");
+ exit(0);
+ }
+ BETA = strtol(argv[1]);
+ printf("Basic initialising...\n");
+
+ // you can seed with any uint32, but the best are odds in 0..(2^32 - 1)
+ seedMT(4157U);
+
+ sprintf(filename,"typecountrecordwsjgeom%0.1f.%0.1f.dat",ALPHA,BETA);
+ fileptr = fopen(filename, "w");
+
+ printf("Reading from file...\n");
+ fileread();
+
+ printf("Initialising...\n");
+ initialise();
+ printf("k = %1.0f, typetot = %d\n",k,typetot);
+
+ printf("Finding start state...\n");
+ anderson();
+ printf("Beginning burnin...\n");
+ for (loop = 0; loop < NLOOPS; loop++) {
+ for (i = 0; i < N; i++) {
+ j = z[i];
+ ztot[w[i]][j]--;
+ if (ztot[w[i]][j] == 0) {
+ if (j==usedcount[w[i]]) {
+ usedcount[w[i]]--;
+ }
+ typecount[w[i]]--;
+ typetot--;
+ k--;
+ }
+ max = 0; tag = 0; totprob = 0;
+ for (j = 0; j <= usedcount[w[i]]; j++) {
+ if (ztot[w[i]][j] > 0) {
+ probs[j] = ztot[w[i]][j] - ALPHA;
+ } else {
+ probs[j] = 0;
+ if (tag == 0) {
+ probs[j] = (ALPHA*k+BETA)*base[i];
+ tag = 1;
+ }
+ }
+ totprob += probs[j];
+ }
+ r = myrand()*totprob;
+ max = probs[0];
+ j = 0;
+ while (r>max) {
+ j++;
+ max += probs[j];
+ }
+ z[i] = j;
+ ztot[w[i]][j]++;
+ if (ztot[w[i]][j]==1) {
+ if (j == usedcount[w[i]]) {
+ usedcount[w[i]]++;
+ if (usedcount[w[i]]==KWMAX) {
+ printf("Maximum number of tables exceeded!!!\n");
+ }
+ }
+ typecount[w[i]]++;
+ typetot++;
+ k++;
+ }
+ }
+ printf("Completed sample # %5d\n", loop);
+ if (k != typetot) printf("k = %1.0f, typetot = %d\n",k,typetot);
+ if (loop >= BURNIN && loop % SAMPLEFREQ == 0) {
+ for (i = 0; i < W; i++) {
+ fprintf(fileptr," %d", typecount[i]); //print (table?) count for each word type
+ }
+ fprintf(fileptr,"\n");
+ }
+ }
+ fclose(fileptr);
+}
+