summaryrefslogtreecommitdiff
path: root/report/pyp_clustering/acl09-short/code
diff options
context:
space:
mode:
authorredpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-07-27 16:13:19 +0000
committerredpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-07-27 16:13:19 +0000
commitfd519b0e45c857b266814994ba8c1421f508e522 (patch)
tree6d50c9b954e3c13e9df627c1ecc25c53544a5f58 /report/pyp_clustering/acl09-short/code
parent4c5df460c9da5c935438850ef7993463a9113286 (diff)
preso
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@435 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'report/pyp_clustering/acl09-short/code')
-rw-r--r--report/pyp_clustering/acl09-short/code/antoniakpred.m12
-rw-r--r--report/pyp_clustering/acl09-short/code/approximations.eps897
-rw-r--r--report/pyp_clustering/acl09-short/code/cohnpred.m12
-rw-r--r--report/pyp_clustering/acl09-short/code/cokus.c167
-rw-r--r--report/pyp_clustering/acl09-short/code/crppred.m12
-rw-r--r--report/pyp_clustering/acl09-short/code/crppred_geom.m12
-rw-r--r--report/pyp_clustering/acl09-short/code/logbinmean.m38
-rw-r--r--report/pyp_clustering/acl09-short/code/noP0pred.m11
-rw-r--r--report/pyp_clustering/acl09-short/code/plot0.eps633
-rw-r--r--report/pyp_clustering/acl09-short/code/plot0.pdfbin0 -> 20351 bytes
-rw-r--r--report/pyp_clustering/acl09-short/code/plot1.eps579
-rw-r--r--report/pyp_clustering/acl09-short/code/plot1.pdfbin0 -> 17830 bytes
-rw-r--r--report/pyp_clustering/acl09-short/code/plot2.eps552
-rw-r--r--report/pyp_clustering/acl09-short/code/plot2.pdfbin0 -> 14992 bytes
-rw-r--r--report/pyp_clustering/acl09-short/code/plot3.eps721
-rw-r--r--report/pyp_clustering/acl09-short/code/plot3.pdfbin0 -> 19393 bytes
-rw-r--r--report/pyp_clustering/acl09-short/code/pygibbs3.c198
-rwxr-xr-xreport/pyp_clustering/acl09-short/code/pygibbs_geombin0 -> 9705 bytes
-rw-r--r--report/pyp_clustering/acl09-short/code/pygibbs_geom.c212
-rwxr-xr-xreport/pyp_clustering/acl09-short/code/run-peak.prl8
-rwxr-xr-xreport/pyp_clustering/acl09-short/code/run.prl8
-rwxr-xr-xreport/pyp_clustering/acl09-short/code/word_lengths.prl21
-rw-r--r--report/pyp_clustering/acl09-short/code/wsjplots2.m99
-rw-r--r--report/pyp_clustering/acl09-short/code/wsjplots_acl.m74
-rw-r--r--report/pyp_clustering/acl09-short/code/wsjplots_acl_monkeys.m164
-rw-r--r--report/pyp_clustering/acl09-short/code/wsjplots_acl_pair.m117
-rw-r--r--report/pyp_clustering/acl09-short/code/wsjplots_acl_talk0.m54
-rw-r--r--report/pyp_clustering/acl09-short/code/wsjplots_acl_talk1.m59
-rw-r--r--report/pyp_clustering/acl09-short/code/wsjplots_acl_talk2.m58
-rw-r--r--report/pyp_clustering/acl09-short/code/wsjplots_acl_talk3.m74
-rw-r--r--report/pyp_clustering/acl09-short/code/wsjplots_cl.m99
31 files changed, 4891 insertions, 0 deletions
diff --git a/report/pyp_clustering/acl09-short/code/antoniakpred.m b/report/pyp_clustering/acl09-short/code/antoniakpred.m
new file mode 100644
index 00000000..c4153c04
--- /dev/null
+++ b/report/pyp_clustering/acl09-short/code/antoniakpred.m
@@ -0,0 +1,12 @@
+function output = antoniakpred(input,b)
+% ANTONIAKPRED  Log-form prediction a*log((a+n)/a) for every count n in INPUT.
+%   a = b*p0 with the hard-coded p0 = 1/30114 (presumably a uniform base
+%   probability over 30114 types -- confirm against the caller).
+%   INPUT must hold positive integers; OUTPUT(k) is the value for INPUT(k).
+p0 = 1/30114;
+a = b*p0;
+lookup = zeros(max(input),1); % lookup(n) = prediction for count n
+for n = reshape(unique(input),1,[]) % visit each distinct count once
+ lookup(n) = a*log((a+n)/a);
+end
+output = lookup(input);
diff --git a/report/pyp_clustering/acl09-short/code/approximations.eps b/report/pyp_clustering/acl09-short/code/approximations.eps
new file mode 100644
index 00000000..67857497
--- /dev/null
+++ b/report/pyp_clustering/acl09-short/code/approximations.eps
@@ -0,0 +1,897 @@
+%!PS-Adobe-3.0 EPSF-3.0
+%%Creator: MATLAB, The Mathworks, Inc. Version 7.7.0.471 (R2008b). Operating System: Linux 2.6.18-92.1.13.el5.inf.1PAE #1 SMP Mon Oct 20 10:33:44 BST 2008 i686.
+%%Title: /afs/inf.ed.ac.uk/user/s/sgwater/research/papers/2009/acl09-short/code/approximations.eps
+%%CreationDate: 04/25/2009 11:31:18
+%%DocumentNeededFonts: Helvetica
+%%DocumentProcessColors: Cyan Magenta Yellow Black
+%%LanguageLevel: 2
+%%Pages: 1
+%%BoundingBox: 89 164 503 676
+%%EndComments
+
+%%BeginProlog
+% MathWorks dictionary
+/MathWorks 160 dict begin
+% definition operators
+/bdef {bind def} bind def
+/ldef {load def} bind def
+/xdef {exch def} bdef
+/xstore {exch store} bdef
+% operator abbreviations
+/c /clip ldef
+/cc /concat ldef
+/cp /closepath ldef
+/gr /grestore ldef
+/gs /gsave ldef
+/mt /moveto ldef
+/np /newpath ldef
+/cm /currentmatrix ldef
+/sm /setmatrix ldef
+/rm /rmoveto ldef
+/rl /rlineto ldef
+/s {show newpath} bdef
+/sc {setcmykcolor} bdef
+/sr /setrgbcolor ldef
+/sg /setgray ldef
+/w /setlinewidth ldef
+/j /setlinejoin ldef
+/cap /setlinecap ldef
+/rc {rectclip} bdef
+/rf {rectfill} bdef
+% page state control
+/pgsv () def
+/bpage {/pgsv save def} bdef
+/epage {pgsv restore} bdef
+/bplot /gsave ldef
+/eplot {stroke grestore} bdef
+% orientation switch
+/portraitMode 0 def /landscapeMode 1 def /rotateMode 2 def
+% coordinate system mappings
+/dpi2point 0 def
+% font control
+/FontSize 0 def
+/FMS {/FontSize xstore findfont [FontSize 0 0 FontSize neg 0 0]
+ makefont setfont} bdef
+/reencode {exch dup where {pop load} {pop StandardEncoding} ifelse
+ exch dup 3 1 roll findfont dup length dict begin
+ { 1 index /FID ne {def}{pop pop} ifelse } forall
+ /Encoding exch def currentdict end definefont pop} bdef
+/isroman {findfont /CharStrings get /Agrave known} bdef
+/FMSR {3 1 roll 1 index dup isroman {reencode} {pop pop} ifelse
+ exch FMS} bdef
+/csm {1 dpi2point div -1 dpi2point div scale neg translate
+ dup landscapeMode eq {pop -90 rotate}
+ {rotateMode eq {90 rotate} if} ifelse} bdef
+% line types: solid, dotted, dashed, dotdash
+/SO { [] 0 setdash } bdef
+/DO { [.5 dpi2point mul 4 dpi2point mul] 0 setdash } bdef
+/DA { [6 dpi2point mul] 0 setdash } bdef
+/DD { [.5 dpi2point mul 4 dpi2point mul 6 dpi2point mul 4
+ dpi2point mul] 0 setdash } bdef
+% macros for lines and objects
+/L {lineto stroke} bdef
+/MP {3 1 roll moveto 1 sub {rlineto} repeat} bdef
+/AP {{rlineto} repeat} bdef
+/PDlw -1 def
+/W {/PDlw currentlinewidth def setlinewidth} def
+/PP {closepath eofill} bdef
+/DP {closepath stroke} bdef
+/MR {4 -2 roll moveto dup 0 exch rlineto exch 0 rlineto
+ neg 0 exch rlineto closepath} bdef
+/FR {MR stroke} bdef
+/PR {MR fill} bdef
+/L1i {{currentfile picstr readhexstring pop} image} bdef
+/tMatrix matrix def
+/MakeOval {newpath tMatrix currentmatrix pop translate scale
+0 0 1 0 360 arc tMatrix setmatrix} bdef
+/FO {MakeOval stroke} bdef
+/PO {MakeOval fill} bdef
+/PD {currentlinewidth 2 div 0 360 arc fill
+ PDlw -1 eq not {PDlw w /PDlw -1 def} if} def
+/FA {newpath tMatrix currentmatrix pop translate scale
+ 0 0 1 5 -2 roll arc tMatrix setmatrix stroke} bdef
+/PA {newpath tMatrix currentmatrix pop translate 0 0 moveto scale
+ 0 0 1 5 -2 roll arc closepath tMatrix setmatrix fill} bdef
+/FAn {newpath tMatrix currentmatrix pop translate scale
+ 0 0 1 5 -2 roll arcn tMatrix setmatrix stroke} bdef
+/PAn {newpath tMatrix currentmatrix pop translate 0 0 moveto scale
+ 0 0 1 5 -2 roll arcn closepath tMatrix setmatrix fill} bdef
+/vradius 0 def /hradius 0 def /lry 0 def
+/lrx 0 def /uly 0 def /ulx 0 def /rad 0 def
+/MRR {/vradius xdef /hradius xdef /lry xdef /lrx xdef /uly xdef
+ /ulx xdef newpath tMatrix currentmatrix pop ulx hradius add uly
+ vradius add translate hradius vradius scale 0 0 1 180 270 arc
+ tMatrix setmatrix lrx hradius sub uly vradius add translate
+ hradius vradius scale 0 0 1 270 360 arc tMatrix setmatrix
+ lrx hradius sub lry vradius sub translate hradius vradius scale
+ 0 0 1 0 90 arc tMatrix setmatrix ulx hradius add lry vradius sub
+ translate hradius vradius scale 0 0 1 90 180 arc tMatrix setmatrix
+ closepath} bdef
+/FRR {MRR stroke } bdef
+/PRR {MRR fill } bdef
+/MlrRR {/lry xdef /lrx xdef /uly xdef /ulx xdef /rad lry uly sub 2 div def
+ newpath tMatrix currentmatrix pop ulx rad add uly rad add translate
+ rad rad scale 0 0 1 90 270 arc tMatrix setmatrix lrx rad sub lry rad
+ sub translate rad rad scale 0 0 1 270 90 arc tMatrix setmatrix
+ closepath} bdef
+/FlrRR {MlrRR stroke } bdef
+/PlrRR {MlrRR fill } bdef
+/MtbRR {/lry xdef /lrx xdef /uly xdef /ulx xdef /rad lrx ulx sub 2 div def
+ newpath tMatrix currentmatrix pop ulx rad add uly rad add translate
+ rad rad scale 0 0 1 180 360 arc tMatrix setmatrix lrx rad sub lry rad
+ sub translate rad rad scale 0 0 1 0 180 arc tMatrix setmatrix
+ closepath} bdef
+/FtbRR {MtbRR stroke } bdef
+/PtbRR {MtbRR fill } bdef
+/stri 6 array def /dtri 6 array def
+/smat 6 array def /dmat 6 array def
+/tmat1 6 array def /tmat2 6 array def /dif 3 array def
+/asub {/ind2 exch def /ind1 exch def dup dup
+ ind1 get exch ind2 get sub exch } bdef
+/tri_to_matrix {
+ 2 0 asub 3 1 asub 4 0 asub 5 1 asub
+ dup 0 get exch 1 get 7 -1 roll astore } bdef
+/compute_transform {
+ dmat dtri tri_to_matrix tmat1 invertmatrix
+ smat stri tri_to_matrix tmat2 concatmatrix } bdef
+/ds {stri astore pop} bdef
+/dt {dtri astore pop} bdef
+/db {2 copy /cols xdef /rows xdef mul dup 3 mul string
+ currentfile
+ 3 index 0 eq {/ASCIIHexDecode filter}
+ {/ASCII85Decode filter 3 index 2 eq {/RunLengthDecode filter} if }
+ ifelse exch readstring pop
+ dup 0 3 index getinterval /rbmap xdef
+ dup 2 index dup getinterval /gbmap xdef
+ 1 index dup 2 mul exch getinterval /bbmap xdef pop pop}bdef
+/it {gs np dtri aload pop moveto lineto lineto cp c
+ cols rows 8 compute_transform
+ rbmap gbmap bbmap true 3 colorimage gr}bdef
+/il {newpath moveto lineto stroke}bdef
+currentdict end def
+%%EndProlog
+
+%%BeginSetup
+MathWorks begin
+
+0 cap
+
+end
+%%EndSetup
+
+%%Page: 1 1
+%%BeginPageSetup
+%%PageBoundingBox: 89 164 503 676
+MathWorks begin
+bpage
+%%EndPageSetup
+
+%%BeginObject: obj1
+bplot
+
+/dpi2point 12 def
+portraitMode 1068 8112 csm
+
+ 0 0 4976 6135 rc
+86 dict begin %Colortable dictionary
+/c0 { 0.000000 0.000000 0.000000 sr} bdef
+/c1 { 1.000000 1.000000 1.000000 sr} bdef
+/c2 { 0.900000 0.000000 0.000000 sr} bdef
+/c3 { 0.000000 0.820000 0.000000 sr} bdef
+/c4 { 0.000000 0.000000 0.800000 sr} bdef
+/c5 { 0.910000 0.820000 0.320000 sr} bdef
+/c6 { 1.000000 0.260000 0.820000 sr} bdef
+/c7 { 0.000000 0.820000 0.820000 sr} bdef
+c0
+1 j
+1 sg
+ 0 0 4977 6136 rf
+6 w
+0 5000 3856 0 0 -5000 647 5460 4 MP
+PP
+-3856 0 0 5000 3856 0 0 -5000 647 5460 5 MP stroke
+4 w
+DO
+SO
+6 w
+0 sg
+ 647 5460 mt 4503 5460 L
+ 647 460 mt 4503 460 L
+ 647 5460 mt 647 460 L
+4503 5460 mt 4503 460 L
+ 647 5460 mt 4503 5460 L
+ 647 5460 mt 647 460 L
+ 754 5460 mt 754 5410 L
+ 754 460 mt 754 510 L
+%%IncludeResource: font Helvetica
+/Helvetica /ISOLatin1Encoding 168 FMSR
+
+ 708 5650 mt
+(1) s
+1076 5460 mt 1076 5410 L
+1076 460 mt 1076 510 L
+1053 5650 mt
+( ) s
+1265 5460 mt 1265 5410 L
+1265 460 mt 1265 510 L
+1242 5650 mt
+( ) s
+1398 5460 mt 1398 5410 L
+1398 460 mt 1398 510 L
+1375 5650 mt
+( ) s
+1502 5460 mt 1502 5410 L
+1502 460 mt 1502 510 L
+1479 5650 mt
+( ) s
+1587 5460 mt 1587 5410 L
+1587 460 mt 1587 510 L
+1564 5650 mt
+( ) s
+1659 5460 mt 1659 5410 L
+1659 460 mt 1659 510 L
+1636 5650 mt
+( ) s
+1721 5460 mt 1721 5410 L
+1721 460 mt 1721 510 L
+1698 5650 mt
+( ) s
+1776 5460 mt 1776 5410 L
+1776 460 mt 1776 510 L
+1753 5650 mt
+( ) s
+1825 5460 mt 1825 5410 L
+1825 460 mt 1825 510 L
+1732 5650 mt
+(10) s
+2147 5460 mt 2147 5410 L
+2147 460 mt 2147 510 L
+2124 5650 mt
+( ) s
+2336 5460 mt 2336 5410 L
+2336 460 mt 2336 510 L
+2313 5650 mt
+( ) s
+2470 5460 mt 2470 5410 L
+2470 460 mt 2470 510 L
+2447 5650 mt
+( ) s
+2573 5460 mt 2573 5410 L
+2573 460 mt 2573 510 L
+2550 5650 mt
+( ) s
+2658 5460 mt 2658 5410 L
+2658 460 mt 2658 510 L
+2635 5650 mt
+( ) s
+2730 5460 mt 2730 5410 L
+2730 460 mt 2730 510 L
+2707 5650 mt
+( ) s
+2792 5460 mt 2792 5410 L
+2792 460 mt 2792 510 L
+2769 5650 mt
+( ) s
+2847 5460 mt 2847 5410 L
+2847 460 mt 2847 510 L
+2824 5650 mt
+( ) s
+2896 5460 mt 2896 5410 L
+2896 460 mt 2896 510 L
+2756 5650 mt
+(100) s
+3218 5460 mt 3218 5410 L
+3218 460 mt 3218 510 L
+3195 5650 mt
+( ) s
+3407 5460 mt 3407 5410 L
+3407 460 mt 3407 510 L
+3384 5650 mt
+( ) s
+3541 5460 mt 3541 5410 L
+3541 460 mt 3541 510 L
+3518 5650 mt
+( ) s
+3645 5460 mt 3645 5410 L
+3645 460 mt 3645 510 L
+3622 5650 mt
+( ) s
+3729 5460 mt 3729 5410 L
+3729 460 mt 3729 510 L
+3706 5650 mt
+( ) s
+3801 5460 mt 3801 5410 L
+3801 460 mt 3801 510 L
+3778 5650 mt
+( ) s
+3863 5460 mt 3863 5410 L
+3863 460 mt 3863 510 L
+3840 5650 mt
+( ) s
+3918 5460 mt 3918 5410 L
+3918 460 mt 3918 510 L
+3895 5650 mt
+( ) s
+3967 5460 mt 3967 5410 L
+3967 460 mt 3967 510 L
+3781 5650 mt
+(1000) s
+4289 5460 mt 4289 5410 L
+4289 460 mt 4289 510 L
+4266 5650 mt
+( ) s
+4478 5460 mt 4478 5410 L
+4478 460 mt 4478 510 L
+4455 5650 mt
+( ) s
+ 647 5321 mt 697 5321 L
+4503 5321 mt 4453 5321 L
+ 379 5383 mt
+(0.1) s
+ 647 4903 mt 697 4903 L
+4503 4903 mt 4453 4903 L
+ 566 4965 mt
+( ) s
+ 647 4658 mt 697 4658 L
+4503 4658 mt 4453 4658 L
+ 566 4720 mt
+( ) s
+ 647 4484 mt 697 4484 L
+4503 4484 mt 4453 4484 L
+ 566 4546 mt
+( ) s
+ 647 4350 mt 697 4350 L
+4503 4350 mt 4453 4350 L
+ 566 4412 mt
+( ) s
+ 647 4240 mt 697 4240 L
+4503 4240 mt 4453 4240 L
+ 566 4302 mt
+( ) s
+ 647 4147 mt 697 4147 L
+4503 4147 mt 4453 4147 L
+ 566 4209 mt
+( ) s
+ 647 4066 mt 697 4066 L
+4503 4066 mt 4453 4066 L
+ 566 4128 mt
+( ) s
+ 647 3995 mt 697 3995 L
+4503 3995 mt 4453 3995 L
+ 566 4057 mt
+( ) s
+ 647 3932 mt 697 3932 L
+4503 3932 mt 4453 3932 L
+ 519 3994 mt
+(1) s
+ 647 3514 mt 697 3514 L
+4503 3514 mt 4453 3514 L
+ 566 3576 mt
+( ) s
+ 647 3269 mt 697 3269 L
+4503 3269 mt 4453 3269 L
+ 566 3331 mt
+( ) s
+ 647 3096 mt 697 3096 L
+4503 3096 mt 4453 3096 L
+ 566 3158 mt
+( ) s
+ 647 2961 mt 697 2961 L
+4503 2961 mt 4453 2961 L
+ 566 3023 mt
+( ) s
+ 647 2851 mt 697 2851 L
+4503 2851 mt 4453 2851 L
+ 566 2913 mt
+( ) s
+ 647 2758 mt 697 2758 L
+4503 2758 mt 4453 2758 L
+ 566 2820 mt
+( ) s
+ 647 2677 mt 697 2677 L
+4503 2677 mt 4453 2677 L
+ 566 2739 mt
+( ) s
+ 647 2606 mt 697 2606 L
+4503 2606 mt 4453 2606 L
+ 566 2668 mt
+( ) s
+ 647 2543 mt 697 2543 L
+4503 2543 mt 4453 2543 L
+ 426 2605 mt
+(10) s
+ 647 2125 mt 697 2125 L
+4503 2125 mt 4453 2125 L
+ 566 2187 mt
+( ) s
+ 647 1880 mt 697 1880 L
+4503 1880 mt 4453 1880 L
+ 566 1942 mt
+( ) s
+ 647 1707 mt 697 1707 L
+4503 1707 mt 4453 1707 L
+ 566 1769 mt
+( ) s
+ 647 1572 mt 697 1572 L
+4503 1572 mt 4453 1572 L
+ 566 1634 mt
+( ) s
+ 647 1462 mt 697 1462 L
+4503 1462 mt 4453 1462 L
+ 566 1524 mt
+( ) s
+ 647 1369 mt 697 1369 L
+4503 1369 mt 4453 1369 L
+ 566 1431 mt
+( ) s
+ 647 1289 mt 697 1289 L
+4503 1289 mt 4453 1289 L
+ 566 1351 mt
+( ) s
+ 647 1217 mt 697 1217 L
+4503 1217 mt 4453 1217 L
+ 566 1279 mt
+( ) s
+ 647 1154 mt 697 1154 L
+4503 1154 mt 4453 1154 L
+ 332 1216 mt
+(100) s
+ 647 736 mt 697 736 L
+4503 736 mt 4453 736 L
+ 566 798 mt
+( ) s
+ 647 491 mt 697 491 L
+4503 491 mt 4453 491 L
+ 566 553 mt
+( ) s
+ 647 5460 mt 4503 5460 L
+ 647 460 mt 4503 460 L
+ 647 5460 mt 647 460 L
+4503 5460 mt 4503 460 L
+gs 647 460 3857 5001 rc
+18 w
+0.7 sg
+265 -2 266 -1 265 -1 265 -1 265 -1 266 -1 265 -1 265 -1
+266 -2 265 -1 265 -1 265 -1 266 -3 886 3932 14 MP stroke
+6 w
+gr
+
+0.7 sg
+0 sg
+ 850 3932 mt 922 3932 L
+ 886 3896 mt 886 3968 L
+1116 3929 mt 1188 3929 L
+1152 3893 mt 1152 3965 L
+1381 3928 mt 1453 3928 L
+1417 3892 mt 1417 3964 L
+1646 3927 mt 1718 3927 L
+1682 3891 mt 1682 3963 L
+1911 3926 mt 1983 3926 L
+1947 3890 mt 1947 3962 L
+2177 3924 mt 2249 3924 L
+2213 3888 mt 2213 3960 L
+2442 3923 mt 2514 3923 L
+2478 3887 mt 2478 3959 L
+2707 3922 mt 2779 3922 L
+2743 3886 mt 2743 3958 L
+2973 3922 mt 3045 3922 L
+3009 3886 mt 3009 3958 L
+3238 3919 mt 3310 3919 L
+3274 3883 mt 3274 3955 L
+3503 3919 mt 3575 3919 L
+3539 3883 mt 3539 3955 L
+3768 3919 mt 3840 3919 L
+3804 3883 mt 3804 3955 L
+4034 3917 mt 4106 3917 L
+4070 3881 mt 4070 3953 L
+4299 3918 mt 4371 3918 L
+4335 3882 mt 4335 3954 L
+ 861 3907 mt 911 3957 L
+ 911 3907 mt 861 3957 L
+1127 3904 mt 1177 3954 L
+1177 3904 mt 1127 3954 L
+1392 3903 mt 1442 3953 L
+1442 3903 mt 1392 3953 L
+1657 3902 mt 1707 3952 L
+1707 3902 mt 1657 3952 L
+1922 3901 mt 1972 3951 L
+1972 3901 mt 1922 3951 L
+2188 3899 mt 2238 3949 L
+2238 3899 mt 2188 3949 L
+2453 3898 mt 2503 3948 L
+2503 3898 mt 2453 3948 L
+2718 3897 mt 2768 3947 L
+2768 3897 mt 2718 3947 L
+2984 3897 mt 3034 3947 L
+3034 3897 mt 2984 3947 L
+3249 3894 mt 3299 3944 L
+3299 3894 mt 3249 3944 L
+3514 3894 mt 3564 3944 L
+3564 3894 mt 3514 3944 L
+3779 3894 mt 3829 3944 L
+3829 3894 mt 3779 3944 L
+4045 3892 mt 4095 3942 L
+4095 3892 mt 4045 3942 L
+4310 3893 mt 4360 3943 L
+4360 3893 mt 4310 3943 L
+gs 647 460 3857 5001 rc
+gr
+
+ 36 36 886 3932 FO
+ 36 36 1152 3929 FO
+ 36 36 1417 3928 FO
+ 36 36 1682 3927 FO
+ 36 36 1947 3926 FO
+ 36 36 2213 3924 FO
+ 36 36 2478 3923 FO
+ 36 36 2743 3922 FO
+ 36 36 3009 3921 FO
+ 36 36 3274 3919 FO
+ 36 36 3539 3919 FO
+ 36 36 3804 3918 FO
+ 36 36 4070 3916 FO
+ 36 36 4335 3920 FO
+gs 647 460 3857 5001 rc
+18 w
+0.7 sg
+265 -9 266 -9 265 -9 265 -10 265 -9 266 -10 265 -10 265 -10
+266 -11 265 -11 265 -10 265 -14 266 -23 886 3932 14 MP stroke
+DA
+265 -32 266 -32 265 -36 265 -37 265 -40 266 -43 265 -47 265 -50
+266 -56 265 -61 265 -57 265 -84 266 -129 886 5241 14 MP stroke
+SO
+6 w
+gr
+
+0.7 sg
+0 sg
+ 850 3932 mt 922 3932 L
+ 886 3896 mt 886 3968 L
+1116 3909 mt 1188 3909 L
+1152 3873 mt 1152 3945 L
+1381 3895 mt 1453 3895 L
+1417 3859 mt 1417 3931 L
+1646 3885 mt 1718 3885 L
+1682 3849 mt 1682 3921 L
+1911 3874 mt 1983 3874 L
+1947 3838 mt 1947 3910 L
+2177 3863 mt 2249 3863 L
+2213 3827 mt 2213 3899 L
+2442 3854 mt 2514 3854 L
+2478 3818 mt 2478 3890 L
+2707 3843 mt 2779 3843 L
+2743 3807 mt 2743 3879 L
+2973 3833 mt 3045 3833 L
+3009 3797 mt 3009 3869 L
+3238 3822 mt 3310 3822 L
+3274 3786 mt 3274 3858 L
+3503 3816 mt 3575 3816 L
+3539 3780 mt 3539 3852 L
+3768 3806 mt 3840 3806 L
+3804 3770 mt 3804 3842 L
+4034 3802 mt 4106 3802 L
+4070 3766 mt 4070 3838 L
+4299 3782 mt 4371 3782 L
+4335 3746 mt 4335 3818 L
+ 861 3907 mt 911 3957 L
+ 911 3907 mt 861 3957 L
+1127 3884 mt 1177 3934 L
+1177 3884 mt 1127 3934 L
+1392 3870 mt 1442 3920 L
+1442 3870 mt 1392 3920 L
+1657 3860 mt 1707 3910 L
+1707 3860 mt 1657 3910 L
+1922 3849 mt 1972 3899 L
+1972 3849 mt 1922 3899 L
+2188 3838 mt 2238 3888 L
+2238 3838 mt 2188 3888 L
+2453 3829 mt 2503 3879 L
+2503 3829 mt 2453 3879 L
+2718 3818 mt 2768 3868 L
+2768 3818 mt 2718 3868 L
+2984 3808 mt 3034 3858 L
+3034 3808 mt 2984 3858 L
+3249 3797 mt 3299 3847 L
+3299 3797 mt 3249 3847 L
+3514 3791 mt 3564 3841 L
+3564 3791 mt 3514 3841 L
+3779 3781 mt 3829 3831 L
+3829 3781 mt 3779 3831 L
+4045 3777 mt 4095 3827 L
+4095 3777 mt 4045 3827 L
+4310 3757 mt 4360 3807 L
+4360 3757 mt 4310 3807 L
+gs 647 460 3857 5001 rc
+gr
+
+ 36 36 886 3932 FO
+ 36 36 1152 3910 FO
+ 36 36 1417 3895 FO
+ 36 36 1682 3885 FO
+ 36 36 1947 3872 FO
+ 36 36 2213 3860 FO
+ 36 36 2478 3848 FO
+ 36 36 2743 3834 FO
+ 36 36 3009 3821 FO
+ 36 36 3274 3809 FO
+ 36 36 3539 3798 FO
+ 36 36 3804 3788 FO
+ 36 36 4070 3771 FO
+ 36 36 4335 3742 FO
+gs 647 460 3857 5001 rc
+18 w
+0.7 sg
+265 -33 266 -34 265 -36 265 -38 265 -42 266 -44 265 -49 265 -52
+266 -60 265 -66 265 -62 265 -96 266 -158 886 3932 14 MP stroke
+DA
+265 -40 266 -43 265 -46 265 -50 265 -55 266 -60 265 -67 265 -75
+266 -87 265 -100 265 -96 265 -148 266 -242 886 4398 14 MP stroke
+SO
+6 w
+gr
+
+0.7 sg
+0 sg
+ 850 3932 mt 922 3932 L
+ 886 3896 mt 886 3968 L
+1116 3774 mt 1188 3774 L
+1152 3738 mt 1152 3810 L
+1381 3678 mt 1453 3678 L
+1417 3642 mt 1417 3714 L
+1646 3616 mt 1718 3616 L
+1682 3580 mt 1682 3652 L
+1911 3549 mt 1983 3549 L
+1947 3513 mt 1947 3585 L
+2177 3490 mt 2249 3490 L
+2213 3454 mt 2213 3526 L
+2442 3438 mt 2514 3438 L
+2478 3402 mt 2478 3474 L
+2707 3390 mt 2779 3390 L
+2743 3354 mt 2743 3426 L
+2973 3344 mt 3045 3344 L
+3009 3308 mt 3009 3380 L
+3238 3303 mt 3310 3303 L
+3274 3267 mt 3274 3339 L
+3503 3267 mt 3575 3267 L
+3539 3231 mt 3539 3303 L
+3768 3228 mt 3840 3228 L
+3804 3192 mt 3804 3264 L
+4034 3193 mt 4106 3193 L
+4070 3157 mt 4070 3229 L
+4299 3160 mt 4371 3160 L
+4335 3124 mt 4335 3196 L
+ 861 3907 mt 911 3957 L
+ 911 3907 mt 861 3957 L
+1127 3749 mt 1177 3799 L
+1177 3749 mt 1127 3799 L
+1392 3653 mt 1442 3703 L
+1442 3653 mt 1392 3703 L
+1657 3591 mt 1707 3641 L
+1707 3591 mt 1657 3641 L
+1922 3524 mt 1972 3574 L
+1972 3524 mt 1922 3574 L
+2188 3465 mt 2238 3515 L
+2238 3465 mt 2188 3515 L
+2453 3413 mt 2503 3463 L
+2503 3413 mt 2453 3463 L
+2718 3365 mt 2768 3415 L
+2768 3365 mt 2718 3415 L
+2984 3319 mt 3034 3369 L
+3034 3319 mt 2984 3369 L
+3249 3278 mt 3299 3328 L
+3299 3278 mt 3249 3328 L
+3514 3242 mt 3564 3292 L
+3564 3242 mt 3514 3292 L
+3779 3203 mt 3829 3253 L
+3829 3203 mt 3779 3253 L
+4045 3168 mt 4095 3218 L
+4095 3168 mt 4045 3218 L
+4310 3135 mt 4360 3185 L
+4360 3135 mt 4310 3185 L
+gs 647 460 3857 5001 rc
+gr
+
+ 36 36 886 3932 FO
+ 36 36 1152 3825 FO
+ 36 36 1417 3737 FO
+ 36 36 1682 3663 FO
+ 36 36 1947 3567 FO
+ 36 36 2213 3455 FO
+ 36 36 2478 3330 FO
+ 36 36 2743 3183 FO
+ 36 36 3009 3003 FO
+ 36 36 3274 2790 FO
+ 36 36 3539 2539 FO
+ 36 36 3804 2234 FO
+ 36 36 4070 1938 FO
+ 36 36 4335 1575 FO
+gs 647 460 3857 5001 rc
+18 w
+0.7 sg
+265 -54 266 -58 265 -65 265 -72 265 -82 266 -95 265 -109 265 -127
+266 -155 265 -182 265 -180 265 -268 266 -415 886 3932 14 MP stroke
+DA
+265 -56 266 -59 265 -68 265 -74 265 -85 266 -99 265 -113 265 -133
+266 -162 265 -191 265 -187 265 -277 266 -425 886 4013 14 MP stroke
+SO
+6 w
+gr
+
+0.7 sg
+0 sg
+ 850 3932 mt 922 3932 L
+ 886 3896 mt 886 3968 L
+1116 3517 mt 1188 3517 L
+1152 3481 mt 1152 3553 L
+1381 3249 mt 1453 3249 L
+1417 3213 mt 1417 3285 L
+1646 3069 mt 1718 3069 L
+1682 3033 mt 1682 3105 L
+1911 2887 mt 1983 2887 L
+1947 2851 mt 1947 2923 L
+2177 2732 mt 2249 2732 L
+2213 2696 mt 2213 2768 L
+2442 2605 mt 2514 2605 L
+2478 2569 mt 2478 2641 L
+2707 2496 mt 2779 2496 L
+2743 2460 mt 2743 2532 L
+2973 2401 mt 3045 2401 L
+3009 2365 mt 3009 2437 L
+3238 2319 mt 3310 2319 L
+3274 2283 mt 3274 2355 L
+3503 2247 mt 3575 2247 L
+3539 2211 mt 3539 2283 L
+3768 2181 mt 3840 2181 L
+3804 2145 mt 3804 2217 L
+4034 2124 mt 4106 2124 L
+4070 2088 mt 4070 2160 L
+4299 2069 mt 4371 2069 L
+4335 2033 mt 4335 2105 L
+ 861 3907 mt 911 3957 L
+ 911 3907 mt 861 3957 L
+1127 3492 mt 1177 3542 L
+1177 3492 mt 1127 3542 L
+1392 3224 mt 1442 3274 L
+1442 3224 mt 1392 3274 L
+1657 3044 mt 1707 3094 L
+1707 3044 mt 1657 3094 L
+1922 2862 mt 1972 2912 L
+1972 2862 mt 1922 2912 L
+2188 2707 mt 2238 2757 L
+2238 2707 mt 2188 2757 L
+2453 2580 mt 2503 2630 L
+2503 2580 mt 2453 2630 L
+2718 2471 mt 2768 2521 L
+2768 2471 mt 2718 2521 L
+2984 2376 mt 3034 2426 L
+3034 2376 mt 2984 2426 L
+3249 2294 mt 3299 2344 L
+3299 2294 mt 3249 2344 L
+3514 2222 mt 3564 2272 L
+3564 2222 mt 3514 2272 L
+3779 2156 mt 3829 2206 L
+3829 2156 mt 3779 2206 L
+4045 2099 mt 4095 2149 L
+4095 2099 mt 4045 2149 L
+4310 2044 mt 4360 2094 L
+4360 2044 mt 4310 2094 L
+gs 647 460 3857 5001 rc
+gr
+
+ 36 36 886 3932 FO
+ 36 36 1152 3713 FO
+ 36 36 1417 3510 FO
+ 36 36 1682 3318 FO
+ 36 36 1947 3048 FO
+ 36 36 2213 2733 FO
+ 36 36 2478 2401 FO
+ 36 36 2743 2061 FO
+ 36 36 3009 1720 FO
+ 36 36 3274 1380 FO
+ 36 36 3539 1045 FO
+ 36 36 3804 746 FO
+gs 647 460 3857 5001 rc
+gr
+
+ 240 4103 mt -90 rotate
+(Mean number of lexical entries) s
+90 rotate
+1812 5794 mt
+(Word frequency \(n) s
+%%IncludeResource: font Helvetica
+/Helvetica /ISOLatin1Encoding 132 FMSR
+
+3188 5878 mt
+(w) s
+%%IncludeResource: font Helvetica
+/Helvetica /ISOLatin1Encoding 168 FMSR
+
+3283 5794 mt
+(\)) s
+%%IncludeResource: font Helvetica
+/Helvetica /ISOLatin1Encoding 120 FMSR
+
+ 630 5503 mt
+( ) s
+4487 502 mt
+( ) s
+1 sg
+0 846 2267 0 0 -846 707 1366 4 MP
+PP
+-2267 0 0 846 2267 0 0 -846 707 1366 5 MP stroke
+4 w
+DO
+SO
+6 w
+0 sg
+ 707 1366 mt 2974 1366 L
+ 707 520 mt 2974 520 L
+ 707 1366 mt 707 520 L
+2974 1366 mt 2974 520 L
+ 707 1366 mt 2974 1366 L
+ 707 1366 mt 707 520 L
+ 707 1366 mt 2974 1366 L
+ 707 520 mt 2974 520 L
+ 707 1366 mt 707 520 L
+2974 1366 mt 2974 520 L
+%%IncludeResource: font Helvetica
+/Helvetica /ISOLatin1Encoding 168 FMSR
+
+1183 698 mt
+(Expectation) s
+gs 707 520 2268 847 rc
+18 w
+0.7 sg
+365 0 780 637 2 MP stroke
+gr
+
+18 w
+0.7 sg
+0 sg
+1183 902 mt
+(Antoniak approx.) s
+gs 707 520 2268 847 rc
+DA
+0.7 sg
+365 0 780 841 2 MP stroke
+SO
+gr
+
+0.7 sg
+0 sg
+1183 1105 mt
+(Empirical, fixed base) s
+gs 707 520 2268 847 rc
+6 w
+gs 889 971 147 147 rc
+ 926 1044 mt 998 1044 L
+ 962 1008 mt 962 1080 L
+ 937 1019 mt 987 1069 L
+ 987 1019 mt 937 1069 L
+gr
+
+gr
+
+6 w
+1183 1309 mt
+(Empirical, inferred base) s
+gs 707 520 2268 847 rc
+gs 889 1175 147 147 rc
+ 36 36 962 1248 FO
+gr
+
+gr
+
+
+end %%Color Dict
+
+eplot
+%%EndObject
+
+epage
+end
+
+showpage
+
+%%Trailer
+%%EOF
diff --git a/report/pyp_clustering/acl09-short/code/cohnpred.m b/report/pyp_clustering/acl09-short/code/cohnpred.m
new file mode 100644
index 00000000..35a49605
--- /dev/null
+++ b/report/pyp_clustering/acl09-short/code/cohnpred.m
@@ -0,0 +1,12 @@
+function output = cohnpred(input,b)
+% COHNPRED  Digamma-form prediction a*(psi(a+n) - psi(a)) for every count n in INPUT.
+%   a = b*p0 with the hard-coded p0 = 1/30114 (presumably a uniform base
+%   probability over 30114 types -- confirm against the caller).
+%   INPUT must hold positive integers; OUTPUT(k) is the value for INPUT(k).
+p0 = 1/30114;
+a = b*p0;
+lookup = zeros(max(input),1); % lookup(n) = prediction for count n
+for n = reshape(unique(input),1,[]) % visit each distinct count once
+ lookup(n) = a*(psi(a+n) - psi(a));
+end
+output = lookup(input);
diff --git a/report/pyp_clustering/acl09-short/code/cokus.c b/report/pyp_clustering/acl09-short/code/cokus.c
new file mode 100644
index 00000000..3a959c0f
--- /dev/null
+++ b/report/pyp_clustering/acl09-short/code/cokus.c
@@ -0,0 +1,167 @@
+// This is the ``Mersenne Twister'' random number generator MT19937, which
+// generates pseudorandom integers uniformly distributed in 0..(2^32 - 1)
+// starting from any odd seed in 0..(2^32 - 1). This version is a recode
+// by Shawn Cokus (Cokus@math.washington.edu) on March 8, 1998 of a version by
+// Takuji Nishimura (who had suggestions from Topher Cooper and Marc Rieffel in
+// July-August 1997).
+//
+// Effectiveness of the recoding (on Goedel2.math.washington.edu, a DEC Alpha
+// running OSF/1) using GCC -O3 as a compiler: before recoding: 51.6 sec. to
+// generate 300 million random numbers; after recoding: 24.0 sec. for the same
+// (i.e., 46.5% of original time), so speed is now about 12.5 million random
+// number generations per second on this machine.
+//
+// According to the URL <http://www.math.keio.ac.jp/~matumoto/emt.html>
+// (and paraphrasing a bit in places), the Mersenne Twister is ``designed
+// with consideration of the flaws of various existing generators,'' has
+// a period of 2^19937 - 1, gives a sequence that is 623-dimensionally
+// equidistributed, and ``has passed many stringent tests, including the
+// die-hard test of G. Marsaglia and the load test of P. Hellekalek and
+// S. Wegenkittl.'' It is efficient in memory usage (typically using 2506
+// to 5012 bytes of static data, depending on data type sizes, and the code
+// is quite short as well). It generates random numbers in batches of 624
+// at a time, so the caching and pipelining of modern systems is exploited.
+// It is also divide- and mod-free.
+//
+// This library is free software; you can redistribute it and/or modify it
+// under the terms of the GNU Library General Public License as published by
+// the Free Software Foundation (either version 2 of the License or, at your
+// option, any later version). This library is distributed in the hope that
+// it will be useful, but WITHOUT ANY WARRANTY, without even the implied
+// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+// the GNU Library General Public License for more details. You should have
+// received a copy of the GNU Library General Public License along with this
+// library; if not, write to the Free Software Foundation, Inc., 59 Temple
+// Place, Suite 330, Boston, MA 02111-1307, USA.
+//
+// The code as Shawn received it included the following notice:
+//
+// Copyright (C) 1997 Makoto Matsumoto and Takuji Nishimura. When
+// you use this, send an e-mail to <matumoto@math.keio.ac.jp> with
+// an appropriate reference to your work.
+//
+// It would be nice to CC: <Cokus@math.washington.edu> when you write.
+//
+
+#include <stdio.h>
+#include <stdlib.h>
+
+//
+// uint32 must be an unsigned integer type capable of holding at least 32
+// bits; exactly 32 should be fastest, but 64 is better on an Alpha with
+// GCC at -O3 optimization so try your options and see what's best for you
+//
+
+typedef unsigned long uint32;
+
+#define N (624) // length of state vector
+#define M (397) // a period parameter
+#define K (0x9908B0DFU) // a magic constant
+#define hiBit(u) ((u) & 0x80000000U) // mask all but highest bit of u
+#define loBit(u) ((u) & 0x00000001U) // mask all but lowest bit of u
+#define loBits(u) ((u) & 0x7FFFFFFFU) // mask the highest bit of u
+#define mixBits(u, v) (hiBit(u)|loBits(v)) // move hi bit of u to hi bit of v
+
+static uint32 state[N+1]; // state vector + 1 extra to not violate ANSI C
+static uint32 *next; // next random value is computed from here
+static int left = -1; // can *next++ this many times before reloading
+
+
+void seedMT(uint32 seed)
+ {
+ //
+ // We initialize state[0..(N-1)] via the generator
+ //
+ // x_new = (69069 * x_old) mod 2^32
+ //
+ // from Line 15 of Table 1, p. 106, Sec. 3.3.4 of Knuth's
+ // _The Art of Computer Programming_, Volume 2, 3rd ed.
+ //
+ // Notes (SJC): I do not know what the initial state requirements
+ // of the Mersenne Twister are, but it seems this seeding generator
+ // could be better. It achieves the maximum period for its modulus
+ // (2^30) iff x_initial is odd (p. 20-21, Sec. 3.2.1.2, Knuth); if
+ // x_initial can be even, you have sequences like 0, 0, 0, ...;
+ // 2^31, 2^31, 2^31, ...; 2^30, 2^30, 2^30, ...; 2^29, 2^29 + 2^31,
+ // 2^29, 2^29 + 2^31, ..., etc. so I force seed to be odd below.
+ //
+ // Even if x_initial is odd, if x_initial is 1 mod 4 then
+ //
+ // the lowest bit of x is always 1,
+ // the next-to-lowest bit of x is always 0,
+ // the 2nd-from-lowest bit of x alternates ... 0 1 0 1 0 1 0 1 ... ,
+ // the 3rd-from-lowest bit of x 4-cycles ... 0 1 1 0 0 1 1 0 ... ,
+ // the 4th-from-lowest bit of x has the 8-cycle ... 0 0 0 1 1 1 1 0 ... ,
+ // ...
+ //
+ // and if x_initial is 3 mod 4 then
+ //
+ // the lowest bit of x is always 1,
+ // the next-to-lowest bit of x is always 1,
+ // the 2nd-from-lowest bit of x alternates ... 0 1 0 1 0 1 0 1 ... ,
+ // the 3rd-from-lowest bit of x 4-cycles ... 0 0 1 1 0 0 1 1 ... ,
+ // the 4th-from-lowest bit of x has the 8-cycle ... 0 0 1 1 1 1 0 0 ... ,
+ // ...
+ //
+ // The generator's potency (min. s>=0 with (69069-1)^s = 0 mod 2^32) is
+ // 16, which seems to be alright by p. 25, Sec. 3.2.1.3 of Knuth. It
+ // also does well in the dimension 2..5 spectral tests, but it could be
+ // better in dimension 6 (Line 15, Table 1, p. 106, Sec. 3.3.4, Knuth).
+ //
+ // Note that the random number user does not see the values generated
+ // here directly since reloadMT() will always munge them first, so maybe
+ // none of all of this matters. In fact, the seed values made here could
+ // even be extra-special desirable if the Mersenne Twister theory says
+ // so-- that's why the only change I made is to restrict to odd seeds.
+ //
+
+ uint32 x = (seed | 1U) & 0xFFFFFFFFU; // odd seed, truncated to 32 bits
+ int j;
+ left = 0, state[0] = x; // left=0 makes the next randomMT() call reload
+ for(j = 1; j < N; j++) // fill state[1..N-1]: previous value times 69069, low 32 bits
+ state[j] = (x *= 69069U) & 0xFFFFFFFFU;
+ }
+
+
+uint32 reloadMT(void) // regenerates all N state words and returns the first tempered output
+ {
+ register uint32 *p0=state, *p2=state+2, *pM=state+M, s0, s1;
+ register int j;
+ // left < -1 only happens when randomMT() ran before any seedMT() call: seed with a default
+ if(left < -1)
+ seedMT(4357U);
+ // state[0] is returned below, so N-1 values remain and next starts at state[1]
+ left=N-1, next=state+1;
+ // words 0..N-M-1: each is rebuilt from state[i], state[i+1] and state[i+M]
+ for(s0=state[0], s1=state[1], j=N-M+1; --j; s0=s1, s1=*p2++)
+ *p0++ = *pM++ ^ (mixBits(s0, s1) >> 1) ^ (loBit(s1) ? K : 0U);
+ // words N-M..N-2: same recurrence, with pM wrapped around to state[0..M-2]
+ for(pM=state, j=M; --j; s0=s1, s1=*p2++)
+ *p0++ = *pM++ ^ (mixBits(s0, s1) >> 1) ^ (loBit(s1) ? K : 0U);
+ // last word pairs with the already-regenerated state[0]; state[0] is then tempered and returned
+ s1=state[0], *p0 = *pM ^ (mixBits(s0, s1) >> 1) ^ (loBit(s1) ? K : 0U);
+ s1 ^= (s1 >> 11);
+ s1 ^= (s1 << 7) & 0x9D2C5680U;
+ s1 ^= (s1 << 15) & 0xEFC60000U;
+ return(s1 ^ (s1 >> 18));
+ }
+
+
+uint32 randomMT(void) // returns the next pseudorandom value in 0..(2^32 - 1)
+ {
+ uint32 y;
+ // Was a bare `inline`: C99 and later emit no external definition for that, so a non-inlined call fails to link.
+ if(--left < 0)
+ return(reloadMT()); // batch exhausted (or never seeded): regenerate, get first value
+ // otherwise take the next raw state word and temper it
+ y = *next++;
+ y ^= (y >> 11);
+ y ^= (y << 7) & 0x9D2C5680U;
+ y ^= (y << 15) & 0xEFC60000U;
+ y ^= (y >> 18);
+ return(y);
+ }
+
+
+
+
diff --git a/report/pyp_clustering/acl09-short/code/crppred.m b/report/pyp_clustering/acl09-short/code/crppred.m
new file mode 100644
index 00000000..17f22652
--- /dev/null
+++ b/report/pyp_clustering/acl09-short/code/crppred.m
@@ -0,0 +1,12 @@
+function output = crppred(input,b,p0)
+% CRPPRED  Harmonic-type prediction for each count in INPUT.
+%   output = crppred(input,b) evaluates, for every count n in input,
+%
+%       a * sum_{k=1}^{n} 1/(k + a - 1),    a = b*p0,
+%
+%   with the default p0 = 1/30114.  (For a Chinese restaurant process with
+%   concentration a this sum is the expected number of tables after n
+%   customers -- presumably the intended use; confirm against callers.)
+%
+%   output = crppred(input,b,p0) uses the supplied scalar p0 instead of
+%   the default, so the function can be reused with a different base
+%   probability.
+%
+%   input  - positive integer counts (they are also used as indices)
+%   b      - scalar multiplier of p0
+%   p0     - optional scalar, default 1/30114
+%   output - prediction for each element of input (column when input is
+%            a vector)
+
+if nargin < 3
+    p0 = 1/30114;   % historical hard-coded value, kept as the default
+end
+
+uniqin = unique(input);
+prediction = zeros(max(input),1);
+
+a = b*p0;           % loop-invariant, computed once
+for i = 1:length(uniqin)
+    n = uniqin(i);
+    % evaluate once per distinct count; duplicates are filled in by the
+    % lookup below
+    prediction(n) = a*sum(1./((1:n)+a-1));
+end
+
+output = prediction(input);
+
+
diff --git a/report/pyp_clustering/acl09-short/code/crppred_geom.m b/report/pyp_clustering/acl09-short/code/crppred_geom.m
new file mode 100644
index 00000000..e6869e4f
--- /dev/null
+++ b/report/pyp_clustering/acl09-short/code/crppred_geom.m
@@ -0,0 +1,12 @@
+function output = crppred_geom(input,lengths,b)
+% CRPPRED_GEOM  Per-item harmonic-type prediction with a length-dependent
+% weight.
+%   For item k the weight is  w = b*(1/52)^lengths(k)  and
+%
+%       output(k) = w * sum_{m=1}^{input(k)} 1/(m + w - 1).
+%
+%   input   - positive integer counts, one per item
+%   lengths - exponent applied to 1/52 for each item (same indexing as input)
+%   b       - scalar multiplier
+%   output  - column vector with length(input) entries
+
+nItems = length(input);
+output = zeros(nItems,1);
+
+% one weight per item
+weights = b*((1/52).^lengths);
+
+for k = 1:nItems
+    w = weights(k);
+    terms = 1./((1:input(k))+w-1);
+    output(k) = w*sum(terms);
+end
+
+
+
+
diff --git a/report/pyp_clustering/acl09-short/code/logbinmean.m b/report/pyp_clustering/acl09-short/code/logbinmean.m
new file mode 100644
index 00000000..23dbb0ac
--- /dev/null
+++ b/report/pyp_clustering/acl09-short/code/logbinmean.m
@@ -0,0 +1,38 @@
+function [ logbinsvalid , meanval, seval ] = logbinmean( frequency, typecount, NBINS , MinCounts )
+% LOGBINMEAN  Mean and standard error of TYPECOUNT in log-spaced FREQUENCY bins.
+%   NBINS edges are placed uniformly in log10 between 1 and max(frequency),
+%   giving NBINS-1 bins.  For every bin the function computes the mean of
+%   typecount over the observations whose frequency falls in the bin and
+%   the standard error std/sqrt(count).  Bins holding MinCounts or fewer
+%   observations are dropped from all outputs.
+%
+%   frequency    - frequency of each observation
+%   typecount    - value to average, indexed like frequency
+%   NBINS        - number of bin edges
+%   MinCounts    - bins with count <= MinCounts are discarded
+%
+%   logbinsvalid - centre of each kept bin (midpoint in log10 space,
+%                  mapped back with 10^x)
+%   meanval      - mean of typecount in each kept bin
+%   seval        - standard error of that mean
+
+% bin edges, in log10 space and in linear space
+linbins = linspace( log10(1) , log10( max( frequency ) ) , NBINS );
+logbins = 10.^linbins;
+
+LL = length( linbins ) - 1;
+
+% preallocate so the outputs exist (as empty rows) even when LL == 0
+logbinsout = zeros( 1, LL );
+meanval    = zeros( 1, LL );
+seval      = zeros( 1, LL );
+rawcounts  = zeros( 1, LL );
+
+for i=1:LL
+    logbinsout( i ) = 10^( (linbins( i+1 ) + linbins( i )) / 2 );
+
+    if i < LL
+        inbin = frequency >= logbins( i ) & frequency < logbins( i+1 );
+    else
+        % The last bin is closed on the right.  Its upper edge is
+        % 10^log10(max(frequency)), so with a strict "<" the most frequent
+        % observations were kept or lost depending on rounding.
+        inbin = frequency >= logbins( i );
+    end
+
+    binvalues    = typecount( inbin );
+    rawcounts(i) = nnz( inbin );
+    meanval(i)   = mean( binvalues );
+    seval(i)     = std( binvalues )./sqrt( rawcounts(i) );
+
+end
+
+% keep only the bins with enough observations
+valid = find( rawcounts > MinCounts );
+
+logbinsvalid = logbinsout( valid );
+meanval = meanval( valid );
+seval = seval( valid );
diff --git a/report/pyp_clustering/acl09-short/code/noP0pred.m b/report/pyp_clustering/acl09-short/code/noP0pred.m
new file mode 100644
index 00000000..f72f1432
--- /dev/null
+++ b/report/pyp_clustering/acl09-short/code/noP0pred.m
@@ -0,0 +1,11 @@
+function output = noP0pred(input,b)
+% NOP0PRED  Logarithmic prediction b*log((b+n)/b) for each count n in INPUT.
+%   The declared name matches the file name (noP0pred.m), which is the
+%   name MATLAB uses to resolve calls to this function.
+%
+%   input  - positive integer counts (they are also used as indices)
+%   b      - scalar parameter
+%   output - b*log((b+n)/b) for each element n of input (column when
+%            input is a vector)
+
+uniqin = unique(input);
+prediction = zeros(max(input),1);
+
+% evaluate once per distinct count; the lookup below fills in duplicates
+prediction(uniqin) = b*log((b+uniqin)/b);
+
+output = prediction(input);
+
diff --git a/report/pyp_clustering/acl09-short/code/plot0.eps b/report/pyp_clustering/acl09-short/code/plot0.eps
new file mode 100644
index 00000000..6094346a
--- /dev/null
+++ b/report/pyp_clustering/acl09-short/code/plot0.eps
@@ -0,0 +1,633 @@
+%!PS-Adobe-3.0 EPSF-3.0
+%%Creator: MATLAB, The Mathworks, Inc. Version 7.7.0.471 (R2008b). Operating System: Linux 2.6.18-128.1.6.el5.inf.1PAE #1 SMP Wed Apr 15 10:23:41 BST 2009 i686.
+%%Title: /afs/inf.ed.ac.uk/user/s/sgwater/research/papers/2009/acl09-short/code/plot0.eps
+%%CreationDate: 07/23/2009 17:36:19
+%%DocumentNeededFonts: Helvetica
+%%DocumentProcessColors: Cyan Magenta Yellow Black
+%%LanguageLevel: 2
+%%Pages: 1
+%%BoundingBox: -44 170 641 672
+%%EndComments
+
+%%BeginProlog
+% MathWorks dictionary
+/MathWorks 160 dict begin
+% definition operators
+/bdef {bind def} bind def
+/ldef {load def} bind def
+/xdef {exch def} bdef
+/xstore {exch store} bdef
+% operator abbreviations
+/c /clip ldef
+/cc /concat ldef
+/cp /closepath ldef
+/gr /grestore ldef
+/gs /gsave ldef
+/mt /moveto ldef
+/np /newpath ldef
+/cm /currentmatrix ldef
+/sm /setmatrix ldef
+/rm /rmoveto ldef
+/rl /rlineto ldef
+/s {show newpath} bdef
+/sc {setcmykcolor} bdef
+/sr /setrgbcolor ldef
+/sg /setgray ldef
+/w /setlinewidth ldef
+/j /setlinejoin ldef
+/cap /setlinecap ldef
+/rc {rectclip} bdef
+/rf {rectfill} bdef
+% page state control
+/pgsv () def
+/bpage {/pgsv save def} bdef
+/epage {pgsv restore} bdef
+/bplot /gsave ldef
+/eplot {stroke grestore} bdef
+% orientation switch
+/portraitMode 0 def /landscapeMode 1 def /rotateMode 2 def
+% coordinate system mappings
+/dpi2point 0 def
+% font control
+/FontSize 0 def
+/FMS {/FontSize xstore findfont [FontSize 0 0 FontSize neg 0 0]
+ makefont setfont} bdef
+/reencode {exch dup where {pop load} {pop StandardEncoding} ifelse
+ exch dup 3 1 roll findfont dup length dict begin
+ { 1 index /FID ne {def}{pop pop} ifelse } forall
+ /Encoding exch def currentdict end definefont pop} bdef
+/isroman {findfont /CharStrings get /Agrave known} bdef
+/FMSR {3 1 roll 1 index dup isroman {reencode} {pop pop} ifelse
+ exch FMS} bdef
+/csm {1 dpi2point div -1 dpi2point div scale neg translate
+ dup landscapeMode eq {pop -90 rotate}
+ {rotateMode eq {90 rotate} if} ifelse} bdef
+% line types: solid, dotted, dashed, dotdash
+/SO { [] 0 setdash } bdef
+/DO { [.5 dpi2point mul 4 dpi2point mul] 0 setdash } bdef
+/DA { [6 dpi2point mul] 0 setdash } bdef
+/DD { [.5 dpi2point mul 4 dpi2point mul 6 dpi2point mul 4
+ dpi2point mul] 0 setdash } bdef
+% macros for lines and objects
+/L {lineto stroke} bdef
+/MP {3 1 roll moveto 1 sub {rlineto} repeat} bdef
+/AP {{rlineto} repeat} bdef
+/PDlw -1 def
+/W {/PDlw currentlinewidth def setlinewidth} def
+/PP {closepath eofill} bdef
+/DP {closepath stroke} bdef
+/MR {4 -2 roll moveto dup 0 exch rlineto exch 0 rlineto
+ neg 0 exch rlineto closepath} bdef
+/FR {MR stroke} bdef
+/PR {MR fill} bdef
+/L1i {{currentfile picstr readhexstring pop} image} bdef
+/tMatrix matrix def
+/MakeOval {newpath tMatrix currentmatrix pop translate scale
+0 0 1 0 360 arc tMatrix setmatrix} bdef
+/FO {MakeOval stroke} bdef
+/PO {MakeOval fill} bdef
+/PD {currentlinewidth 2 div 0 360 arc fill
+ PDlw -1 eq not {PDlw w /PDlw -1 def} if} def
+/FA {newpath tMatrix currentmatrix pop translate scale
+ 0 0 1 5 -2 roll arc tMatrix setmatrix stroke} bdef
+/PA {newpath tMatrix currentmatrix pop translate 0 0 moveto scale
+ 0 0 1 5 -2 roll arc closepath tMatrix setmatrix fill} bdef
+/FAn {newpath tMatrix currentmatrix pop translate scale
+ 0 0 1 5 -2 roll arcn tMatrix setmatrix stroke} bdef
+/PAn {newpath tMatrix currentmatrix pop translate 0 0 moveto scale
+ 0 0 1 5 -2 roll arcn closepath tMatrix setmatrix fill} bdef
+/vradius 0 def /hradius 0 def /lry 0 def
+/lrx 0 def /uly 0 def /ulx 0 def /rad 0 def
+/MRR {/vradius xdef /hradius xdef /lry xdef /lrx xdef /uly xdef
+ /ulx xdef newpath tMatrix currentmatrix pop ulx hradius add uly
+ vradius add translate hradius vradius scale 0 0 1 180 270 arc
+ tMatrix setmatrix lrx hradius sub uly vradius add translate
+ hradius vradius scale 0 0 1 270 360 arc tMatrix setmatrix
+ lrx hradius sub lry vradius sub translate hradius vradius scale
+ 0 0 1 0 90 arc tMatrix setmatrix ulx hradius add lry vradius sub
+ translate hradius vradius scale 0 0 1 90 180 arc tMatrix setmatrix
+ closepath} bdef
+/FRR {MRR stroke } bdef
+/PRR {MRR fill } bdef
+/MlrRR {/lry xdef /lrx xdef /uly xdef /ulx xdef /rad lry uly sub 2 div def
+ newpath tMatrix currentmatrix pop ulx rad add uly rad add translate
+ rad rad scale 0 0 1 90 270 arc tMatrix setmatrix lrx rad sub lry rad
+ sub translate rad rad scale 0 0 1 270 90 arc tMatrix setmatrix
+ closepath} bdef
+/FlrRR {MlrRR stroke } bdef
+/PlrRR {MlrRR fill } bdef
+/MtbRR {/lry xdef /lrx xdef /uly xdef /ulx xdef /rad lrx ulx sub 2 div def
+ newpath tMatrix currentmatrix pop ulx rad add uly rad add translate
+ rad rad scale 0 0 1 180 360 arc tMatrix setmatrix lrx rad sub lry rad
+ sub translate rad rad scale 0 0 1 0 180 arc tMatrix setmatrix
+ closepath} bdef
+/FtbRR {MtbRR stroke } bdef
+/PtbRR {MtbRR fill } bdef
+/stri 6 array def /dtri 6 array def
+/smat 6 array def /dmat 6 array def
+/tmat1 6 array def /tmat2 6 array def /dif 3 array def
+/asub {/ind2 exch def /ind1 exch def dup dup
+ ind1 get exch ind2 get sub exch } bdef
+/tri_to_matrix {
+ 2 0 asub 3 1 asub 4 0 asub 5 1 asub
+ dup 0 get exch 1 get 7 -1 roll astore } bdef
+/compute_transform {
+ dmat dtri tri_to_matrix tmat1 invertmatrix
+ smat stri tri_to_matrix tmat2 concatmatrix } bdef
+/ds {stri astore pop} bdef
+/dt {dtri astore pop} bdef
+/db {2 copy /cols xdef /rows xdef mul dup 3 mul string
+ currentfile
+ 3 index 0 eq {/ASCIIHexDecode filter}
+ {/ASCII85Decode filter 3 index 2 eq {/RunLengthDecode filter} if }
+ ifelse exch readstring pop
+ dup 0 3 index getinterval /rbmap xdef
+ dup 2 index dup getinterval /gbmap xdef
+ 1 index dup 2 mul exch getinterval /bbmap xdef pop pop}bdef
+/it {gs np dtri aload pop moveto lineto lineto cp c
+ cols rows 8 compute_transform
+ rbmap gbmap bbmap true 3 colorimage gr}bdef
+/il {newpath moveto lineto stroke}bdef
+currentdict end def
+%%EndProlog
+
+%%BeginSetup
+MathWorks begin
+
+0 cap
+
+end
+%%EndSetup
+
+%%Page: 1 1
+%%BeginPageSetup
+%%PageBoundingBox: -44 170 641 672
+MathWorks begin
+bpage
+%%EndPageSetup
+
+%%BeginObject: obj1
+bplot
+
+/dpi2point 12 def
+portraitMode -0528 8064 csm
+
+ 0 0 8231 6023 rc
+88 dict begin %Colortable dictionary
+/c0 { 0.000000 0.000000 0.000000 sr} bdef
+/c1 { 1.000000 1.000000 1.000000 sr} bdef
+/c2 { 0.900000 0.000000 0.000000 sr} bdef
+/c3 { 0.000000 0.820000 0.000000 sr} bdef
+/c4 { 0.000000 0.000000 0.800000 sr} bdef
+/c5 { 0.910000 0.820000 0.320000 sr} bdef
+/c6 { 1.000000 0.260000 0.820000 sr} bdef
+/c7 { 0.000000 0.820000 0.820000 sr} bdef
+c0
+1 j
+1 sg
+ 0 0 8232 6024 rf
+6 w
+0 4908 6379 0 0 -4908 1070 5360 4 MP
+PP
+-6379 0 0 4908 6379 0 0 -4908 1070 5360 5 MP stroke
+4 w
+DO
+SO
+6 w
+0 sg
+1070 5360 mt 7449 5360 L
+1070 452 mt 7449 452 L
+1070 5360 mt 1070 452 L
+7449 5360 mt 7449 452 L
+1070 5360 mt 7449 5360 L
+1070 5360 mt 1070 452 L
+1247 5360 mt 1247 5296 L
+1247 452 mt 1247 515 L
+%%IncludeResource: font Helvetica
+/Helvetica /ISOLatin1Encoding 192 FMSR
+
+1194 5572 mt
+(1) s
+1780 5360 mt 1780 5296 L
+1780 452 mt 1780 515 L
+1754 5572 mt
+( ) s
+2092 5360 mt 2092 5296 L
+2092 452 mt 2092 515 L
+2066 5572 mt
+( ) s
+2314 5360 mt 2314 5296 L
+2314 452 mt 2314 515 L
+2288 5572 mt
+( ) s
+2485 5360 mt 2485 5296 L
+2485 452 mt 2485 515 L
+2459 5572 mt
+( ) s
+2626 5360 mt 2626 5296 L
+2626 452 mt 2626 515 L
+2600 5572 mt
+( ) s
+2744 5360 mt 2744 5296 L
+2744 452 mt 2744 515 L
+2718 5572 mt
+( ) s
+2847 5360 mt 2847 5296 L
+2847 452 mt 2847 515 L
+2821 5572 mt
+( ) s
+2938 5360 mt 2938 5296 L
+2938 452 mt 2938 515 L
+2912 5572 mt
+( ) s
+3019 5360 mt 3019 5296 L
+3019 452 mt 3019 515 L
+2913 5572 mt
+(10) s
+3552 5360 mt 3552 5296 L
+3552 452 mt 3552 515 L
+3526 5572 mt
+( ) s
+3864 5360 mt 3864 5296 L
+3864 452 mt 3864 515 L
+3838 5572 mt
+( ) s
+4085 5360 mt 4085 5296 L
+4085 452 mt 4085 515 L
+4059 5572 mt
+( ) s
+4257 5360 mt 4257 5296 L
+4257 452 mt 4257 515 L
+4231 5572 mt
+( ) s
+4397 5360 mt 4397 5296 L
+4397 452 mt 4397 515 L
+4371 5572 mt
+( ) s
+4516 5360 mt 4516 5296 L
+4516 452 mt 4516 515 L
+4490 5572 mt
+( ) s
+4619 5360 mt 4619 5296 L
+4619 452 mt 4619 515 L
+4593 5572 mt
+( ) s
+4710 5360 mt 4710 5296 L
+4710 452 mt 4710 515 L
+4684 5572 mt
+( ) s
+4791 5360 mt 4791 5296 L
+4791 452 mt 4791 515 L
+4631 5572 mt
+(100) s
+5324 5360 mt 5324 5296 L
+5324 452 mt 5324 515 L
+5298 5572 mt
+( ) s
+5636 5360 mt 5636 5296 L
+5636 452 mt 5636 515 L
+5610 5572 mt
+( ) s
+5857 5360 mt 5857 5296 L
+5857 452 mt 5857 515 L
+5831 5572 mt
+( ) s
+6029 5360 mt 6029 5296 L
+6029 452 mt 6029 515 L
+6003 5572 mt
+( ) s
+6169 5360 mt 6169 5296 L
+6169 452 mt 6169 515 L
+6143 5572 mt
+( ) s
+6288 5360 mt 6288 5296 L
+6288 452 mt 6288 515 L
+6262 5572 mt
+( ) s
+6391 5360 mt 6391 5296 L
+6391 452 mt 6391 515 L
+6365 5572 mt
+( ) s
+6481 5360 mt 6481 5296 L
+6481 452 mt 6481 515 L
+6455 5572 mt
+( ) s
+6563 5360 mt 6563 5296 L
+6563 452 mt 6563 515 L
+6350 5572 mt
+(1000) s
+7096 5360 mt 7096 5296 L
+7096 452 mt 7096 515 L
+7070 5572 mt
+( ) s
+7408 5360 mt 7408 5296 L
+7408 452 mt 7408 515 L
+7382 5572 mt
+( ) s
+1070 5352 mt 1133 5352 L
+7449 5352 mt 7385 5352 L
+ 982 5423 mt
+( ) s
+1070 5233 mt 1133 5233 L
+7449 5233 mt 7385 5233 L
+ 982 5304 mt
+( ) s
+1070 5126 mt 1133 5126 L
+7449 5126 mt 7385 5126 L
+ 929 5197 mt
+(1) s
+1070 4422 mt 1133 4422 L
+7449 4422 mt 7385 4422 L
+ 982 4493 mt
+( ) s
+1070 4011 mt 1133 4011 L
+7449 4011 mt 7385 4011 L
+ 982 4082 mt
+( ) s
+1070 3719 mt 1133 3719 L
+7449 3719 mt 7385 3719 L
+ 982 3790 mt
+( ) s
+1070 3492 mt 1133 3492 L
+7449 3492 mt 7385 3492 L
+ 982 3563 mt
+( ) s
+1070 3307 mt 1133 3307 L
+7449 3307 mt 7385 3307 L
+ 982 3378 mt
+( ) s
+1070 3151 mt 1133 3151 L
+7449 3151 mt 7385 3151 L
+ 982 3222 mt
+( ) s
+1070 3015 mt 1133 3015 L
+7449 3015 mt 7385 3015 L
+ 982 3086 mt
+( ) s
+1070 2896 mt 1133 2896 L
+7449 2896 mt 7385 2896 L
+ 982 2967 mt
+( ) s
+1070 2789 mt 1133 2789 L
+7449 2789 mt 7385 2789 L
+ 822 2860 mt
+(10) s
+1070 2085 mt 1133 2085 L
+7449 2085 mt 7385 2085 L
+ 982 2156 mt
+( ) s
+1070 1674 mt 1133 1674 L
+7449 1674 mt 7385 1674 L
+ 982 1745 mt
+( ) s
+1070 1382 mt 1133 1382 L
+7449 1382 mt 7385 1382 L
+ 982 1453 mt
+( ) s
+1070 1155 mt 1133 1155 L
+7449 1155 mt 7385 1155 L
+ 982 1226 mt
+( ) s
+1070 970 mt 1133 970 L
+7449 970 mt 7385 970 L
+ 982 1041 mt
+( ) s
+1070 814 mt 1133 814 L
+7449 814 mt 7385 814 L
+ 982 885 mt
+( ) s
+1070 678 mt 1133 678 L
+7449 678 mt 7385 678 L
+ 982 749 mt
+( ) s
+1070 558 mt 1133 558 L
+7449 558 mt 7385 558 L
+ 982 629 mt
+( ) s
+1070 452 mt 1133 452 L
+7449 452 mt 7385 452 L
+ 715 523 mt
+(100) s
+1070 5360 mt 7449 5360 L
+1070 452 mt 7449 452 L
+1070 5360 mt 1070 452 L
+7449 5360 mt 7449 452 L
+gs 1070 452 6380 4909 rc
+24 w
+gr
+
+24 w
+/c8 { 0.000000 0.700000 0.500000 sr} bdef
+c8
+ 60 60 1466 5126 FO
+ 60 60 1905 4428 FO
+ 60 60 2344 3977 FO
+ 60 60 2783 3675 FO
+ 60 60 3222 3368 FO
+ 60 60 3660 3107 FO
+ 60 60 4099 2893 FO
+ 60 60 4538 2710 FO
+ 60 60 4977 2550 FO
+ 60 60 5416 2412 FO
+ 60 60 5855 2292 FO
+ 60 60 6294 2180 FO
+ 60 60 6733 2083 FO
+ 60 60 7171 1991 FO
+gs 1070 452 6380 4909 rc
+gr
+
+/c9 { 0.400000 0.400000 1.000000 sr} bdef
+c9
+ 60 60 1466 5126 FO
+ 60 60 1905 4860 FO
+ 60 60 2344 4699 FO
+ 60 60 2783 4594 FO
+ 60 60 3222 4483 FO
+ 60 60 3660 4383 FO
+ 60 60 4099 4295 FO
+ 60 60 4538 4214 FO
+ 60 60 4977 4137 FO
+ 60 60 5416 4068 FO
+ 60 60 5855 4008 FO
+ 60 60 6294 3941 FO
+ 60 60 6733 3883 FO
+ 60 60 7171 3827 FO
+gs 1070 452 6380 4909 rc
+gr
+
+/c10 { 1.000000 0.400000 0.200000 sr} bdef
+c10
+ 60 60 1466 5126 FO
+ 60 60 1905 5088 FO
+ 60 60 2344 5064 FO
+ 60 60 2783 5048 FO
+ 60 60 3222 5029 FO
+ 60 60 3660 5010 FO
+ 60 60 4099 4994 FO
+ 60 60 4538 4976 FO
+ 60 60 4977 4959 FO
+ 60 60 5416 4942 FO
+ 60 60 5855 4930 FO
+ 60 60 6294 4914 FO
+ 60 60 6733 4908 FO
+ 60 60 7171 4874 FO
+gs 1070 452 6380 4909 rc
+gr
+
+0 sg
+ 60 60 1466 5126 FO
+ 60 60 1905 5122 FO
+ 60 60 2344 5119 FO
+ 60 60 2783 5118 FO
+ 60 60 3222 5116 FO
+ 60 60 3660 5113 FO
+ 60 60 4099 5112 FO
+ 60 60 4538 5110 FO
+ 60 60 4977 5109 FO
+ 60 60 5416 5105 FO
+ 60 60 5855 5105 FO
+ 60 60 6294 5104 FO
+ 60 60 6733 5101 FO
+ 60 60 7171 5103 FO
+gs 1070 452 6380 4909 rc
+gr
+
+ 617 4557 mt -90 rotate
+(Mean number of lexical entries \(tables\)) s
+90 rotate
+3390 5724 mt
+(Word frequency \(n) s
+%%IncludeResource: font Helvetica
+/Helvetica /ISOLatin1Encoding 144 FMSR
+
+4963 5820 mt
+(w) s
+%%IncludeResource: font Helvetica
+/Helvetica /ISOLatin1Encoding 192 FMSR
+
+5066 5724 mt
+(\)) s
+%%IncludeResource: font Helvetica
+/Helvetica /ISOLatin1Encoding 120 FMSR
+
+1053 5403 mt
+( ) s
+7433 494 mt
+( ) s
+6 w
+1 sg
+0 1070 1481 0 0 -1070 1129 1582 4 MP
+PP
+-1481 0 0 1070 1481 0 0 -1070 1129 1582 5 MP stroke
+4 w
+DO
+SO
+6 w
+0 sg
+1129 1582 mt 2610 1582 L
+1129 512 mt 2610 512 L
+1129 1582 mt 1129 512 L
+2610 1582 mt 2610 512 L
+1129 1582 mt 2610 1582 L
+1129 1582 mt 1129 512 L
+1129 1582 mt 2610 1582 L
+1129 512 mt 2610 512 L
+1129 1582 mt 1129 512 L
+2610 1582 mt 2610 512 L
+%%IncludeResource: font Symbol
+/Symbol /ISOLatin1Encoding 192 FMSR
+
+1601 728 mt
+(a) s
+%%IncludeResource: font Helvetica
+/Helvetica /ISOLatin1Encoding 192 FMSR
+
+1722 728 mt
+( = 100000) s
+gs 1129 512 1482 1071 rc
+24 w
+gs 1236 513 293 293 rc
+c8
+ 60 60 1382 659 FO
+gr
+
+c8
+gr
+
+24 w
+c8
+0 sg
+%%IncludeResource: font Symbol
+/Symbol /ISOLatin1Encoding 192 FMSR
+
+1601 987 mt
+(a) s
+%%IncludeResource: font Helvetica
+/Helvetica /ISOLatin1Encoding 192 FMSR
+
+1722 987 mt
+( = 10000) s
+gs 1129 512 1482 1071 rc
+gs 1236 771 293 293 rc
+c9
+ 60 60 1382 917 FO
+gr
+
+c9
+gr
+
+c9
+0 sg
+%%IncludeResource: font Symbol
+/Symbol /ISOLatin1Encoding 192 FMSR
+
+1601 1246 mt
+(a) s
+%%IncludeResource: font Helvetica
+/Helvetica /ISOLatin1Encoding 192 FMSR
+
+1722 1246 mt
+( = 1000) s
+gs 1129 512 1482 1071 rc
+gs 1236 1030 293 293 rc
+c10
+ 60 60 1382 1176 FO
+gr
+
+c10
+gr
+
+c10
+0 sg
+%%IncludeResource: font Symbol
+/Symbol /ISOLatin1Encoding 192 FMSR
+
+1601 1505 mt
+(a) s
+%%IncludeResource: font Helvetica
+/Helvetica /ISOLatin1Encoding 192 FMSR
+
+1722 1505 mt
+( = 100) s
+gs 1129 512 1482 1071 rc
+gs 1236 1288 293 293 rc
+ 60 60 1382 1434 FO
+gr
+
+6 w
+gr
+
+6 w
+
+end %%Color Dict
+
+eplot
+%%EndObject
+
+epage
+end
+
+showpage
+
+%%Trailer
+%%EOF
diff --git a/report/pyp_clustering/acl09-short/code/plot0.pdf b/report/pyp_clustering/acl09-short/code/plot0.pdf
new file mode 100644
index 00000000..fd1b4595
--- /dev/null
+++ b/report/pyp_clustering/acl09-short/code/plot0.pdf
Binary files differ
diff --git a/report/pyp_clustering/acl09-short/code/plot1.eps b/report/pyp_clustering/acl09-short/code/plot1.eps
new file mode 100644
index 00000000..ebb2f194
--- /dev/null
+++ b/report/pyp_clustering/acl09-short/code/plot1.eps
@@ -0,0 +1,579 @@
+%!PS-Adobe-3.0 EPSF-3.0
+%%Creator: MATLAB, The Mathworks, Inc. Version 7.7.0.471 (R2008b). Operating System: Linux 2.6.18-128.1.6.el5.inf.1PAE #1 SMP Wed Apr 15 10:23:41 BST 2009 i686.
+%%Title: /afs/inf.ed.ac.uk/user/s/sgwater/research/papers/2009/acl09-short/code/plot1.eps
+%%CreationDate: 07/23/2009 17:34:27
+%%DocumentNeededFonts: Helvetica
+%%DocumentProcessColors: Cyan Magenta Yellow Black
+%%LanguageLevel: 2
+%%Pages: 1
+%%BoundingBox: -44 170 641 672
+%%EndComments
+
+%%BeginProlog
+% MathWorks dictionary
+/MathWorks 160 dict begin
+% definition operators
+/bdef {bind def} bind def
+/ldef {load def} bind def
+/xdef {exch def} bdef
+/xstore {exch store} bdef
+% operator abbreviations
+/c /clip ldef
+/cc /concat ldef
+/cp /closepath ldef
+/gr /grestore ldef
+/gs /gsave ldef
+/mt /moveto ldef
+/np /newpath ldef
+/cm /currentmatrix ldef
+/sm /setmatrix ldef
+/rm /rmoveto ldef
+/rl /rlineto ldef
+/s {show newpath} bdef
+/sc {setcmykcolor} bdef
+/sr /setrgbcolor ldef
+/sg /setgray ldef
+/w /setlinewidth ldef
+/j /setlinejoin ldef
+/cap /setlinecap ldef
+/rc {rectclip} bdef
+/rf {rectfill} bdef
+% page state control
+/pgsv () def
+/bpage {/pgsv save def} bdef
+/epage {pgsv restore} bdef
+/bplot /gsave ldef
+/eplot {stroke grestore} bdef
+% orientation switch
+/portraitMode 0 def /landscapeMode 1 def /rotateMode 2 def
+% coordinate system mappings
+/dpi2point 0 def
+% font control
+/FontSize 0 def
+/FMS {/FontSize xstore findfont [FontSize 0 0 FontSize neg 0 0]
+ makefont setfont} bdef
+/reencode {exch dup where {pop load} {pop StandardEncoding} ifelse
+ exch dup 3 1 roll findfont dup length dict begin
+ { 1 index /FID ne {def}{pop pop} ifelse } forall
+ /Encoding exch def currentdict end definefont pop} bdef
+/isroman {findfont /CharStrings get /Agrave known} bdef
+/FMSR {3 1 roll 1 index dup isroman {reencode} {pop pop} ifelse
+ exch FMS} bdef
+/csm {1 dpi2point div -1 dpi2point div scale neg translate
+ dup landscapeMode eq {pop -90 rotate}
+ {rotateMode eq {90 rotate} if} ifelse} bdef
+% line types: solid, dotted, dashed, dotdash
+/SO { [] 0 setdash } bdef
+/DO { [.5 dpi2point mul 4 dpi2point mul] 0 setdash } bdef
+/DA { [6 dpi2point mul] 0 setdash } bdef
+/DD { [.5 dpi2point mul 4 dpi2point mul 6 dpi2point mul 4
+ dpi2point mul] 0 setdash } bdef
+% macros for lines and objects
+/L {lineto stroke} bdef
+/MP {3 1 roll moveto 1 sub {rlineto} repeat} bdef
+/AP {{rlineto} repeat} bdef
+/PDlw -1 def
+/W {/PDlw currentlinewidth def setlinewidth} def
+/PP {closepath eofill} bdef
+/DP {closepath stroke} bdef
+/MR {4 -2 roll moveto dup 0 exch rlineto exch 0 rlineto
+ neg 0 exch rlineto closepath} bdef
+/FR {MR stroke} bdef
+/PR {MR fill} bdef
+/L1i {{currentfile picstr readhexstring pop} image} bdef
+/tMatrix matrix def
+/MakeOval {newpath tMatrix currentmatrix pop translate scale
+0 0 1 0 360 arc tMatrix setmatrix} bdef
+/FO {MakeOval stroke} bdef
+/PO {MakeOval fill} bdef
+/PD {currentlinewidth 2 div 0 360 arc fill
+ PDlw -1 eq not {PDlw w /PDlw -1 def} if} def
+/FA {newpath tMatrix currentmatrix pop translate scale
+ 0 0 1 5 -2 roll arc tMatrix setmatrix stroke} bdef
+/PA {newpath tMatrix currentmatrix pop translate 0 0 moveto scale
+ 0 0 1 5 -2 roll arc closepath tMatrix setmatrix fill} bdef
+/FAn {newpath tMatrix currentmatrix pop translate scale
+ 0 0 1 5 -2 roll arcn tMatrix setmatrix stroke} bdef
+/PAn {newpath tMatrix currentmatrix pop translate 0 0 moveto scale
+ 0 0 1 5 -2 roll arcn closepath tMatrix setmatrix fill} bdef
+/vradius 0 def /hradius 0 def /lry 0 def
+/lrx 0 def /uly 0 def /ulx 0 def /rad 0 def
+/MRR {/vradius xdef /hradius xdef /lry xdef /lrx xdef /uly xdef
+ /ulx xdef newpath tMatrix currentmatrix pop ulx hradius add uly
+ vradius add translate hradius vradius scale 0 0 1 180 270 arc
+ tMatrix setmatrix lrx hradius sub uly vradius add translate
+ hradius vradius scale 0 0 1 270 360 arc tMatrix setmatrix
+ lrx hradius sub lry vradius sub translate hradius vradius scale
+ 0 0 1 0 90 arc tMatrix setmatrix ulx hradius add lry vradius sub
+ translate hradius vradius scale 0 0 1 90 180 arc tMatrix setmatrix
+ closepath} bdef
+/FRR {MRR stroke } bdef
+/PRR {MRR fill } bdef
+/MlrRR {/lry xdef /lrx xdef /uly xdef /ulx xdef /rad lry uly sub 2 div def
+ newpath tMatrix currentmatrix pop ulx rad add uly rad add translate
+ rad rad scale 0 0 1 90 270 arc tMatrix setmatrix lrx rad sub lry rad
+ sub translate rad rad scale 0 0 1 270 90 arc tMatrix setmatrix
+ closepath} bdef
+/FlrRR {MlrRR stroke } bdef
+/PlrRR {MlrRR fill } bdef
+/MtbRR {/lry xdef /lrx xdef /uly xdef /ulx xdef /rad lrx ulx sub 2 div def
+ newpath tMatrix currentmatrix pop ulx rad add uly rad add translate
+ rad rad scale 0 0 1 180 360 arc tMatrix setmatrix lrx rad sub lry rad
+ sub translate rad rad scale 0 0 1 0 180 arc tMatrix setmatrix
+ closepath} bdef
+/FtbRR {MtbRR stroke } bdef
+/PtbRR {MtbRR fill } bdef
+/stri 6 array def /dtri 6 array def
+/smat 6 array def /dmat 6 array def
+/tmat1 6 array def /tmat2 6 array def /dif 3 array def
+/asub {/ind2 exch def /ind1 exch def dup dup
+ ind1 get exch ind2 get sub exch } bdef
+/tri_to_matrix {
+ 2 0 asub 3 1 asub 4 0 asub 5 1 asub
+ dup 0 get exch 1 get 7 -1 roll astore } bdef
+/compute_transform {
+ dmat dtri tri_to_matrix tmat1 invertmatrix
+ smat stri tri_to_matrix tmat2 concatmatrix } bdef
+/ds {stri astore pop} bdef
+/dt {dtri astore pop} bdef
+/db {2 copy /cols xdef /rows xdef mul dup 3 mul string
+ currentfile
+ 3 index 0 eq {/ASCIIHexDecode filter}
+ {/ASCII85Decode filter 3 index 2 eq {/RunLengthDecode filter} if }
+ ifelse exch readstring pop
+ dup 0 3 index getinterval /rbmap xdef
+ dup 2 index dup getinterval /gbmap xdef
+ 1 index dup 2 mul exch getinterval /bbmap xdef pop pop}bdef
+/it {gs np dtri aload pop moveto lineto lineto cp c
+ cols rows 8 compute_transform
+ rbmap gbmap bbmap true 3 colorimage gr}bdef
+/il {newpath moveto lineto stroke}bdef
+currentdict end def
+%%EndProlog
+
+%%BeginSetup
+MathWorks begin
+
+0 cap
+
+end
+%%EndSetup
+
+%%Page: 1 1
+%%BeginPageSetup
+%%PageBoundingBox: -44 170 641 672
+MathWorks begin
+bpage
+%%EndPageSetup
+
+%%BeginObject: obj1
+bplot
+
+/dpi2point 12 def
+portraitMode -0528 8064 csm
+
+ 0 0 8231 6023 rc
+88 dict begin %Colortable dictionary
+/c0 { 0.000000 0.000000 0.000000 sr} bdef
+/c1 { 1.000000 1.000000 1.000000 sr} bdef
+/c2 { 0.900000 0.000000 0.000000 sr} bdef
+/c3 { 0.000000 0.820000 0.000000 sr} bdef
+/c4 { 0.000000 0.000000 0.800000 sr} bdef
+/c5 { 0.910000 0.820000 0.320000 sr} bdef
+/c6 { 1.000000 0.260000 0.820000 sr} bdef
+/c7 { 0.000000 0.820000 0.820000 sr} bdef
+c0
+1 j
+1 sg
+ 0 0 8232 6024 rf
+6 w
+0 4908 6379 0 0 -4908 1070 5360 4 MP
+PP
+-6379 0 0 4908 6379 0 0 -4908 1070 5360 5 MP stroke
+4 w
+DO
+SO
+6 w
+0 sg
+1070 5360 mt 7449 5360 L
+1070 452 mt 7449 452 L
+1070 5360 mt 1070 452 L
+7449 5360 mt 7449 452 L
+1070 5360 mt 7449 5360 L
+1070 5360 mt 1070 452 L
+1247 5360 mt 1247 5296 L
+1247 452 mt 1247 515 L
+%%IncludeResource: font Helvetica
+/Helvetica /ISOLatin1Encoding 192 FMSR
+
+1194 5572 mt
+(1) s
+1780 5360 mt 1780 5296 L
+1780 452 mt 1780 515 L
+1754 5572 mt
+( ) s
+2092 5360 mt 2092 5296 L
+2092 452 mt 2092 515 L
+2066 5572 mt
+( ) s
+2314 5360 mt 2314 5296 L
+2314 452 mt 2314 515 L
+2288 5572 mt
+( ) s
+2485 5360 mt 2485 5296 L
+2485 452 mt 2485 515 L
+2459 5572 mt
+( ) s
+2626 5360 mt 2626 5296 L
+2626 452 mt 2626 515 L
+2600 5572 mt
+( ) s
+2744 5360 mt 2744 5296 L
+2744 452 mt 2744 515 L
+2718 5572 mt
+( ) s
+2847 5360 mt 2847 5296 L
+2847 452 mt 2847 515 L
+2821 5572 mt
+( ) s
+2938 5360 mt 2938 5296 L
+2938 452 mt 2938 515 L
+2912 5572 mt
+( ) s
+3019 5360 mt 3019 5296 L
+3019 452 mt 3019 515 L
+2913 5572 mt
+(10) s
+3552 5360 mt 3552 5296 L
+3552 452 mt 3552 515 L
+3526 5572 mt
+( ) s
+3864 5360 mt 3864 5296 L
+3864 452 mt 3864 515 L
+3838 5572 mt
+( ) s
+4085 5360 mt 4085 5296 L
+4085 452 mt 4085 515 L
+4059 5572 mt
+( ) s
+4257 5360 mt 4257 5296 L
+4257 452 mt 4257 515 L
+4231 5572 mt
+( ) s
+4397 5360 mt 4397 5296 L
+4397 452 mt 4397 515 L
+4371 5572 mt
+( ) s
+4516 5360 mt 4516 5296 L
+4516 452 mt 4516 515 L
+4490 5572 mt
+( ) s
+4619 5360 mt 4619 5296 L
+4619 452 mt 4619 515 L
+4593 5572 mt
+( ) s
+4710 5360 mt 4710 5296 L
+4710 452 mt 4710 515 L
+4684 5572 mt
+( ) s
+4791 5360 mt 4791 5296 L
+4791 452 mt 4791 515 L
+4631 5572 mt
+(100) s
+5324 5360 mt 5324 5296 L
+5324 452 mt 5324 515 L
+5298 5572 mt
+( ) s
+5636 5360 mt 5636 5296 L
+5636 452 mt 5636 515 L
+5610 5572 mt
+( ) s
+5857 5360 mt 5857 5296 L
+5857 452 mt 5857 515 L
+5831 5572 mt
+( ) s
+6029 5360 mt 6029 5296 L
+6029 452 mt 6029 515 L
+6003 5572 mt
+( ) s
+6169 5360 mt 6169 5296 L
+6169 452 mt 6169 515 L
+6143 5572 mt
+( ) s
+6288 5360 mt 6288 5296 L
+6288 452 mt 6288 515 L
+6262 5572 mt
+( ) s
+6391 5360 mt 6391 5296 L
+6391 452 mt 6391 515 L
+6365 5572 mt
+( ) s
+6481 5360 mt 6481 5296 L
+6481 452 mt 6481 515 L
+6455 5572 mt
+( ) s
+6563 5360 mt 6563 5296 L
+6563 452 mt 6563 515 L
+6350 5572 mt
+(1000) s
+7096 5360 mt 7096 5296 L
+7096 452 mt 7096 515 L
+7070 5572 mt
+( ) s
+7408 5360 mt 7408 5296 L
+7408 452 mt 7408 515 L
+7382 5572 mt
+( ) s
+1070 5352 mt 1133 5352 L
+7449 5352 mt 7385 5352 L
+ 982 5423 mt
+( ) s
+1070 5233 mt 1133 5233 L
+7449 5233 mt 7385 5233 L
+ 982 5304 mt
+( ) s
+1070 5126 mt 1133 5126 L
+7449 5126 mt 7385 5126 L
+ 929 5197 mt
+(1) s
+1070 4422 mt 1133 4422 L
+7449 4422 mt 7385 4422 L
+ 982 4493 mt
+( ) s
+1070 4011 mt 1133 4011 L
+7449 4011 mt 7385 4011 L
+ 982 4082 mt
+( ) s
+1070 3719 mt 1133 3719 L
+7449 3719 mt 7385 3719 L
+ 982 3790 mt
+( ) s
+1070 3492 mt 1133 3492 L
+7449 3492 mt 7385 3492 L
+ 982 3563 mt
+( ) s
+1070 3307 mt 1133 3307 L
+7449 3307 mt 7385 3307 L
+ 982 3378 mt
+( ) s
+1070 3151 mt 1133 3151 L
+7449 3151 mt 7385 3151 L
+ 982 3222 mt
+( ) s
+1070 3015 mt 1133 3015 L
+7449 3015 mt 7385 3015 L
+ 982 3086 mt
+( ) s
+1070 2896 mt 1133 2896 L
+7449 2896 mt 7385 2896 L
+ 982 2967 mt
+( ) s
+1070 2789 mt 1133 2789 L
+7449 2789 mt 7385 2789 L
+ 822 2860 mt
+(10) s
+1070 2085 mt 1133 2085 L
+7449 2085 mt 7385 2085 L
+ 982 2156 mt
+( ) s
+1070 1674 mt 1133 1674 L
+7449 1674 mt 7385 1674 L
+ 982 1745 mt
+( ) s
+1070 1382 mt 1133 1382 L
+7449 1382 mt 7385 1382 L
+ 982 1453 mt
+( ) s
+1070 1155 mt 1133 1155 L
+7449 1155 mt 7385 1155 L
+ 982 1226 mt
+( ) s
+1070 970 mt 1133 970 L
+7449 970 mt 7385 970 L
+ 982 1041 mt
+( ) s
+1070 814 mt 1133 814 L
+7449 814 mt 7385 814 L
+ 982 885 mt
+( ) s
+1070 678 mt 1133 678 L
+7449 678 mt 7385 678 L
+ 982 749 mt
+( ) s
+1070 558 mt 1133 558 L
+7449 558 mt 7385 558 L
+ 982 629 mt
+( ) s
+1070 452 mt 1133 452 L
+7449 452 mt 7385 452 L
+ 715 523 mt
+(100) s
+1070 5360 mt 7449 5360 L
+1070 452 mt 7449 452 L
+1070 5360 mt 1070 452 L
+7449 5360 mt 7449 452 L
+gs 1070 452 6380 4909 rc
+24 w
+gr
+
+24 w
+ 48 48 1466 5126 FO
+ 48 48 1905 5122 FO
+ 48 48 2344 5119 FO
+ 48 48 2783 5118 FO
+ 48 48 3222 5116 FO
+ 48 48 3660 5113 FO
+ 48 48 4099 5112 FO
+ 48 48 4538 5110 FO
+ 48 48 4977 5109 FO
+ 48 48 5416 5105 FO
+ 48 48 5855 5105 FO
+ 48 48 6294 5104 FO
+ 48 48 6733 5101 FO
+ 48 48 7171 5103 FO
+gs 1070 452 6380 4909 rc
+438 -2 439 -2 439 -2 439 -2 439 -2 439 -2 439 -2 439 -1
+438 -2 439 -3 439 -1 439 -3 439 -4 1466 5126 14 MP stroke
+gr
+
+/c8 { 1.000000 0.400000 0.200000 sr} bdef
+c8
+ 48 48 1466 5126 FO
+ 48 48 1905 5088 FO
+ 48 48 2344 5064 FO
+ 48 48 2783 5048 FO
+ 48 48 3222 5029 FO
+ 48 48 3660 5010 FO
+ 48 48 4099 4994 FO
+ 48 48 4538 4976 FO
+ 48 48 4977 4959 FO
+ 48 48 5416 4942 FO
+ 48 48 5855 4930 FO
+ 48 48 6294 4914 FO
+ 48 48 6733 4908 FO
+ 48 48 7171 4874 FO
+gs 1070 452 6380 4909 rc
+438 -16 439 -15 439 -15 439 -16 439 -16 439 -16 439 -17 439 -17
+438 -19 439 -18 439 -17 439 -24 439 -38 1466 5126 14 MP stroke
+gr
+
+/c9 { 0.400000 0.400000 1.000000 sr} bdef
+c9
+ 48 48 1466 5126 FO
+ 48 48 1905 4860 FO
+ 48 48 2344 4699 FO
+ 48 48 2783 4594 FO
+ 48 48 3222 4483 FO
+ 48 48 3660 4383 FO
+ 48 48 4099 4295 FO
+ 48 48 4538 4214 FO
+ 48 48 4977 4137 FO
+ 48 48 5416 4068 FO
+ 48 48 5855 4008 FO
+ 48 48 6294 3941 FO
+ 48 48 6733 3883 FO
+ 48 48 7171 3827 FO
+gs 1070 452 6380 4909 rc
+438 -55 439 -57 439 -61 439 -64 439 -70 439 -75 439 -81 439 -89
+438 -100 439 -111 439 -105 439 -161 439 -266 1466 5126 14 MP stroke
+gr
+
+/c10 { 0.000000 0.700000 0.500000 sr} bdef
+c10
+ 48 48 1466 5126 FO
+ 48 48 1905 4428 FO
+ 48 48 2344 3977 FO
+ 48 48 2783 3675 FO
+ 48 48 3222 3368 FO
+ 48 48 3660 3107 FO
+ 48 48 4099 2893 FO
+ 48 48 4538 2710 FO
+ 48 48 4977 2550 FO
+ 48 48 5416 2412 FO
+ 48 48 5855 2292 FO
+ 48 48 6294 2180 FO
+ 48 48 6733 2083 FO
+ 48 48 7171 1991 FO
+gs 1070 452 6380 4909 rc
+438 -91 439 -97 439 -110 439 -121 439 -138 439 -160 439 -183 439 -214
+438 -261 439 -307 439 -302 439 -451 439 -698 1466 5126 14 MP stroke
+gr
+
+0 sg
+ 617 4557 mt -90 rotate
+(Mean number of lexical entries \(tables\)) s
+90 rotate
+3390 5724 mt
+(Word frequency \(n) s
+%%IncludeResource: font Helvetica
+/Helvetica /ISOLatin1Encoding 144 FMSR
+
+4963 5820 mt
+(w) s
+%%IncludeResource: font Helvetica
+/Helvetica /ISOLatin1Encoding 192 FMSR
+
+5066 5724 mt
+(\)) s
+%%IncludeResource: font Helvetica
+/Helvetica /ISOLatin1Encoding 120 FMSR
+
+1053 5403 mt
+( ) s
+7433 494 mt
+( ) s
+6 w
+1 sg
+0 500 1510 0 0 -500 1129 1012 4 MP
+PP
+-1510 0 0 500 1510 0 0 -500 1129 1012 5 MP stroke
+4 w
+DO
+SO
+6 w
+0 sg
+1129 1012 mt 2639 1012 L
+1129 512 mt 2639 512 L
+1129 1012 mt 1129 512 L
+2639 1012 mt 2639 512 L
+1129 1012 mt 2639 1012 L
+1129 1012 mt 1129 512 L
+1129 1012 mt 2639 1012 L
+1129 512 mt 2639 512 L
+1129 1012 mt 1129 512 L
+2639 1012 mt 2639 512 L
+%%IncludeResource: font Helvetica
+/Helvetica /ISOLatin1Encoding 192 FMSR
+
+1594 713 mt
+(Empirical) s
+gs 1129 512 1511 501 rc
+24 w
+gs 1257 523 245 245 rc
+ 48 48 1379 645 FO
+gr
+
+gr
+
+24 w
+1594 948 mt
+(Expectation) s
+gs 1129 512 1511 501 rc
+358 0 1200 878 2 MP stroke
+6 w
+gr
+
+6 w
+
+end %%Color Dict
+
+eplot
+%%EndObject
+
+epage
+end
+
+showpage
+
+%%Trailer
+%%EOF
diff --git a/report/pyp_clustering/acl09-short/code/plot1.pdf b/report/pyp_clustering/acl09-short/code/plot1.pdf
new file mode 100644
index 00000000..90fcd9ba
--- /dev/null
+++ b/report/pyp_clustering/acl09-short/code/plot1.pdf
Binary files differ
diff --git a/report/pyp_clustering/acl09-short/code/plot2.eps b/report/pyp_clustering/acl09-short/code/plot2.eps
new file mode 100644
index 00000000..e5c5536a
--- /dev/null
+++ b/report/pyp_clustering/acl09-short/code/plot2.eps
@@ -0,0 +1,552 @@
+%!PS-Adobe-3.0 EPSF-3.0
+%%Creator: MATLAB, The Mathworks, Inc. Version 7.7.0.471 (R2008b). Operating System: Linux 2.6.18-128.1.6.el5.inf.1PAE #1 SMP Wed Apr 15 10:23:41 BST 2009 i686.
+%%Title: /afs/inf.ed.ac.uk/user/s/sgwater/research/papers/2009/acl09-short/code/plot2.eps
+%%CreationDate: 07/23/2009 17:33:05
+%%DocumentNeededFonts: Helvetica
+%%DocumentProcessColors: Cyan Magenta Yellow Black
+%%LanguageLevel: 2
+%%Pages: 1
+%%BoundingBox: -44 170 641 672
+%%EndComments
+
+%%BeginProlog
+% MathWorks dictionary
+/MathWorks 160 dict begin
+% definition operators
+/bdef {bind def} bind def
+/ldef {load def} bind def
+/xdef {exch def} bdef
+/xstore {exch store} bdef
+% operator abbreviations
+/c /clip ldef
+/cc /concat ldef
+/cp /closepath ldef
+/gr /grestore ldef
+/gs /gsave ldef
+/mt /moveto ldef
+/np /newpath ldef
+/cm /currentmatrix ldef
+/sm /setmatrix ldef
+/rm /rmoveto ldef
+/rl /rlineto ldef
+/s {show newpath} bdef
+/sc {setcmykcolor} bdef
+/sr /setrgbcolor ldef
+/sg /setgray ldef
+/w /setlinewidth ldef
+/j /setlinejoin ldef
+/cap /setlinecap ldef
+/rc {rectclip} bdef
+/rf {rectfill} bdef
+% page state control
+/pgsv () def
+/bpage {/pgsv save def} bdef
+/epage {pgsv restore} bdef
+/bplot /gsave ldef
+/eplot {stroke grestore} bdef
+% orientation switch
+/portraitMode 0 def /landscapeMode 1 def /rotateMode 2 def
+% coordinate system mappings
+/dpi2point 0 def
+% font control
+/FontSize 0 def
+/FMS {/FontSize xstore findfont [FontSize 0 0 FontSize neg 0 0]
+ makefont setfont} bdef
+/reencode {exch dup where {pop load} {pop StandardEncoding} ifelse
+ exch dup 3 1 roll findfont dup length dict begin
+ { 1 index /FID ne {def}{pop pop} ifelse } forall
+ /Encoding exch def currentdict end definefont pop} bdef
+/isroman {findfont /CharStrings get /Agrave known} bdef
+/FMSR {3 1 roll 1 index dup isroman {reencode} {pop pop} ifelse
+ exch FMS} bdef
+/csm {1 dpi2point div -1 dpi2point div scale neg translate
+ dup landscapeMode eq {pop -90 rotate}
+ {rotateMode eq {90 rotate} if} ifelse} bdef
+% line types: solid, dotted, dashed, dotdash
+/SO { [] 0 setdash } bdef
+/DO { [.5 dpi2point mul 4 dpi2point mul] 0 setdash } bdef
+/DA { [6 dpi2point mul] 0 setdash } bdef
+/DD { [.5 dpi2point mul 4 dpi2point mul 6 dpi2point mul 4
+ dpi2point mul] 0 setdash } bdef
+% macros for lines and objects
+/L {lineto stroke} bdef
+/MP {3 1 roll moveto 1 sub {rlineto} repeat} bdef
+/AP {{rlineto} repeat} bdef
+/PDlw -1 def
+/W {/PDlw currentlinewidth def setlinewidth} def
+/PP {closepath eofill} bdef
+/DP {closepath stroke} bdef
+/MR {4 -2 roll moveto dup 0 exch rlineto exch 0 rlineto
+ neg 0 exch rlineto closepath} bdef
+/FR {MR stroke} bdef
+/PR {MR fill} bdef
+/L1i {{currentfile picstr readhexstring pop} image} bdef
+/tMatrix matrix def
+/MakeOval {newpath tMatrix currentmatrix pop translate scale
+0 0 1 0 360 arc tMatrix setmatrix} bdef
+/FO {MakeOval stroke} bdef
+/PO {MakeOval fill} bdef
+/PD {currentlinewidth 2 div 0 360 arc fill
+ PDlw -1 eq not {PDlw w /PDlw -1 def} if} def
+/FA {newpath tMatrix currentmatrix pop translate scale
+ 0 0 1 5 -2 roll arc tMatrix setmatrix stroke} bdef
+/PA {newpath tMatrix currentmatrix pop translate 0 0 moveto scale
+ 0 0 1 5 -2 roll arc closepath tMatrix setmatrix fill} bdef
+/FAn {newpath tMatrix currentmatrix pop translate scale
+ 0 0 1 5 -2 roll arcn tMatrix setmatrix stroke} bdef
+/PAn {newpath tMatrix currentmatrix pop translate 0 0 moveto scale
+ 0 0 1 5 -2 roll arcn closepath tMatrix setmatrix fill} bdef
+/vradius 0 def /hradius 0 def /lry 0 def
+/lrx 0 def /uly 0 def /ulx 0 def /rad 0 def
+/MRR {/vradius xdef /hradius xdef /lry xdef /lrx xdef /uly xdef
+ /ulx xdef newpath tMatrix currentmatrix pop ulx hradius add uly
+ vradius add translate hradius vradius scale 0 0 1 180 270 arc
+ tMatrix setmatrix lrx hradius sub uly vradius add translate
+ hradius vradius scale 0 0 1 270 360 arc tMatrix setmatrix
+ lrx hradius sub lry vradius sub translate hradius vradius scale
+ 0 0 1 0 90 arc tMatrix setmatrix ulx hradius add lry vradius sub
+ translate hradius vradius scale 0 0 1 90 180 arc tMatrix setmatrix
+ closepath} bdef
+/FRR {MRR stroke } bdef
+/PRR {MRR fill } bdef
+/MlrRR {/lry xdef /lrx xdef /uly xdef /ulx xdef /rad lry uly sub 2 div def
+ newpath tMatrix currentmatrix pop ulx rad add uly rad add translate
+ rad rad scale 0 0 1 90 270 arc tMatrix setmatrix lrx rad sub lry rad
+ sub translate rad rad scale 0 0 1 270 90 arc tMatrix setmatrix
+ closepath} bdef
+/FlrRR {MlrRR stroke } bdef
+/PlrRR {MlrRR fill } bdef
+/MtbRR {/lry xdef /lrx xdef /uly xdef /ulx xdef /rad lrx ulx sub 2 div def
+ newpath tMatrix currentmatrix pop ulx rad add uly rad add translate
+ rad rad scale 0 0 1 180 360 arc tMatrix setmatrix lrx rad sub lry rad
+ sub translate rad rad scale 0 0 1 0 180 arc tMatrix setmatrix
+ closepath} bdef
+/FtbRR {MtbRR stroke } bdef
+/PtbRR {MtbRR fill } bdef
+/stri 6 array def /dtri 6 array def
+/smat 6 array def /dmat 6 array def
+/tmat1 6 array def /tmat2 6 array def /dif 3 array def
+/asub {/ind2 exch def /ind1 exch def dup dup
+ ind1 get exch ind2 get sub exch } bdef
+/tri_to_matrix {
+ 2 0 asub 3 1 asub 4 0 asub 5 1 asub
+ dup 0 get exch 1 get 7 -1 roll astore } bdef
+/compute_transform {
+ dmat dtri tri_to_matrix tmat1 invertmatrix
+ smat stri tri_to_matrix tmat2 concatmatrix } bdef
+/ds {stri astore pop} bdef
+/dt {dtri astore pop} bdef
+/db {2 copy /cols xdef /rows xdef mul dup 3 mul string
+ currentfile
+ 3 index 0 eq {/ASCIIHexDecode filter}
+ {/ASCII85Decode filter 3 index 2 eq {/RunLengthDecode filter} if }
+ ifelse exch readstring pop
+ dup 0 3 index getinterval /rbmap xdef
+ dup 2 index dup getinterval /gbmap xdef
+ 1 index dup 2 mul exch getinterval /bbmap xdef pop pop}bdef
+/it {gs np dtri aload pop moveto lineto lineto cp c
+ cols rows 8 compute_transform
+ rbmap gbmap bbmap true 3 colorimage gr}bdef
+/il {newpath moveto lineto stroke}bdef
+currentdict end def
+%%EndProlog
+
+%%BeginSetup
+MathWorks begin
+
+0 cap
+
+end
+%%EndSetup
+
+%%Page: 1 1
+%%BeginPageSetup
+%%PageBoundingBox: -44 170 641 672
+MathWorks begin
+bpage
+%%EndPageSetup
+
+%%BeginObject: obj1
+bplot
+
+/dpi2point 12 def
+portraitMode -0528 8064 csm
+
+ 0 0 8231 6023 rc
+88 dict begin %Colortable dictionary
+/c0 { 0.000000 0.000000 0.000000 sr} bdef
+/c1 { 1.000000 1.000000 1.000000 sr} bdef
+/c2 { 0.900000 0.000000 0.000000 sr} bdef
+/c3 { 0.000000 0.820000 0.000000 sr} bdef
+/c4 { 0.000000 0.000000 0.800000 sr} bdef
+/c5 { 0.910000 0.820000 0.320000 sr} bdef
+/c6 { 1.000000 0.260000 0.820000 sr} bdef
+/c7 { 0.000000 0.820000 0.820000 sr} bdef
+c0
+1 j
+1 sg
+ 0 0 8232 6024 rf
+6 w
+0 4908 6379 0 0 -4908 1070 5360 4 MP
+PP
+-6379 0 0 4908 6379 0 0 -4908 1070 5360 5 MP stroke
+4 w
+DO
+SO
+6 w
+0 sg
+1070 5360 mt 7449 5360 L
+1070 452 mt 7449 452 L
+1070 5360 mt 1070 452 L
+7449 5360 mt 7449 452 L
+1070 5360 mt 7449 5360 L
+1070 5360 mt 1070 452 L
+1247 5360 mt 1247 5296 L
+1247 452 mt 1247 515 L
+%%IncludeResource: font Helvetica
+/Helvetica /ISOLatin1Encoding 192 FMSR
+
+1194 5572 mt
+(1) s
+1780 5360 mt 1780 5296 L
+1780 452 mt 1780 515 L
+1754 5572 mt
+( ) s
+2092 5360 mt 2092 5296 L
+2092 452 mt 2092 515 L
+2066 5572 mt
+( ) s
+2314 5360 mt 2314 5296 L
+2314 452 mt 2314 515 L
+2288 5572 mt
+( ) s
+2485 5360 mt 2485 5296 L
+2485 452 mt 2485 515 L
+2459 5572 mt
+( ) s
+2626 5360 mt 2626 5296 L
+2626 452 mt 2626 515 L
+2600 5572 mt
+( ) s
+2744 5360 mt 2744 5296 L
+2744 452 mt 2744 515 L
+2718 5572 mt
+( ) s
+2847 5360 mt 2847 5296 L
+2847 452 mt 2847 515 L
+2821 5572 mt
+( ) s
+2938 5360 mt 2938 5296 L
+2938 452 mt 2938 515 L
+2912 5572 mt
+( ) s
+3019 5360 mt 3019 5296 L
+3019 452 mt 3019 515 L
+2913 5572 mt
+(10) s
+3552 5360 mt 3552 5296 L
+3552 452 mt 3552 515 L
+3526 5572 mt
+( ) s
+3864 5360 mt 3864 5296 L
+3864 452 mt 3864 515 L
+3838 5572 mt
+( ) s
+4085 5360 mt 4085 5296 L
+4085 452 mt 4085 515 L
+4059 5572 mt
+( ) s
+4257 5360 mt 4257 5296 L
+4257 452 mt 4257 515 L
+4231 5572 mt
+( ) s
+4397 5360 mt 4397 5296 L
+4397 452 mt 4397 515 L
+4371 5572 mt
+( ) s
+4516 5360 mt 4516 5296 L
+4516 452 mt 4516 515 L
+4490 5572 mt
+( ) s
+4619 5360 mt 4619 5296 L
+4619 452 mt 4619 515 L
+4593 5572 mt
+( ) s
+4710 5360 mt 4710 5296 L
+4710 452 mt 4710 515 L
+4684 5572 mt
+( ) s
+4791 5360 mt 4791 5296 L
+4791 452 mt 4791 515 L
+4631 5572 mt
+(100) s
+5324 5360 mt 5324 5296 L
+5324 452 mt 5324 515 L
+5298 5572 mt
+( ) s
+5636 5360 mt 5636 5296 L
+5636 452 mt 5636 515 L
+5610 5572 mt
+( ) s
+5857 5360 mt 5857 5296 L
+5857 452 mt 5857 515 L
+5831 5572 mt
+( ) s
+6029 5360 mt 6029 5296 L
+6029 452 mt 6029 515 L
+6003 5572 mt
+( ) s
+6169 5360 mt 6169 5296 L
+6169 452 mt 6169 515 L
+6143 5572 mt
+( ) s
+6288 5360 mt 6288 5296 L
+6288 452 mt 6288 515 L
+6262 5572 mt
+( ) s
+6391 5360 mt 6391 5296 L
+6391 452 mt 6391 515 L
+6365 5572 mt
+( ) s
+6481 5360 mt 6481 5296 L
+6481 452 mt 6481 515 L
+6455 5572 mt
+( ) s
+6563 5360 mt 6563 5296 L
+6563 452 mt 6563 515 L
+6350 5572 mt
+(1000) s
+7096 5360 mt 7096 5296 L
+7096 452 mt 7096 515 L
+7070 5572 mt
+( ) s
+7408 5360 mt 7408 5296 L
+7408 452 mt 7408 515 L
+7382 5572 mt
+( ) s
+1070 5201 mt 1133 5201 L
+7449 5201 mt 7385 5201 L
+ 769 5272 mt
+(0.1) s
+1070 4725 mt 1133 4725 L
+7449 4725 mt 7385 4725 L
+ 982 4796 mt
+( ) s
+1070 4446 mt 1133 4446 L
+7449 4446 mt 7385 4446 L
+ 982 4517 mt
+( ) s
+1070 4248 mt 1133 4248 L
+7449 4248 mt 7385 4248 L
+ 982 4319 mt
+( ) s
+1070 4095 mt 1133 4095 L
+7449 4095 mt 7385 4095 L
+ 982 4166 mt
+( ) s
+1070 3969 mt 1133 3969 L
+7449 3969 mt 7385 3969 L
+ 982 4040 mt
+( ) s
+1070 3863 mt 1133 3863 L
+7449 3863 mt 7385 3863 L
+ 982 3934 mt
+( ) s
+1070 3771 mt 1133 3771 L
+7449 3771 mt 7385 3771 L
+ 982 3842 mt
+( ) s
+1070 3690 mt 1133 3690 L
+7449 3690 mt 7385 3690 L
+ 982 3761 mt
+( ) s
+1070 3618 mt 1133 3618 L
+7449 3618 mt 7385 3618 L
+ 929 3689 mt
+(1) s
+1070 3141 mt 1133 3141 L
+7449 3141 mt 7385 3141 L
+ 982 3212 mt
+( ) s
+1070 2863 mt 1133 2863 L
+7449 2863 mt 7385 2863 L
+ 982 2934 mt
+( ) s
+1070 2665 mt 1133 2665 L
+7449 2665 mt 7385 2665 L
+ 982 2736 mt
+( ) s
+1070 2511 mt 1133 2511 L
+7449 2511 mt 7385 2511 L
+ 982 2582 mt
+( ) s
+1070 2386 mt 1133 2386 L
+7449 2386 mt 7385 2386 L
+ 982 2457 mt
+( ) s
+1070 2280 mt 1133 2280 L
+7449 2280 mt 7385 2280 L
+ 982 2351 mt
+( ) s
+1070 2188 mt 1133 2188 L
+7449 2188 mt 7385 2188 L
+ 982 2259 mt
+( ) s
+1070 2107 mt 1133 2107 L
+7449 2107 mt 7385 2107 L
+ 982 2178 mt
+( ) s
+1070 2035 mt 1133 2035 L
+7449 2035 mt 7385 2035 L
+ 822 2106 mt
+(10) s
+1070 1558 mt 1133 1558 L
+7449 1558 mt 7385 1558 L
+ 982 1629 mt
+( ) s
+1070 1279 mt 1133 1279 L
+7449 1279 mt 7385 1279 L
+ 982 1350 mt
+( ) s
+1070 1082 mt 1133 1082 L
+7449 1082 mt 7385 1082 L
+ 982 1153 mt
+( ) s
+1070 928 mt 1133 928 L
+7449 928 mt 7385 928 L
+ 982 999 mt
+( ) s
+1070 803 mt 1133 803 L
+7449 803 mt 7385 803 L
+ 982 874 mt
+( ) s
+1070 697 mt 1133 697 L
+7449 697 mt 7385 697 L
+ 982 768 mt
+( ) s
+1070 605 mt 1133 605 L
+7449 605 mt 7385 605 L
+ 982 676 mt
+( ) s
+1070 524 mt 1133 524 L
+7449 524 mt 7385 524 L
+ 982 595 mt
+( ) s
+1070 452 mt 1133 452 L
+7449 452 mt 7385 452 L
+ 715 523 mt
+(100) s
+1070 5360 mt 7449 5360 L
+1070 452 mt 7449 452 L
+1070 5360 mt 1070 452 L
+7449 5360 mt 7449 452 L
+gs 1070 452 6380 4909 rc
+24 w
+438 -2 439 -1 439 -1 439 -1 439 -2 439 -1 439 -1 439 -2
+438 -1 439 -1 439 -2 439 -1 439 -3 1466 3618 14 MP stroke
+/c8 { 1.000000 0.400000 0.200000 sr} bdef
+c8
+438 -10 439 -10 439 -11 439 -11 439 -11 439 -11 439 -11 439 -12
+438 -12 439 -13 439 -11 439 -16 439 -26 1466 3618 14 MP stroke
+DA
+438 -36 439 -37 439 -41 439 -42 439 -46 439 -49 439 -53 439 -57
+438 -64 439 -70 439 -65 439 -95 439 -147 1466 5110 14 MP stroke
+SO
+/c9 { 0.400000 0.400000 1.000000 sr} bdef
+c9
+438 -37 439 -38 439 -42 439 -43 439 -48 439 -51 439 -55 439 -60
+438 -68 439 -75 439 -71 439 -109 439 -180 1466 3618 14 MP stroke
+DA
+438 -46 439 -49 439 -53 439 -56 439 -63 439 -69 439 -76 439 -85
+438 -100 439 -113 439 -110 439 -169 439 -276 1466 4150 14 MP stroke
+SO
+/c10 { 0.000000 0.700000 0.500000 sr} bdef
+c10
+438 -61 439 -66 439 -75 439 -81 439 -94 439 -108 439 -125 439 -144
+438 -177 439 -208 439 -205 439 -305 439 -473 1466 3618 14 MP stroke
+DA
+438 -63 439 -68 439 -77 439 -84 439 -98 439 -112 439 -130 439 -151
+438 -185 439 -218 439 -213 439 -315 439 -484 1466 3710 14 MP stroke
+gr
+
+24 w
+c10
+DA
+0 sg
+ 617 4557 mt -90 rotate
+(Mean number of lexical entries \(tables\)) s
+90 rotate
+3390 5724 mt
+(Word frequency \(n) s
+%%IncludeResource: font Helvetica
+/Helvetica /ISOLatin1Encoding 144 FMSR
+
+4963 5820 mt
+(w) s
+%%IncludeResource: font Helvetica
+/Helvetica /ISOLatin1Encoding 192 FMSR
+
+5066 5724 mt
+(\)) s
+%%IncludeResource: font Helvetica
+/Helvetica /ISOLatin1Encoding 120 FMSR
+
+1053 5403 mt
+( ) s
+7433 494 mt
+( ) s
+SO
+6 w
+1 sg
+0 500 2507 0 0 -500 1129 1012 4 MP
+PP
+-2507 0 0 500 2507 0 0 -500 1129 1012 5 MP stroke
+4 w
+DO
+SO
+6 w
+0 sg
+1129 1012 mt 3636 1012 L
+1129 512 mt 3636 512 L
+1129 1012 mt 1129 512 L
+3636 1012 mt 3636 512 L
+1129 1012 mt 3636 1012 L
+1129 1012 mt 1129 512 L
+1129 1012 mt 3636 1012 L
+1129 512 mt 3636 512 L
+1129 1012 mt 1129 512 L
+3636 1012 mt 3636 512 L
+%%IncludeResource: font Helvetica
+/Helvetica /ISOLatin1Encoding 192 FMSR
+
+1603 713 mt
+(Expectation) s
+gs 1129 512 2508 501 rc
+24 w
+365 0 1201 645 2 MP stroke
+gr
+
+24 w
+1603 948 mt
+(Antoniak approximation) s
+gs 1129 512 2508 501 rc
+DA
+365 0 1201 878 2 MP stroke
+SO
+6 w
+gr
+
+6 w
+
+end %%Color Dict
+
+eplot
+%%EndObject
+
+epage
+end
+
+showpage
+
+%%Trailer
+%%EOF
diff --git a/report/pyp_clustering/acl09-short/code/plot2.pdf b/report/pyp_clustering/acl09-short/code/plot2.pdf
new file mode 100644
index 00000000..d9783120
--- /dev/null
+++ b/report/pyp_clustering/acl09-short/code/plot2.pdf
Binary files differ
diff --git a/report/pyp_clustering/acl09-short/code/plot3.eps b/report/pyp_clustering/acl09-short/code/plot3.eps
new file mode 100644
index 00000000..f4ffbb62
--- /dev/null
+++ b/report/pyp_clustering/acl09-short/code/plot3.eps
@@ -0,0 +1,721 @@
+%!PS-Adobe-3.0 EPSF-3.0
+%%Creator: MATLAB, The Mathworks, Inc. Version 7.7.0.471 (R2008b). Operating System: Linux 2.6.18-128.1.6.el5.inf.1PAE #1 SMP Wed Apr 15 10:23:41 BST 2009 i686.
+%%Title: /afs/inf.ed.ac.uk/user/s/sgwater/research/papers/2009/acl09-short/code/plot3.eps
+%%CreationDate: 07/23/2009 17:31:43
+%%DocumentNeededFonts: Helvetica
+%%DocumentProcessColors: Cyan Magenta Yellow Black
+%%LanguageLevel: 2
+%%Pages: 1
+%%BoundingBox: -44 170 641 672
+%%EndComments
+
+%%BeginProlog
+% MathWorks dictionary
+/MathWorks 160 dict begin
+% definition operators
+/bdef {bind def} bind def
+/ldef {load def} bind def
+/xdef {exch def} bdef
+/xstore {exch store} bdef
+% operator abbreviations
+/c /clip ldef
+/cc /concat ldef
+/cp /closepath ldef
+/gr /grestore ldef
+/gs /gsave ldef
+/mt /moveto ldef
+/np /newpath ldef
+/cm /currentmatrix ldef
+/sm /setmatrix ldef
+/rm /rmoveto ldef
+/rl /rlineto ldef
+/s {show newpath} bdef
+/sc {setcmykcolor} bdef
+/sr /setrgbcolor ldef
+/sg /setgray ldef
+/w /setlinewidth ldef
+/j /setlinejoin ldef
+/cap /setlinecap ldef
+/rc {rectclip} bdef
+/rf {rectfill} bdef
+% page state control
+/pgsv () def
+/bpage {/pgsv save def} bdef
+/epage {pgsv restore} bdef
+/bplot /gsave ldef
+/eplot {stroke grestore} bdef
+% orientation switch
+/portraitMode 0 def /landscapeMode 1 def /rotateMode 2 def
+% coordinate system mappings
+/dpi2point 0 def
+% font control
+/FontSize 0 def
+/FMS {/FontSize xstore findfont [FontSize 0 0 FontSize neg 0 0]
+ makefont setfont} bdef
+/reencode {exch dup where {pop load} {pop StandardEncoding} ifelse
+ exch dup 3 1 roll findfont dup length dict begin
+ { 1 index /FID ne {def}{pop pop} ifelse } forall
+ /Encoding exch def currentdict end definefont pop} bdef
+/isroman {findfont /CharStrings get /Agrave known} bdef
+/FMSR {3 1 roll 1 index dup isroman {reencode} {pop pop} ifelse
+ exch FMS} bdef
+/csm {1 dpi2point div -1 dpi2point div scale neg translate
+ dup landscapeMode eq {pop -90 rotate}
+ {rotateMode eq {90 rotate} if} ifelse} bdef
+% line types: solid, dotted, dashed, dotdash
+/SO { [] 0 setdash } bdef
+/DO { [.5 dpi2point mul 4 dpi2point mul] 0 setdash } bdef
+/DA { [6 dpi2point mul] 0 setdash } bdef
+/DD { [.5 dpi2point mul 4 dpi2point mul 6 dpi2point mul 4
+ dpi2point mul] 0 setdash } bdef
+% macros for lines and objects
+/L {lineto stroke} bdef
+/MP {3 1 roll moveto 1 sub {rlineto} repeat} bdef
+/AP {{rlineto} repeat} bdef
+/PDlw -1 def
+/W {/PDlw currentlinewidth def setlinewidth} def
+/PP {closepath eofill} bdef
+/DP {closepath stroke} bdef
+/MR {4 -2 roll moveto dup 0 exch rlineto exch 0 rlineto
+ neg 0 exch rlineto closepath} bdef
+/FR {MR stroke} bdef
+/PR {MR fill} bdef
+/L1i {{currentfile picstr readhexstring pop} image} bdef
+/tMatrix matrix def
+/MakeOval {newpath tMatrix currentmatrix pop translate scale
+0 0 1 0 360 arc tMatrix setmatrix} bdef
+/FO {MakeOval stroke} bdef
+/PO {MakeOval fill} bdef
+/PD {currentlinewidth 2 div 0 360 arc fill
+ PDlw -1 eq not {PDlw w /PDlw -1 def} if} def
+/FA {newpath tMatrix currentmatrix pop translate scale
+ 0 0 1 5 -2 roll arc tMatrix setmatrix stroke} bdef
+/PA {newpath tMatrix currentmatrix pop translate 0 0 moveto scale
+ 0 0 1 5 -2 roll arc closepath tMatrix setmatrix fill} bdef
+/FAn {newpath tMatrix currentmatrix pop translate scale
+ 0 0 1 5 -2 roll arcn tMatrix setmatrix stroke} bdef
+/PAn {newpath tMatrix currentmatrix pop translate 0 0 moveto scale
+ 0 0 1 5 -2 roll arcn closepath tMatrix setmatrix fill} bdef
+/vradius 0 def /hradius 0 def /lry 0 def
+/lrx 0 def /uly 0 def /ulx 0 def /rad 0 def
+/MRR {/vradius xdef /hradius xdef /lry xdef /lrx xdef /uly xdef
+ /ulx xdef newpath tMatrix currentmatrix pop ulx hradius add uly
+ vradius add translate hradius vradius scale 0 0 1 180 270 arc
+ tMatrix setmatrix lrx hradius sub uly vradius add translate
+ hradius vradius scale 0 0 1 270 360 arc tMatrix setmatrix
+ lrx hradius sub lry vradius sub translate hradius vradius scale
+ 0 0 1 0 90 arc tMatrix setmatrix ulx hradius add lry vradius sub
+ translate hradius vradius scale 0 0 1 90 180 arc tMatrix setmatrix
+ closepath} bdef
+/FRR {MRR stroke } bdef
+/PRR {MRR fill } bdef
+/MlrRR {/lry xdef /lrx xdef /uly xdef /ulx xdef /rad lry uly sub 2 div def
+ newpath tMatrix currentmatrix pop ulx rad add uly rad add translate
+ rad rad scale 0 0 1 90 270 arc tMatrix setmatrix lrx rad sub lry rad
+ sub translate rad rad scale 0 0 1 270 90 arc tMatrix setmatrix
+ closepath} bdef
+/FlrRR {MlrRR stroke } bdef
+/PlrRR {MlrRR fill } bdef
+/MtbRR {/lry xdef /lrx xdef /uly xdef /ulx xdef /rad lrx ulx sub 2 div def
+ newpath tMatrix currentmatrix pop ulx rad add uly rad add translate
+ rad rad scale 0 0 1 180 360 arc tMatrix setmatrix lrx rad sub lry rad
+ sub translate rad rad scale 0 0 1 0 180 arc tMatrix setmatrix
+ closepath} bdef
+/FtbRR {MtbRR stroke } bdef
+/PtbRR {MtbRR fill } bdef
+/stri 6 array def /dtri 6 array def
+/smat 6 array def /dmat 6 array def
+/tmat1 6 array def /tmat2 6 array def /dif 3 array def
+/asub {/ind2 exch def /ind1 exch def dup dup
+ ind1 get exch ind2 get sub exch } bdef
+/tri_to_matrix {
+ 2 0 asub 3 1 asub 4 0 asub 5 1 asub
+ dup 0 get exch 1 get 7 -1 roll astore } bdef
+/compute_transform {
+ dmat dtri tri_to_matrix tmat1 invertmatrix
+ smat stri tri_to_matrix tmat2 concatmatrix } bdef
+/ds {stri astore pop} bdef
+/dt {dtri astore pop} bdef
+/db {2 copy /cols xdef /rows xdef mul dup 3 mul string
+ currentfile
+ 3 index 0 eq {/ASCIIHexDecode filter}
+ {/ASCII85Decode filter 3 index 2 eq {/RunLengthDecode filter} if }
+ ifelse exch readstring pop
+ dup 0 3 index getinterval /rbmap xdef
+ dup 2 index dup getinterval /gbmap xdef
+ 1 index dup 2 mul exch getinterval /bbmap xdef pop pop}bdef
+/it {gs np dtri aload pop moveto lineto lineto cp c
+ cols rows 8 compute_transform
+ rbmap gbmap bbmap true 3 colorimage gr}bdef
+/il {newpath moveto lineto stroke}bdef
+currentdict end def
+%%EndProlog
+
+%%BeginSetup
+MathWorks begin
+
+0 cap
+
+end
+%%EndSetup
+
+%%Page: 1 1
+%%BeginPageSetup
+%%PageBoundingBox: -44 170 641 672
+MathWorks begin
+bpage
+%%EndPageSetup
+
+%%BeginObject: obj1
+bplot
+
+/dpi2point 12 def
+portraitMode -0528 8064 csm
+
+ 0 0 8231 6023 rc
+88 dict begin %Colortable dictionary
+/c0 { 0.000000 0.000000 0.000000 sr} bdef
+/c1 { 1.000000 1.000000 1.000000 sr} bdef
+/c2 { 0.900000 0.000000 0.000000 sr} bdef
+/c3 { 0.000000 0.820000 0.000000 sr} bdef
+/c4 { 0.000000 0.000000 0.800000 sr} bdef
+/c5 { 0.910000 0.820000 0.320000 sr} bdef
+/c6 { 1.000000 0.260000 0.820000 sr} bdef
+/c7 { 0.000000 0.820000 0.820000 sr} bdef
+c0
+1 j
+1 sg
+ 0 0 8232 6024 rf
+6 w
+0 4908 6379 0 0 -4908 1070 5360 4 MP
+PP
+-6379 0 0 4908 6379 0 0 -4908 1070 5360 5 MP stroke
+4 w
+DO
+SO
+6 w
+0 sg
+1070 5360 mt 7449 5360 L
+1070 452 mt 7449 452 L
+1070 5360 mt 1070 452 L
+7449 5360 mt 7449 452 L
+1070 5360 mt 7449 5360 L
+1070 5360 mt 1070 452 L
+1247 5360 mt 1247 5296 L
+1247 452 mt 1247 515 L
+%%IncludeResource: font Helvetica
+/Helvetica /ISOLatin1Encoding 192 FMSR
+
+1194 5572 mt
+(1) s
+1780 5360 mt 1780 5296 L
+1780 452 mt 1780 515 L
+1754 5572 mt
+( ) s
+2092 5360 mt 2092 5296 L
+2092 452 mt 2092 515 L
+2066 5572 mt
+( ) s
+2314 5360 mt 2314 5296 L
+2314 452 mt 2314 515 L
+2288 5572 mt
+( ) s
+2485 5360 mt 2485 5296 L
+2485 452 mt 2485 515 L
+2459 5572 mt
+( ) s
+2626 5360 mt 2626 5296 L
+2626 452 mt 2626 515 L
+2600 5572 mt
+( ) s
+2744 5360 mt 2744 5296 L
+2744 452 mt 2744 515 L
+2718 5572 mt
+( ) s
+2847 5360 mt 2847 5296 L
+2847 452 mt 2847 515 L
+2821 5572 mt
+( ) s
+2938 5360 mt 2938 5296 L
+2938 452 mt 2938 515 L
+2912 5572 mt
+( ) s
+3019 5360 mt 3019 5296 L
+3019 452 mt 3019 515 L
+2913 5572 mt
+(10) s
+3552 5360 mt 3552 5296 L
+3552 452 mt 3552 515 L
+3526 5572 mt
+( ) s
+3864 5360 mt 3864 5296 L
+3864 452 mt 3864 515 L
+3838 5572 mt
+( ) s
+4085 5360 mt 4085 5296 L
+4085 452 mt 4085 515 L
+4059 5572 mt
+( ) s
+4257 5360 mt 4257 5296 L
+4257 452 mt 4257 515 L
+4231 5572 mt
+( ) s
+4397 5360 mt 4397 5296 L
+4397 452 mt 4397 515 L
+4371 5572 mt
+( ) s
+4516 5360 mt 4516 5296 L
+4516 452 mt 4516 515 L
+4490 5572 mt
+( ) s
+4619 5360 mt 4619 5296 L
+4619 452 mt 4619 515 L
+4593 5572 mt
+( ) s
+4710 5360 mt 4710 5296 L
+4710 452 mt 4710 515 L
+4684 5572 mt
+( ) s
+4791 5360 mt 4791 5296 L
+4791 452 mt 4791 515 L
+4631 5572 mt
+(100) s
+5324 5360 mt 5324 5296 L
+5324 452 mt 5324 515 L
+5298 5572 mt
+( ) s
+5636 5360 mt 5636 5296 L
+5636 452 mt 5636 515 L
+5610 5572 mt
+( ) s
+5857 5360 mt 5857 5296 L
+5857 452 mt 5857 515 L
+5831 5572 mt
+( ) s
+6029 5360 mt 6029 5296 L
+6029 452 mt 6029 515 L
+6003 5572 mt
+( ) s
+6169 5360 mt 6169 5296 L
+6169 452 mt 6169 515 L
+6143 5572 mt
+( ) s
+6288 5360 mt 6288 5296 L
+6288 452 mt 6288 515 L
+6262 5572 mt
+( ) s
+6391 5360 mt 6391 5296 L
+6391 452 mt 6391 515 L
+6365 5572 mt
+( ) s
+6481 5360 mt 6481 5296 L
+6481 452 mt 6481 515 L
+6455 5572 mt
+( ) s
+6563 5360 mt 6563 5296 L
+6563 452 mt 6563 515 L
+6350 5572 mt
+(1000) s
+7096 5360 mt 7096 5296 L
+7096 452 mt 7096 515 L
+7070 5572 mt
+( ) s
+7408 5360 mt 7408 5296 L
+7408 452 mt 7408 515 L
+7382 5572 mt
+( ) s
+1070 5354 mt 1133 5354 L
+7449 5354 mt 7385 5354 L
+ 982 5425 mt
+( ) s
+1070 5257 mt 1133 5257 L
+7449 5257 mt 7385 5257 L
+ 982 5328 mt
+( ) s
+1070 5171 mt 1133 5171 L
+7449 5171 mt 7385 5171 L
+ 929 5242 mt
+(1) s
+1070 4602 mt 1133 4602 L
+7449 4602 mt 7385 4602 L
+ 982 4673 mt
+( ) s
+1070 4270 mt 1133 4270 L
+7449 4270 mt 7385 4270 L
+ 982 4341 mt
+( ) s
+1070 4034 mt 1133 4034 L
+7449 4034 mt 7385 4034 L
+ 982 4105 mt
+( ) s
+1070 3851 mt 1133 3851 L
+7449 3851 mt 7385 3851 L
+ 982 3922 mt
+( ) s
+1070 3702 mt 1133 3702 L
+7449 3702 mt 7385 3702 L
+ 982 3773 mt
+( ) s
+1070 3575 mt 1133 3575 L
+7449 3575 mt 7385 3575 L
+ 982 3646 mt
+( ) s
+1070 3466 mt 1133 3466 L
+7449 3466 mt 7385 3466 L
+ 982 3537 mt
+( ) s
+1070 3369 mt 1133 3369 L
+7449 3369 mt 7385 3369 L
+ 982 3440 mt
+( ) s
+1070 3283 mt 1133 3283 L
+7449 3283 mt 7385 3283 L
+ 822 3354 mt
+(10) s
+1070 2715 mt 1133 2715 L
+7449 2715 mt 7385 2715 L
+ 982 2786 mt
+( ) s
+1070 2382 mt 1133 2382 L
+7449 2382 mt 7385 2382 L
+ 982 2453 mt
+( ) s
+1070 2147 mt 1133 2147 L
+7449 2147 mt 7385 2147 L
+ 982 2218 mt
+( ) s
+1070 1964 mt 1133 1964 L
+7449 1964 mt 7385 1964 L
+ 982 2035 mt
+( ) s
+1070 1814 mt 1133 1814 L
+7449 1814 mt 7385 1814 L
+ 982 1885 mt
+( ) s
+1070 1688 mt 1133 1688 L
+7449 1688 mt 7385 1688 L
+ 982 1759 mt
+( ) s
+1070 1578 mt 1133 1578 L
+7449 1578 mt 7385 1578 L
+ 982 1649 mt
+( ) s
+1070 1482 mt 1133 1482 L
+7449 1482 mt 7385 1482 L
+ 982 1553 mt
+( ) s
+1070 1395 mt 1133 1395 L
+7449 1395 mt 7385 1395 L
+ 715 1466 mt
+(100) s
+1070 827 mt 1133 827 L
+7449 827 mt 7385 827 L
+ 982 898 mt
+( ) s
+1070 495 mt 1133 495 L
+7449 495 mt 7385 495 L
+ 982 566 mt
+( ) s
+1070 5360 mt 7449 5360 L
+1070 452 mt 7449 452 L
+1070 5360 mt 1070 452 L
+7449 5360 mt 7449 452 L
+gs 1070 452 6380 4909 rc
+24 w
+438 -1 439 -2 439 -1 439 -2 439 -1 439 -2 439 -1 439 -2
+438 -1 439 -2 439 -1 439 -3 439 -3 1466 5171 14 MP stroke
+gr
+
+24 w
+ 48 48 1466 5171 FO
+ 48 48 1905 5168 FO
+ 48 48 2344 5165 FO
+ 48 48 2783 5164 FO
+ 48 48 3222 5162 FO
+ 48 48 3660 5161 FO
+ 48 48 4099 5159 FO
+ 48 48 4538 5158 FO
+ 48 48 4977 5157 FO
+ 48 48 5416 5154 FO
+ 48 48 5855 5154 FO
+ 48 48 6294 5153 FO
+ 48 48 6733 5151 FO
+ 48 48 7171 5153 FO
+gs 1070 452 6380 4909 rc
+gr
+
+0 j
+-55 95 -55 -95 110 0 1411 5203 4 MP
+DP
+-55 95 -55 -95 110 0 1850 5200 4 MP
+DP
+-55 95 -55 -95 110 0 2289 5197 4 MP
+DP
+-55 95 -55 -95 110 0 2728 5196 4 MP
+DP
+-55 95 -55 -95 110 0 3167 5194 4 MP
+DP
+-55 95 -55 -95 110 0 3605 5193 4 MP
+DP
+-55 95 -55 -95 110 0 4044 5191 4 MP
+DP
+-55 95 -55 -95 110 0 4483 5190 4 MP
+DP
+-55 95 -55 -95 110 0 4922 5189 4 MP
+DP
+-55 95 -55 -95 110 0 5361 5185 4 MP
+DP
+-55 95 -55 -95 110 0 5800 5186 4 MP
+DP
+-55 95 -55 -95 110 0 6239 5184 4 MP
+DP
+-55 95 -55 -95 110 0 6678 5182 4 MP
+DP
+-55 95 -55 -95 110 0 7116 5187 4 MP
+DP
+gs 1070 452 6380 4909 rc
+/c8 { 1.000000 0.400000 0.200000 sr} bdef
+c8
+438 -12 439 -12 439 -13 439 -13 439 -13 439 -13 439 -14 439 -13
+438 -15 439 -15 439 -14 439 -19 439 -31 1466 5171 14 MP stroke
+gr
+
+c8
+ 48 48 1466 5171 FO
+ 48 48 1905 5140 FO
+ 48 48 2344 5121 FO
+ 48 48 2783 5108 FO
+ 48 48 3222 5092 FO
+ 48 48 3660 5077 FO
+ 48 48 4099 5065 FO
+ 48 48 4538 5050 FO
+ 48 48 4977 5036 FO
+ 48 48 5416 5022 FO
+ 48 48 5855 5013 FO
+ 48 48 6294 4999 FO
+ 48 48 6733 4995 FO
+ 48 48 7171 4967 FO
+gs 1070 452 6380 4909 rc
+gr
+
+-55 95 -55 -95 110 0 1411 5203 4 MP
+DP
+-55 95 -55 -95 110 0 1850 5173 4 MP
+DP
+-55 95 -55 -95 110 0 2289 5153 4 MP
+DP
+-55 95 -55 -95 110 0 2728 5139 4 MP
+DP
+-55 95 -55 -95 110 0 3167 5122 4 MP
+DP
+-55 95 -55 -95 110 0 3605 5105 4 MP
+DP
+-55 95 -55 -95 110 0 4044 5089 4 MP
+DP
+-55 95 -55 -95 110 0 4483 5070 4 MP
+DP
+-55 95 -55 -95 110 0 4922 5053 4 MP
+DP
+-55 95 -55 -95 110 0 5361 5036 4 MP
+DP
+-55 95 -55 -95 110 0 5800 5021 4 MP
+DP
+-55 95 -55 -95 110 0 6239 5007 4 MP
+DP
+-55 95 -55 -95 110 0 6678 4984 4 MP
+DP
+-55 95 -55 -95 110 0 7116 4944 4 MP
+DP
+gs 1070 452 6380 4909 rc
+/c9 { 0.400000 0.400000 1.000000 sr} bdef
+c9
+438 -44 439 -46 439 -50 439 -51 439 -57 439 -61 439 -65 439 -72
+438 -81 439 -89 439 -85 439 -130 439 -215 1466 5171 14 MP stroke
+gr
+
+c9
+ 48 48 1466 5171 FO
+ 48 48 1905 4956 FO
+ 48 48 2344 4826 FO
+ 48 48 2783 4741 FO
+ 48 48 3222 4651 FO
+ 48 48 3660 4571 FO
+ 48 48 4099 4500 FO
+ 48 48 4538 4435 FO
+ 48 48 4977 4372 FO
+ 48 48 5416 4316 FO
+ 48 48 5855 4268 FO
+ 48 48 6294 4214 FO
+ 48 48 6733 4167 FO
+ 48 48 7171 4121 FO
+gs 1070 452 6380 4909 rc
+gr
+
+-55 95 -55 -95 110 0 1411 5203 4 MP
+DP
+-55 95 -55 -95 110 0 1850 5057 4 MP
+DP
+-55 95 -55 -95 110 0 2289 4938 4 MP
+DP
+-55 95 -55 -95 110 0 2728 4838 4 MP
+DP
+-55 95 -55 -95 110 0 3167 4706 4 MP
+DP
+-55 95 -55 -95 110 0 3605 4555 4 MP
+DP
+-55 95 -55 -95 110 0 4044 4385 4 MP
+DP
+-55 95 -55 -95 110 0 4483 4185 4 MP
+DP
+-55 95 -55 -95 110 0 4922 3940 4 MP
+DP
+-55 95 -55 -95 110 0 5361 3650 4 MP
+DP
+-55 95 -55 -95 110 0 5800 3310 4 MP
+DP
+-55 95 -55 -95 110 0 6239 2895 4 MP
+DP
+-55 95 -55 -95 110 0 6678 2492 4 MP
+DP
+-55 95 -55 -95 110 0 7116 2000 4 MP
+DP
+gs 1070 452 6380 4909 rc
+/c10 { 0.000000 0.700000 0.500000 sr} bdef
+c10
+438 -74 439 -78 439 -89 439 -98 439 -111 439 -129 439 -148 439 -173
+438 -211 439 -248 439 -244 439 -364 439 -564 1466 5171 14 MP stroke
+gr
+
+c10
+ 48 48 1466 5171 FO
+ 48 48 1905 4607 FO
+ 48 48 2344 4243 FO
+ 48 48 2783 3999 FO
+ 48 48 3222 3751 FO
+ 48 48 3660 3540 FO
+ 48 48 4099 3368 FO
+ 48 48 4538 3220 FO
+ 48 48 4977 3090 FO
+ 48 48 5416 2979 FO
+ 48 48 5855 2882 FO
+ 48 48 6294 2791 FO
+ 48 48 6733 2713 FO
+ 48 48 7171 2638 FO
+gs 1070 452 6380 4909 rc
+gr
+
+-55 95 -55 -95 110 0 1411 5203 4 MP
+DP
+-55 95 -55 -95 110 0 1850 4905 4 MP
+DP
+-55 95 -55 -95 110 0 2289 4630 4 MP
+DP
+-55 95 -55 -95 110 0 2728 4368 4 MP
+DP
+-55 95 -55 -95 110 0 3167 4001 4 MP
+DP
+-55 95 -55 -95 110 0 3605 3574 4 MP
+DP
+-55 95 -55 -95 110 0 4044 3123 4 MP
+DP
+-55 95 -55 -95 110 0 4483 2661 4 MP
+DP
+-55 95 -55 -95 110 0 4922 2196 4 MP
+DP
+-55 95 -55 -95 110 0 5361 1735 4 MP
+DP
+-55 95 -55 -95 110 0 5800 1279 4 MP
+DP
+-55 95 -55 -95 110 0 6239 873 4 MP
+DP
+gs 1070 452 6380 4909 rc
+gr
+
+0 sg
+ 617 4557 mt -90 rotate
+(Mean number of lexical entries \(tables\)) s
+90 rotate
+3390 5724 mt
+(Word frequency \(n) s
+%%IncludeResource: font Helvetica
+/Helvetica /ISOLatin1Encoding 144 FMSR
+
+4963 5820 mt
+(w) s
+%%IncludeResource: font Helvetica
+/Helvetica /ISOLatin1Encoding 192 FMSR
+
+5066 5724 mt
+(\)) s
+%%IncludeResource: font Helvetica
+/Helvetica /ISOLatin1Encoding 120 FMSR
+
+1053 5403 mt
+( ) s
+7433 494 mt
+( ) s
+6 w
+1 sg
+0 729 2519 0 0 -729 1129 1241 4 MP
+PP
+-2519 0 0 729 2519 0 0 -729 1129 1241 5 MP stroke
+4 w
+DO
+SO
+6 w
+0 sg
+1129 1241 mt 3648 1241 L
+1129 512 mt 3648 512 L
+1129 1241 mt 1129 512 L
+3648 1241 mt 3648 512 L
+1129 1241 mt 3648 1241 L
+1129 1241 mt 1129 512 L
+1129 1241 mt 3648 1241 L
+1129 512 mt 3648 512 L
+1129 1241 mt 1129 512 L
+3648 1241 mt 3648 512 L
+%%IncludeResource: font Helvetica
+/Helvetica /ISOLatin1Encoding 192 FMSR
+
+1609 712 mt
+(Expectation) s
+gs 1129 512 2520 730 rc
+24 w
+370 0 1202 644 2 MP stroke
+gr
+
+24 w
+1609 945 mt
+(Empirical, fixed base) s
+gs 1129 512 2520 730 rc
+gs 1265 754 245 245 rc
+ 48 48 1387 876 FO
+gr
+
+gr
+
+1609 1178 mt
+(Empirical, inferred base) s
+gs 1129 512 2520 730 rc
+gs 1265 986 245 245 rc
+-55 95 -55 -95 110 0 1332 1140 4 MP
+DP
+gr
+
+6 w
+gr
+
+6 w
+
+end %%Color Dict
+
+eplot
+%%EndObject
+
+epage
+end
+
+showpage
+
+%%Trailer
+%%EOF
diff --git a/report/pyp_clustering/acl09-short/code/plot3.pdf b/report/pyp_clustering/acl09-short/code/plot3.pdf
new file mode 100644
index 00000000..a3e81faa
--- /dev/null
+++ b/report/pyp_clustering/acl09-short/code/plot3.pdf
Binary files differ
diff --git a/report/pyp_clustering/acl09-short/code/pygibbs3.c b/report/pyp_clustering/acl09-short/code/pygibbs3.c
new file mode 100644
index 00000000..3c2240a1
--- /dev/null
+++ b/report/pyp_clustering/acl09-short/code/pygibbs3.c
@@ -0,0 +1,198 @@
+#include <stdio.h>
+#include <math.h>
+
+#define myrand() (double) (((unsigned long) randomMT()) / 4294967296.)
+
+#define W 30114
+#define N 831190
+#define KWMAX 1000
+
+#define NLOOPS 1000
+#define BURNIN 0
+#define SAMPLEFREQ 1
+
+#define ALPHA 0.0 // PYB a
+//#define GAMMA 1000000000.0
+#define GAMMA .01 // Dirichlet over multinomial P0
+
+double BETA; // CRP alpha (PYB b)
+int w[N], z[N]; // words, table assignments
+int typecount[W], typetot; //# of tables of each type, total # tables
+int usedcount[W];
+double ztot[W][KWMAX];
+double k; // total # tables
+int nactive;
+
+void initialise(void);
+void anderson(void);
+void fileread(void);
+
+/*
+ * Reset the sampler state: the per-type table count and used-slot count of
+ * every word type, every per-table customer count, and the global totals.
+ */
+void initialise(void)
+{
+    int i, j;
+
+    /* Start at 0: the original loop began at 1 and never reset word type 0. */
+    for (i = 0; i < W; i++) {
+        typecount[i] = 0;
+        usedcount[i] = 0;
+        for (j = 0; j < KWMAX; j++) {
+            ztot[i][j] = 0;
+        }
+    }
+
+    /* The totals are printed by main() right after this call, so reset them too. */
+    typetot = 0;
+    k = 0;
+}
+
+/*
+ * Stochastic Anderson-style initialisation: seat the tokens one at a time,
+ * each draw conditioned only on the tokens already seated.
+ */
+void anderson(void)
+{
+    int tok, t;
+    double cum, total, u;
+    double weight[KWMAX];
+
+    /* The first token always opens slot 0 of its word type. */
+    z[0] = 0;
+    ztot[w[0]][0] = 1;
+    typecount[w[0]] = 1;
+    usedcount[w[0]] = 1;
+    typetot = 1;
+    k = 1;
+
+    for (tok = 1; tok < N; tok++) {
+        const int word = w[tok];
+        const int fresh = usedcount[word];   /* slot a new table would occupy */
+
+        /* Existing slots: customer count minus ALPHA. */
+        total = 0;
+        for (t = 0; t < fresh; t++) {
+            weight[t] = ztot[word][t] - ALPHA;
+            total += weight[t];
+        }
+
+        /* New table: (ALPHA*k+BETA) times the GAMMA-smoothed share of tables
+           already held by this word type. */
+        weight[fresh] = (ALPHA*k+BETA)*((double) typecount[word]+GAMMA)/((double) typetot+W*GAMMA);
+        total += weight[fresh];
+
+        /* Walk the running sum of the unnormalised weights until it reaches u. */
+        u = myrand()*total;
+        t = 0;
+        for (cum = weight[0]; u > cum; cum += weight[t]) {
+            t++;
+        }
+
+        z[tok] = t;
+        ztot[word][t]++;
+        if (ztot[word][t] == 1) {
+            /* The chosen slot was empty, so a new table has been opened. */
+            typecount[word]++;
+            usedcount[word]++;
+            if (usedcount[word] == KWMAX) {
+                printf("Maximum number of tables exceeded!!!\n");
+            }
+            typetot++;
+            k++;
+        }
+    }
+}
+
+/*
+ * Read N word ids from "wsj.dat" into w[] (each stored as id - 1) and reset
+ * every table assignment z[] to 0.
+ *
+ * If the file cannot be opened, or holds fewer than N parseable integers, a
+ * message is written to stderr and reading stops; entries that were not read
+ * keep whatever value they had before the call.
+ */
+void fileread(void)
+{
+    int i, wt;
+    FILE *fileptr;
+
+    fileptr = fopen("wsj.dat", "r");
+    if (fileptr == NULL) {
+        perror("wsj.dat");
+        return;
+    }
+
+    /* Start at 0: the original loop began at 1, so w[0] was never read and
+       every token was stored one position late. */
+    for (i = 0; i < N; i++) {
+        if (fscanf(fileptr, "%d", &wt) != 1) {
+            fprintf(stderr, "wsj.dat: expected %d tokens, read only %d\n", N, i);
+            break;
+        }
+        w[i] = wt-1;
+        z[i] = 0;
+    }
+    printf("Total cases: %10d\n", N);
+    fclose(fileptr);
+}
+
+/*
+ * Entry point: reads the corpus, builds a start state with anderson(), then
+ * runs NLOOPS Gibbs sweeps over the table assignment of every token.  After
+ * each retained sweep (loop >= BURNIN and loop % SAMPLEFREQ == 0) the table
+ * count of every word type is appended as one line to
+ * "typecountrecordwsjpeak<ALPHA>.<BETA>.dat".
+ *
+ * argv[1] is the value of b (stored in BETA).
+ * Returns 0 on success; 1 on a missing/unparseable argument, an output file
+ * that cannot be opened, or exhaustion of the KWMAX table slots.
+ *
+ * seedMT()/randomMT() are not declared in this file; presumably they come
+ * from cokus.c -- TODO confirm.
+ */
+int main(int argc, char* argv[])
+{
+    int i, j, loop;
+    int tag;
+    double probs[KWMAX];
+    double max, totprob, r;
+    FILE *fileptr;
+    char filename[128];   /* the original 30 bytes were too small for every BETA */
+
+    if (argc < 2) {
+        printf("Please provide a value of b\n");
+        return 1;
+    }
+    /* The original called strtol() with a single argument; parse b as a double. */
+    if (sscanf(argv[1], "%lf", &BETA) != 1) {
+        printf("Could not parse '%s' as a value of b\n", argv[1]);
+        return 1;
+    }
+    printf("Basic initialising...\n");
+
+    // you can seed with any uint32, but the best are odds in 0..(2^32 - 1)
+    seedMT(4157U);
+
+    snprintf(filename, sizeof filename, "typecountrecordwsjpeak%0.1f.%0.1f.dat", ALPHA, BETA);
+    fileptr = fopen(filename, "w");
+    if (fileptr == NULL) {
+        perror(filename);
+        return 1;
+    }
+
+    printf("Reading from file...\n");
+    fileread();
+
+    printf("Initialising...\n");
+    initialise();
+    printf("k = %1.0f, typetot = %d\n",k,typetot);
+
+    printf("Finding start state...\n");
+    anderson();
+    printf("Beginning burnin...\n");
+    for (loop = 0; loop < NLOOPS; loop++) {
+        for (i = 0; i < N; i++) {
+            /* Take token i out of its current table. */
+            j = z[i];
+            ztot[w[i]][j]--;
+            if (ztot[w[i]][j] == 0) {
+                /* NOTE(review): z[i] appears to always be below usedcount[w[i]],
+                   so this test looks unreachable and usedcount never shrinks --
+                   confirm before changing. */
+                if (j==usedcount[w[i]]) {
+                    usedcount[w[i]]--;
+                }
+                typecount[w[i]]--;
+                typetot--;
+                k--;
+            }
+
+            /* Occupied slots weigh count - ALPHA.  The first empty slot found
+               stands for "new table"; any further empty slots weigh 0. */
+            tag = 0; totprob = 0;
+            for (j = 0; j <= usedcount[w[i]]; j++) {
+                if (ztot[w[i]][j] > 0) {
+                    probs[j] = ztot[w[i]][j] - ALPHA;
+                } else {
+                    probs[j] = 0;
+                    if (tag == 0) {
+                        probs[j] = (ALPHA*k+BETA)*(((double) typecount[w[i]])+GAMMA)/(((double) typetot)+((double) W)*GAMMA);
+                        tag = 1;
+                    }
+                }
+                totprob += probs[j];
+            }
+
+            /* Walk the running sum of the weights until it reaches r. */
+            r = myrand()*totprob;
+            max = probs[0];
+            j = 0;
+            while (r>max) {
+                j++;
+                max += probs[j];
+            }
+
+            /* Seat token i at the sampled slot. */
+            z[i] = j;
+            ztot[w[i]][j]++;
+            if (ztot[w[i]][j]==1) {
+                if (j == usedcount[w[i]]) {
+                    usedcount[w[i]]++;
+                    if (usedcount[w[i]]==KWMAX) {
+                        printf("Maximum number of tables exceeded!!!\n");
+                        /* The next visit to this word would index probs[] and
+                           ztot[][] at KWMAX, one past the end: stop here. */
+                        fclose(fileptr);
+                        return 1;
+                    }
+                }
+                typecount[w[i]]++;
+                typetot++;
+                k++;
+            }
+        }
+        printf("Completed sample # %5d\n", loop);
+        if (k != typetot) printf("k = %1.0f, typetot = %d\n",k,typetot);
+        if (loop >= BURNIN && loop % SAMPLEFREQ == 0) {
+            for (i = 0; i < W; i++) {
+                fprintf(fileptr," %d", typecount[i]); //print (table?) count for each word type
+            }
+            fprintf(fileptr,"\n");
+        }
+    }
+    fclose(fileptr);
+    return 0;
+}
+
diff --git a/report/pyp_clustering/acl09-short/code/pygibbs_geom b/report/pyp_clustering/acl09-short/code/pygibbs_geom
new file mode 100755
index 00000000..14ae82f1
--- /dev/null
+++ b/report/pyp_clustering/acl09-short/code/pygibbs_geom
Binary files differ
diff --git a/report/pyp_clustering/acl09-short/code/pygibbs_geom.c b/report/pyp_clustering/acl09-short/code/pygibbs_geom.c
new file mode 100644
index 00000000..bafa0416
--- /dev/null
+++ b/report/pyp_clustering/acl09-short/code/pygibbs_geom.c
@@ -0,0 +1,212 @@
+#include <stdio.h>
+#include <math.h>
+
+#define myrand() (double) (((unsigned long) randomMT()) / 4294967296.)
+
+#define W 30114
+#define N 831190
+#define KWMAX 5000
+
+#define NLOOPS 11000
+#define BURNIN 1000
+#define SAMPLEFREQ 10
+
+#define ALPHA 0.0 // PYB a
+//#define GAMMA 1000000000.0
+#define GAMMA .01 // Dirichlet over multinomial P0
+
+double BETA; // CRP alpha (PYB b)
+int w[N], z[N]; // words, table assignments
+double base[N]; // base prob of word under geometric
+int typecount[W], typetot; //# of tables of each type, total # tables
+int usedcount[W];
+double ztot[W][KWMAX];
+double k; // total # tables
+int nactive;
+
+void initialise(void);
+void anderson(void);
+void fileread(void);
+
+/*
+ * Reset the sampler state: the per-type table count and used-slot count of
+ * every word type, every per-table customer count, and the global totals.
+ */
+void initialise(void)
+{
+    int i, j;
+
+    /* Start at 0: the original loop began at 1 and never reset word type 0. */
+    for (i = 0; i < W; i++) {
+        typecount[i] = 0;
+        usedcount[i] = 0;
+        for (j = 0; j < KWMAX; j++) {
+            ztot[i][j] = 0;
+        }
+    }
+
+    /* The totals are printed by main() right after this call, so reset them too. */
+    typetot = 0;
+    k = 0;
+}
+
+/*
+ * Base probability of a word of `len` characters: a factor of 1/26 and a
+ * factor of .5 for every character.
+ */
+double base_p(int len) {
+    const double letter_p = 1.0/26;
+    const double half = .5;          /* assume p_# = .5 */
+    double letters = pow(letter_p, len);
+    double halves = pow(half, len);
+
+    return letters*halves;
+}
+
+/*
+ * Stochastic Anderson-style initialisation: seat the tokens one at a time,
+ * each draw conditioned only on the tokens already seated.
+ */
+void anderson(void)
+{
+    int tok, t;
+    double cum, total, u;
+    double weight[KWMAX];
+
+    /* The first token always opens slot 0 of its word type. */
+    z[0] = 0;
+    ztot[w[0]][0] = 1;
+    typecount[w[0]] = 1;
+    usedcount[w[0]] = 1;
+    typetot = 1;
+    k = 1;
+
+    for (tok = 1; tok < N; tok++) {
+        const int word = w[tok];
+        const int fresh = usedcount[word];   /* slot a new table would occupy */
+
+        /* Existing slots: customer count minus ALPHA. */
+        total = 0;
+        for (t = 0; t < fresh; t++) {
+            weight[t] = ztot[word][t] - ALPHA;
+            total += weight[t];
+        }
+
+        /* New table: (ALPHA*k+BETA) times this token's stored base probability. */
+        weight[fresh] = (ALPHA*k+BETA)*base[tok];
+        total += weight[fresh];
+
+        /* Walk the running sum of the unnormalised weights until it reaches u. */
+        u = myrand()*total;
+        t = 0;
+        for (cum = weight[0]; u > cum; cum += weight[t]) {
+            t++;
+        }
+
+        z[tok] = t;
+        ztot[word][t]++;
+        if (ztot[word][t] == 1) {
+            /* The chosen slot was empty, so a new table has been opened. */
+            typecount[word]++;
+            usedcount[word]++;
+            if (usedcount[word] == KWMAX) {
+                printf("Maximum number of tables exceeded!!!\n");
+            }
+            typetot++;
+            k++;
+        }
+    }
+}
+
+/*
+ * Load the corpus:
+ *   - "wsj.dat":         N word ids, stored in w[] as id - 1; z[] is reset to 0.
+ *   - "wsj_lengths.dat": N word lengths, converted with base_p() into base[].
+ *
+ * If a file cannot be opened, or holds fewer than N parseable integers, a
+ * message is written to stderr and reading of that file stops; entries that
+ * were not read keep whatever value they had before the call.
+ */
+void fileread(void)
+{
+    int i, wt, len;
+    FILE *fileptr;
+
+    fileptr = fopen("wsj.dat", "r");
+    if (fileptr == NULL) {
+        perror("wsj.dat");
+        return;
+    }
+
+    /* Start at 0: the original loop began at 1, so w[0] was never read and
+       every token was stored one position late. */
+    for (i = 0; i < N; i++) {
+        if (fscanf(fileptr, "%d", &wt) != 1) {
+            fprintf(stderr, "wsj.dat: expected %d tokens, read only %d\n", N, i);
+            break;
+        }
+        w[i] = wt-1;
+        z[i] = 0;
+    }
+    printf("Total cases: %10d\n", N);
+    fclose(fileptr);
+
+    fileptr = fopen("wsj_lengths.dat", "r");
+    if (fileptr == NULL) {
+        perror("wsj_lengths.dat");
+        return;
+    }
+
+    /* Same fix: the original left base[0] at 0, giving token 0 a zero
+       new-table weight. */
+    for (i = 0; i < N; i++) {
+        if (fscanf(fileptr, "%d", &len) != 1) {
+            fprintf(stderr, "wsj_lengths.dat: expected %d lengths, read only %d\n", N, i);
+            break;
+        }
+        base[i] = base_p(len);
+    }
+    fclose(fileptr);
+}
+
+/*
+ * Entry point: reads the corpus, builds a start state with anderson(), then
+ * runs NLOOPS Gibbs sweeps over the table assignment of every token.  After
+ * each retained sweep (loop >= BURNIN and loop % SAMPLEFREQ == 0) the table
+ * count of every word type is appended as one line to
+ * "typecountrecordwsjgeom<ALPHA>.<BETA>.dat".
+ *
+ * argv[1] is the value of b (stored in BETA).
+ * Returns 0 on success; 1 on a missing/unparseable argument, an output file
+ * that cannot be opened, or exhaustion of the KWMAX table slots.
+ *
+ * seedMT()/randomMT() are not declared in this file; presumably they come
+ * from cokus.c -- TODO confirm.
+ */
+int main(int argc, char* argv[])
+{
+    int i, j, loop;
+    int tag;
+    double probs[KWMAX];
+    double max, totprob, r;
+    FILE *fileptr;
+    char filename[128];   /* the original 30 bytes were too small for every BETA */
+
+    if (argc < 2) {
+        printf("Please provide a value of b\n");
+        return 1;
+    }
+    /* The original called strtol() with a single argument; parse b as a double. */
+    if (sscanf(argv[1], "%lf", &BETA) != 1) {
+        printf("Could not parse '%s' as a value of b\n", argv[1]);
+        return 1;
+    }
+    printf("Basic initialising...\n");
+
+    // you can seed with any uint32, but the best are odds in 0..(2^32 - 1)
+    seedMT(4157U);
+
+    snprintf(filename, sizeof filename, "typecountrecordwsjgeom%0.1f.%0.1f.dat", ALPHA, BETA);
+    fileptr = fopen(filename, "w");
+    if (fileptr == NULL) {
+        perror(filename);
+        return 1;
+    }
+
+    printf("Reading from file...\n");
+    fileread();
+
+    printf("Initialising...\n");
+    initialise();
+    printf("k = %1.0f, typetot = %d\n",k,typetot);
+
+    printf("Finding start state...\n");
+    anderson();
+    printf("Beginning burnin...\n");
+    for (loop = 0; loop < NLOOPS; loop++) {
+        for (i = 0; i < N; i++) {
+            /* Take token i out of its current table. */
+            j = z[i];
+            ztot[w[i]][j]--;
+            if (ztot[w[i]][j] == 0) {
+                /* NOTE(review): z[i] appears to always be below usedcount[w[i]],
+                   so this test looks unreachable and usedcount never shrinks --
+                   confirm before changing. */
+                if (j==usedcount[w[i]]) {
+                    usedcount[w[i]]--;
+                }
+                typecount[w[i]]--;
+                typetot--;
+                k--;
+            }
+
+            /* Occupied slots weigh count - ALPHA.  The first empty slot found
+               stands for "new table"; any further empty slots weigh 0. */
+            tag = 0; totprob = 0;
+            for (j = 0; j <= usedcount[w[i]]; j++) {
+                if (ztot[w[i]][j] > 0) {
+                    probs[j] = ztot[w[i]][j] - ALPHA;
+                } else {
+                    probs[j] = 0;
+                    if (tag == 0) {
+                        probs[j] = (ALPHA*k+BETA)*base[i];
+                        tag = 1;
+                    }
+                }
+                totprob += probs[j];
+            }
+
+            /* Walk the running sum of the weights until it reaches r. */
+            r = myrand()*totprob;
+            max = probs[0];
+            j = 0;
+            while (r>max) {
+                j++;
+                max += probs[j];
+            }
+
+            /* Seat token i at the sampled slot. */
+            z[i] = j;
+            ztot[w[i]][j]++;
+            if (ztot[w[i]][j]==1) {
+                if (j == usedcount[w[i]]) {
+                    usedcount[w[i]]++;
+                    if (usedcount[w[i]]==KWMAX) {
+                        printf("Maximum number of tables exceeded!!!\n");
+                        /* The next visit to this word would index probs[] and
+                           ztot[][] at KWMAX, one past the end: stop here. */
+                        fclose(fileptr);
+                        return 1;
+                    }
+                }
+                typecount[w[i]]++;
+                typetot++;
+                k++;
+            }
+        }
+        printf("Completed sample # %5d\n", loop);
+        if (k != typetot) printf("k = %1.0f, typetot = %d\n",k,typetot);
+        if (loop >= BURNIN && loop % SAMPLEFREQ == 0) {
+            for (i = 0; i < W; i++) {
+                fprintf(fileptr," %d", typecount[i]); //print (table?) count for each word type
+            }
+            fprintf(fileptr,"\n");
+        }
+    }
+    fclose(fileptr);
+    return 0;
+}
+
diff --git a/report/pyp_clustering/acl09-short/code/run-peak.prl b/report/pyp_clustering/acl09-short/code/run-peak.prl
new file mode 100755
index 00000000..fb1e798a
--- /dev/null
+++ b/report/pyp_clustering/acl09-short/code/run-peak.prl
@@ -0,0 +1,8 @@
+#!/usr/bin/perl
+
+# Launch the sampler once for each b = 10^0 .. 10^5, echoing every command.
+foreach my $exponent (0 .. 5) {
+    my $beta = 10 ** $exponent;
+    my $cmd  = "pygibbs_peak $beta\n";
+    print $cmd;
+    # Backticks run the command through the shell; the captured stdout is discarded.
+    `$cmd`;
+}
diff --git a/report/pyp_clustering/acl09-short/code/run.prl b/report/pyp_clustering/acl09-short/code/run.prl
new file mode 100755
index 00000000..ac69559c
--- /dev/null
+++ b/report/pyp_clustering/acl09-short/code/run.prl
@@ -0,0 +1,8 @@
+#!/usr/bin/perl
+
+# Launch the sampler once for each b = 10^0 .. 10^5, echoing every command.
+foreach my $exponent (0 .. 5) {
+    my $beta = 10 ** $exponent;
+    my $cmd  = "pygibbs_geom $beta\n";
+    print $cmd;
+    # Backticks run the command through the shell; the captured stdout is discarded.
+    `$cmd`;
+}
diff --git a/report/pyp_clustering/acl09-short/code/word_lengths.prl b/report/pyp_clustering/acl09-short/code/word_lengths.prl
new file mode 100755
index 00000000..4b4ed03b
--- /dev/null
+++ b/report/pyp_clustering/acl09-short/code/word_lengths.prl
@@ -0,0 +1,21 @@
+#!/usr/bin/perl -w
+use Getopt::Std;
+use File::Basename;
+use List::Util qw(max maxstr min minstr reduce shuffle sum);
+use lib "$ENV{HOME}/src/perl/";
+use sg_utils;
+use strict;
+use vars qw();
+
+# Usage banner; the guard below is always true, so it is never printed.
+my $usage = "Usage: $0 \n";
+
+# No option letters are declared.
+getopts('');
+
+die $usage unless (1);
+
+# For every input line, print its length (trailing newline removed) on its own line.
+while (defined(my $line = <>)) {
+    chomp $line;
+    print length($line), "\n";
+}
+
diff --git a/report/pyp_clustering/acl09-short/code/wsjplots2.m b/report/pyp_clustering/acl09-short/code/wsjplots2.m
new file mode 100644
index 00000000..eed41846
--- /dev/null
+++ b/report/pyp_clustering/acl09-short/code/wsjplots2.m
@@ -0,0 +1,99 @@
+
+% wsjplots2: log-log plots of mean number of tables against word frequency.
+%   Right panel (subplot 1,2,2): Pitman-Yor adaptor, a = 0.1 .. 0.9, grey
+%     curves counts.^a plus sampled means with error bars.
+%   Left panel  (subplot 1,2,1): CRP adaptor, b = 1 .. 10^4, grey curves from
+%     crppred plus sampled means with error bars.
+% Needs wsj.mat (the only source of `counts` in this script), logbinmean.m,
+% crppred.m and the typecountrecordwsjflat*.dat sampler outputs.
+% logbinmean's three outputs are used as bin positions, per-bin mean and
+% per-bin error -- presumably a standard error; confirm in logbinmean.m.
+
+load wsj
+
+figure(1)
+clf
+subplot(1,2,2)
+hold on
+
+% Grey prediction curves counts.^a.
+for i = 1:9
+    a = i/10;
+    [logbins predicted dummy] = logbinmean(counts,counts.^a,20,20);
+    ph = plot(log10(logbins),log10(predicted),'k');
+    set(ph,'color',[0.7 0.7 0.7],'linewidth',1.5)
+end
+
+% Sampled table counts for each a, averaged over rows 500:1000 of the record.
+for i = 1:9
+    a = i/10;
+    disp(['Loading results for a = ' num2str(a) ]);
+
+    typecountrecord= load([ 'typecountrecordwsjflat' num2str(a) '.1.0.dat']);
+
+    typecountrecordmean = mean(typecountrecord(500:1000,:));
+
+    save([ 'typecountrecordmeanwsjflat' num2str(a) '.1.0.mat'],'typecountrecordmean');
+
+    [logbins meanval seval] = logbinmean(counts,typecountrecordmean,20,20)
+    % errorbar takes (x, y, length below, length above).  The original passed
+    % the upward distance as the lower length and a negative number as the
+    % upper length; pass the two log-space distances in the right slots.
+    errorbar(log10(logbins),log10(meanval),log10(meanval)-log10(meanval-seval),log10(meanval+seval)-log10(meanval),'k.');
+    drawnow
+end
+
+
+
+
+% Curve and points for the identity mapping (tables == counts).
+[logbins meanval seval] = logbinmean(counts,counts,20,20)
+[logbins predicted dummy] = logbinmean(counts,counts,20,20)
+ph = plot(log10(logbins),log10(predicted),'r');
+hold on
+errorbar(log10(logbins),log10(meanval),log10(meanval)-log10(meanval-seval),log10(meanval+seval)-log10(meanval),'k.');
+
+set(ph,'color',[0.7 0.7 0.7],'linewidth',1.5)
+
+% Log-spaced ticks; only the powers of ten are labelled.
+set(gca,'xtick',log10([1:10 20:10:100 200:100:1000 2000:1000:5000]))
+set(gca,'ytick',log10([1:10 20:10:100 200:100:1000 2000:1000:5000]))
+set(gca,'xlim',[-0.1 3.5])
+set(gca,'ylim',[-0.1 3.5])
+set(gca,'xticklabel', {'1',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ...
+    '10',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '100', ...
+    ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '1000', ...
+    ' ', ' ', ' ', ' '});
+set(gca,'yticklabel', {'1',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ...
+    '10',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '100', ...
+    ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '1000', ...
+    ' ', ' ', ' ', ' '});
+
+title('Pitman-Yor process adaptor')
+ylabel('Mean number of lexical entries')
+xlabel('Word frequency (n_w)')
+box on
+
+subplot(1,2,1)
+
+% CRP adaptor: one prediction curve and one set of sampled means per b.
+for i = 1:5
+
+    b = 10^(i-1)
+
+    disp(['Loading results for b = ' num2str(b) ]);
+    typecountrecord= load([ 'typecountrecordwsjflat0.0.' num2str(b) '.0.dat']);
+
+    typecountrecordmean = mean(typecountrecord(500:1000,:));
+    save([ 'typecountrecordmeanwsjflat0.0.' num2str(b) '.0.mat'],'typecountrecordmean');
+
+    [logbins meanval seval] = logbinmean(counts,typecountrecordmean,20,20)
+    [logbins predicted dummy] = logbinmean(counts,crppred(counts,b),20,20)
+%    errorbar(log10(logbins),meanval,seval,'k.');
+    hold on
+    ph = plot(log10(logbins),log10(predicted),'r');
+    % ph = plot(log10(logbins),predicted,'r');
+    set(ph,'color',[0.7 0.7 0.7],'linewidth',1.5)
+    errorbar(log10(logbins),log10(meanval),log10(meanval)-log10(meanval-seval),log10(meanval+seval)-log10(meanval),'k.');
+end
+
+set(gca,'xtick',log10([1:10 20:10:100 200:100:1000 2000:1000:5000]))
+set(gca,'ytick',log10([1:10 20:10:100 200:100:1000 2000:1000:5000]))
+set(gca,'xlim',[-0.1 3.5])
+set(gca,'ylim',[-0.1 1.5])
+set(gca,'xticklabel', {'1',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ...
+    '10',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '100', ...
+    ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '1000', ...
+    ' ', ' ', ' ', ' '});
+set(gca,'yticklabel', {'1',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ...
+    '10',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '100', ...
+    ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '1000', ...
+    ' ', ' ', ' ', ' '});
+title('Chinese restaurant process adaptor')
+ylabel('Mean number of lexical entries')
+xlabel('Word frequency (n_w)')
+box on
+
+
diff --git a/report/pyp_clustering/acl09-short/code/wsjplots_acl.m b/report/pyp_clustering/acl09-short/code/wsjplots_acl.m
new file mode 100644
index 00000000..50582e7f
--- /dev/null
+++ b/report/pyp_clustering/acl09-short/code/wsjplots_acl.m
@@ -0,0 +1,74 @@
+% wsjplots_acl: single log-log figure of mean number of tables against word
+% frequency for b = 10^2 .. 10^5.  For each b it draws, in this order, the
+% crppred curve (solid grey), the antoniakpred curve (dashed grey), the
+% sampled means from the "flat" outputs ('k*') and from the "peak" outputs
+% ('ko').  The legend at the bottom relies on that plotting order.
+% `counts` is not defined in this script; it must come from the loaded .mat
+% files.  logbinmean, crppred and antoniakpred are defined elsewhere.
+%wsj_lengths = load([ 'wsj_lengths.dat']);
+%save([ 'wsj_lengths.mat'],'wsj_lengths');
+load wsj
+load wsj_lengths
+
+figure(1)
+clf
+
+hold on
+
+for i = 3:6
+
+ b = 10^(i-1)
+
+ % plot lines for CRP exact prediction using summation
+ [logbins predicted dummy] = logbinmean(counts, crppred(counts,b),20,20);
+ ph = plot(log10(logbins),log10(predicted),'r');
+ set(ph,'color',[0.7 0.7 0.7],'linewidth',1.5)
+
+ % plot lines for CRP Antoniak prediction
+ [logbins predicted dummy] = logbinmean(counts, antoniakpred(counts,b),20,20);
+ ph = plot(log10(logbins),log10(predicted),'r');
+ set(ph,'color',[0.7 0.7 0.7],'linewidth',1.5,'linestyle','--')
+
+ % plot lines for CRP Cohn prediction
+ %[logbins predicted dummy] = logbinmean(counts, cohnpred(counts,b),20,20);
+ %ph = plot(log10(logbins),log10(predicted),'r');
+ %set(ph,'color',[0.2 0.2 1],'linewidth',1.5,'linestyle','.')
+
+ % "flat" sampler results (legend entry 'Empirical, fixed base')
+ disp(['Loading results for b = ' num2str(b) ]);
+ %%% uncomment these lines if .mat file is not yet generated. %%%
+ %typecountrecord= load([ 'outputs/typecountrecordwsjflat0.0.' num2str(b) '.0.dat']);
+ %typecountrecordmean = mean(typecountrecord(:,:));
+ %save([ 'outputs/typecountrecordmeanwsjflat0.0.' num2str(b) '.0.mat'],'typecountrecordmean');
+ load([ 'outputs/typecountrecordmeanwsjflat0.0.' num2str(b) '.0.mat']);
+
+ % plot empirical means as markers (the error-bar variant is disabled below)
+ [logbins meanval seval] = logbinmean(counts,typecountrecordmean,20,20);
+ plot(log10(logbins),log10(meanval),'k*');
+ %errorbar(log10(logbins),log10(meanval),log10(meanval+seval)-log10(meanval),log10(meanval-seval)-log10(meanval),'k.');
+
+ % "peak" sampler results (legend entry 'Empirical, inferred base');
+ % this load overwrites typecountrecordmean from the flat file above
+ disp(['Loading results for b = ' num2str(b) ]);
+ %%% uncomment these lines if .mat file is not yet generated. %%%
+ %typecountrecord= load([ 'outputs/typecountrecordwsjpeak0.0.' num2str(b) '.0.dat']);
+ %typecountrecordmean = mean(typecountrecord(:,:));
+ %save([ 'outputs/typecountrecordmeanwsjpeak0.0.' num2str(b) '.0.mat'],'typecountrecordmean');
+ load([ 'outputs/typecountrecordmeanwsjpeak0.0.' num2str(b) '.0.mat']);
+
+ % plot empirical means as markers (the error-bar variant is disabled below)
+ [logbins meanval seval] = logbinmean(counts,typecountrecordmean,20,20);
+ plot(log10(logbins),log10(meanval),'ko');
+ %errorbar(log10(logbins),log10(meanval),log10(meanval+seval)-log10(meanval),log10(meanval-seval)-log10(meanval),'ko');
+
+end
+
+% Log-spaced ticks on both axes; only the powers of ten carry a label.
+set(gca,'xtick',log10([1:10 20:10:100 200:100:1000 2000:1000:5000]))
+set(gca,'ytick',log10([.1:.1:1 2:10 20:10:100 200:100:1000 2000:1000:5000]))
+set(gca,'xlim',[-0.1 3.5])
+set(gca,'ylim',[-1.1 2.5])
+set(gca,'FontSize',14)
+set(gca,'xticklabel', {'1',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ...
+ '10',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '100', ...
+ ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '1000', ...
+ ' ', ' ', ' ', ' '});
+set(gca,'yticklabel', {'0.1',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ...
+ '1',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ...
+ '10',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '100', ...
+ ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '1000', ...
+ ' ', ' ', ' ', ' '});
+%title('Chinese restaurant process adaptor')
+ylabel('Mean number of lexical entries')
+xlabel('Word frequency (n_w)')
+% Legend labels are matched to the first four plotted objects.
+legend('Expectation','Antoniak approx.','Empirical, fixed base','Empirical, inferred base','Location','NorthWest')
+box on
diff --git a/report/pyp_clustering/acl09-short/code/wsjplots_acl_monkeys.m b/report/pyp_clustering/acl09-short/code/wsjplots_acl_monkeys.m
new file mode 100644
index 00000000..33419845
--- /dev/null
+++ b/report/pyp_clustering/acl09-short/code/wsjplots_acl_monkeys.m
@@ -0,0 +1,164 @@
+% wsjplots_acl_monkeys: three side-by-side log-log panels of mean number of
+% tables against word frequency, for b = 10 .. 10^5.
+%   Panel 1: "flat" sampler outputs with crppred and antoniakpred curves.
+%   Panel 2: "peak" sampler outputs with the crppred curve.
+%   Panel 3: "geom" sampler outputs with crppred values drawn as red dots.
+% `counts` is not defined in this script; it must come from the loaded .mat
+% files.  logbinmean, crppred and antoniakpred are defined elsewhere.
+%wsj_lengths = load([ 'wsj_lengths.dat']);
+%save([ 'wsj_lengths.mat'],'wsj_lengths');
+load wsj
+load wsj_lengths
+
+figure(1)
+clf
+
+subplot(1,3,1);
+hold on
+
+for i = 2:6
+
+    b = 10^(i-1)
+
+    disp(['Loading results for b = ' num2str(b) ]);
+%%% uncomment these lines if .mat file is not yet generated. %%%
+    %typecountrecord= load([ 'outputs/typecountrecordwsjflat0.0.' num2str(b) '.0.dat']);
+    %typecountrecordmean = mean(typecountrecord(500:999,:));
+    %save([ 'outputs/typecountrecordmeanwsjflat0.0.' num2str(b) '.0.mat'],'typecountrecordmean');
+    load([ 'outputs/typecountrecordmeanwsjflat0.0.' num2str(b) '.0.mat']);
+
+    % plot lines for CRP exact prediction using summation
+    [logbins predicted dummy] = logbinmean(counts, crppred(counts,b),20,20);
+    ph = plot(log10(logbins),log10(predicted),'r');
+    set(ph,'color',[0.7 0.7 0.7],'linewidth',1.5)
+
+    % plot lines for CRP Antoniak prediction
+    [logbins predicted dummy] = logbinmean(counts, antoniakpred(counts,b),20,20);
+    ph = plot(log10(logbins),log10(predicted),'r');
+    set(ph,'color',[0.7 0.7 0.7],'linewidth',1.5,'linestyle','--')
+
+    % plot lines for incorrect CRP Antoniak prediction (ACL07)
+    %[logbins predicted dummy] = logbinmean(counts, noP0pred(counts,b),20,20);
+    %ph = plot(log10(logbins),log10(predicted),'r');
+    %set(ph,'color',[0.7 0.7 0.7],'linewidth',1.5,'linestyle','-.')
+
+    % plot lines for CRP Cohn prediction
+    %[logbins predicted dummy] = logbinmean(counts, cohnpred(counts,b),20,20);
+    %ph = plot(log10(logbins),log10(predicted),'r');
+    %set(ph,'color',[0.2 0.2 1],'linewidth',1.5,'linestyle','.')
+
+    % plot empirical counts with error bars.
+    % errorbar takes (x, y, length below, length above).  The original passed
+    % the upward distance as the lower length and a negative number as the
+    % upper length; pass the two log-space distances in the right slots.
+    [logbins meanval seval] = logbinmean(counts,typecountrecordmean,20,20);
+    errorbar(log10(logbins),log10(meanval),log10(meanval)-log10(meanval-seval),log10(meanval+seval)-log10(meanval),'k.');
+end
+
+% Log-spaced ticks; only the powers of ten are labelled.
+set(gca,'xtick',log10([1:10 20:10:100 200:100:1000 2000:1000:5000]))
+set(gca,'ytick',log10([.1:.1:1 2:10 20:10:100 200:100:1000 2000:1000:5000]))
+set(gca,'xlim',[-0.1 3.5])
+set(gca,'ylim',[-1.1 1.5])
+set(gca,'FontSize',14)
+set(gca,'xticklabel', {'1',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ...
+    '10',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '100', ...
+    ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '1000', ...
+    ' ', ' ', ' ', ' '});
+set(gca,'yticklabel', {'0.1',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ...
+    '1',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ...
+    '10',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '100', ...
+    ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '1000', ...
+    ' ', ' ', ' ', ' '});
+%title('Chinese restaurant process adaptor')
+ylabel('Mean number of lexical entries')
+xlabel('Word frequency (n_w)')
+legend('Expectation','Antoniak approx.','Empirical','Location','NorthWest')
+box on
+
+
+subplot(1,3,2);
+hold on
+
+for i =2:6
+
+    b = 10^(i-1)
+
+    disp(['Loading results for b = ' num2str(b) ]);
+%%% uncomment these lines if .mat file is not yet generated. %%%
+    %typecountrecord= load([ 'outputs/typecountrecordwsjpeak0.0.' num2str(b) '.0.dat']);
+    %typecountrecordmean = mean(typecountrecord(500:999,:));
+    %save([ 'outputs/typecountrecordmeanwsjpeak0.0.' num2str(b) '.0.mat'],'typecountrecordmean');
+    load([ 'outputs/typecountrecordmeanwsjpeak0.0.' num2str(b) '.0.mat']);
+
+    % plot lines for CRP exact prediction using summation
+    [logbins predicted dummy] = logbinmean(counts, crppred(counts,b),20,20);
+    ph = plot(log10(logbins),log10(predicted),'r');
+    set(ph,'color',[0.7 0.7 0.7],'linewidth',1.5)
+
+    % plot empirical counts with error bars (lower length first, then upper)
+    [logbins meanval seval] = logbinmean(counts,typecountrecordmean,20,20);
+    errorbar(log10(logbins),log10(meanval),log10(meanval)-log10(meanval-seval),log10(meanval+seval)-log10(meanval),'k.');
+end
+
+set(gca,'xtick',log10([1:10 20:10:100 200:100:1000 2000:1000:5000]))
+set(gca,'ytick',log10([1:10 20:10:100 200:100:1000 2000:1000:5000]))
+set(gca,'xlim',[-0.1 3.5])
+set(gca,'ylim',[-.1 2.5])
+set(gca,'FontSize',14)
+set(gca,'xticklabel', {'1',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ...
+    '10',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '100', ...
+    ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '1000', ...
+    ' ', ' ', ' ', ' '});
+set(gca,'yticklabel', {...%'0.1',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ...
+    '1',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ...
+    '10',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '100', ...
+    ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '1000', ...
+    ' ', ' ', ' ', ' '});
+%title('Chinese restaurant process adaptor')
+ylabel('Mean number of lexical entries')
+xlabel('Word frequency (n_w)')
+legend('Expectation','Location','NorthWest')
+box on
+%axis square
+
+
+subplot(1,3,3);
+hold on
+
+for i =2:6
+
+    b = 10^(i-1)
+
+    disp(['Loading results for b = ' num2str(b) ]);
+%%% uncomment these lines if .mat file is not yet generated. %%%
+    %typecountrecord= load([ 'outputs/typecountrecordwsjgeom0.0.' num2str(b) '.0.dat']);
+    %typecountrecordmean = mean(typecountrecord(500:999,:));
+    %save([ 'outputs/typecountrecordmeanwsjgeom0.0.' num2str(b) '.0.mat'],'typecountrecordmean');
+    load([ 'outputs/typecountrecordmeanwsjgeom0.0.' num2str(b) '.0.mat']);
+
+    % plot CRP exact prediction using summation, as red dots.
+    % Note the variable is `meaneval` here, distinct from `meanval` below.
+%    [logbins meaneval seval] = logbinmean(counts, crppred_geom(counts,wsj_lengths,b),20,20)
+    [logbins meaneval seval] = logbinmean(counts, crppred(counts,b),20,20)
+    plot(log10(logbins),log10(meaneval),'r.');
+%errorbar(log10(logbins),log10(meanval),log10(meanval+seval)-log10(meanval),log10(meanval-seval)-log10(meanval),'r.');
+%    ph = plot(log10(logbins),log10(meaneval),'r');
+%    set(ph,'color',[0.7 0.7 0.7],'linewidth',1.5)
+
+    % plot empirical counts with error bars (lower length first, then upper)
+    [logbins meanval seval] = logbinmean(counts,typecountrecordmean,20,20);
+    errorbar(log10(logbins),log10(meanval),log10(meanval)-log10(meanval-seval),log10(meanval+seval)-log10(meanval),'k.');
+end
+
+set(gca,'xtick',log10([1:10 20:10:100 200:100:1000 2000:1000:5000]))
+set(gca,'ytick',log10([1:10 20:10:100 200:100:1000 2000:1000:5000]))
+set(gca,'xlim',[-0.1 3.5])
+set(gca,'ylim',[-.1 2.5])
+set(gca,'FontSize',14)
+set(gca,'xticklabel', {'1',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ...
+    '10',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '100', ...
+    ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '1000', ...
+    ' ', ' ', ' ', ' '});
+set(gca,'yticklabel', {...%'0.1',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ...
+    '1',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ...
+    '10',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '100', ...
+    ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '1000', ...
+    ' ', ' ', ' ', ' '});
+%title('Chinese restaurant process adaptor')
+ylabel('Mean number of lexical entries')
+xlabel('Word frequency (n_w)')
+legend('Expectation','Location','NorthWest')
+box on
+hold off
+%axis square
+
+
diff --git a/report/pyp_clustering/acl09-short/code/wsjplots_acl_pair.m b/report/pyp_clustering/acl09-short/code/wsjplots_acl_pair.m
new file mode 100644
index 00000000..1d07e54c
--- /dev/null
+++ b/report/pyp_clustering/acl09-short/code/wsjplots_acl_pair.m
@@ -0,0 +1,117 @@
+% wsjplots_acl_pair: two side-by-side log-log panels of mean number of tables
+% against word frequency, for b = 10^2 .. 10^5.
+%   Panel 1: "flat" sampler outputs with crppred and antoniakpred curves.
+%   Panel 2: "peak" sampler outputs with crppred and noP0pred curves.
+% `counts` is not defined in this script; it must come from the loaded .mat
+% files.  logbinmean, crppred, antoniakpred and noP0pred are defined elsewhere.
+%wsj_lengths = load([ 'wsj_lengths.dat']);
+%save([ 'wsj_lengths.mat'],'wsj_lengths');
+load wsj
+load wsj_lengths
+
+figure(1)
+clf
+
+subplot(1,2,1);
+hold on
+
+for i = 3:6
+
+    b = 10^(i-1)
+
+    disp(['Loading results for b = ' num2str(b) ]);
+%%% uncomment these lines if .mat file is not yet generated. %%%
+    %typecountrecord= load([ 'outputs/typecountrecordwsjflat0.0.' num2str(b) '.0.dat']);
+    %typecountrecordmean = mean(typecountrecord(:,:));
+    %save([ 'outputs/typecountrecordmeanwsjflat0.0.' num2str(b) '.0.mat'],'typecountrecordmean');
+    load([ 'outputs/typecountrecordmeanwsjflat0.0.' num2str(b) '.0.mat']);
+
+    % plot lines for CRP exact prediction using summation
+    [logbins predicted dummy] = logbinmean(counts, crppred(counts,b),20,20);
+    ph = plot(log10(logbins),log10(predicted),'r');
+    set(ph,'color',[0.7 0.7 0.7],'linewidth',1.5)
+
+    % plot lines for CRP Antoniak prediction
+    [logbins predicted dummy] = logbinmean(counts, antoniakpred(counts,b),20,20);
+    ph = plot(log10(logbins),log10(predicted),'r');
+    set(ph,'color',[0.7 0.7 0.7],'linewidth',1.5,'linestyle','--')
+
+    %plot lines for incorrect CRP Antoniak prediction (ACL07)
+    %[logbins predicted dummy] = logbinmean(counts, noP0pred(counts,b),20,20);
+    %ph = plot(log10(logbins),log10(predicted),'r');
+    %set(ph,'color',[0.7 0.7 0.7],'linewidth',1.5,'linestyle',':')
+
+    % plot lines for CRP Cohn prediction
+    %[logbins predicted dummy] = logbinmean(counts, cohnpred(counts,b),20,20);
+    %ph = plot(log10(logbins),log10(predicted),'r');
+    %set(ph,'color',[0.2 0.2 1],'linewidth',1.5,'linestyle','.')
+
+    % plot empirical counts with error bars.
+    % errorbar takes (x, y, length below, length above).  The original passed
+    % the upward distance as the lower length and a negative number as the
+    % upper length; pass the two log-space distances in the right slots.
+    [logbins meanval seval] = logbinmean(counts,typecountrecordmean,20,20);
+    errorbar(log10(logbins),log10(meanval),log10(meanval)-log10(meanval-seval),log10(meanval+seval)-log10(meanval),'k.');
+end
+
+% Log-spaced ticks; only the powers of ten are labelled.
+set(gca,'xtick',log10([1:10 20:10:100 200:100:1000 2000:1000:5000]))
+set(gca,'ytick',log10([.1:.1:1 2:10 20:10:100 200:100:1000 2000:1000:5000]))
+set(gca,'xlim',[-0.1 3.5])
+set(gca,'ylim',[-1.1 1.5])
+set(gca,'FontSize',14)
+set(gca,'xticklabel', {'1',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ...
+    '10',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '100', ...
+    ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '1000', ...
+    ' ', ' ', ' ', ' '});
+set(gca,'yticklabel', {'0.1',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ...
+    '1',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ...
+    '10',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '100', ...
+    ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '1000', ...
+    ' ', ' ', ' ', ' '});
+%title('Chinese restaurant process adaptor')
+ylabel('Mean number of lexical entries')
+xlabel('Word frequency (n_w)')
+legend('Expectation','Antoniak approx.','Empirical','Location','NorthWest')
+box on
+
+
+subplot(1,2,2);
+hold on
+
+for i =3:6
+
+    b = 10^(i-1)
+
+    disp(['Loading results for b = ' num2str(b) ]);
+%%% uncomment these lines if .mat file is not yet generated. %%%
+    %typecountrecord= load([ 'outputs/typecountrecordwsjpeak0.0.' num2str(b) '.0.dat']);
+    %typecountrecordmean = mean(typecountrecord(:,:));
+    %save([ 'outputs/typecountrecordmeanwsjpeak0.0.' num2str(b) '.0.mat'],'typecountrecordmean');
+    load([ 'outputs/typecountrecordmeanwsjpeak0.0.' num2str(b) '.0.mat']);
+
+    % plot lines for CRP exact prediction using summation
+    [logbins predicted dummy] = logbinmean(counts, crppred(counts,b),20,20);
+    ph = plot(log10(logbins),log10(predicted),'r');
+    set(ph,'color',[0.7 0.7 0.7],'linewidth',1.5)
+
+    %plot lines for incorrect CRP Antoniak prediction (ACL07)
+    [logbins predicted dummy] = logbinmean(counts, noP0pred(counts,b),20,20);
+    ph = plot(log10(logbins),log10(predicted),'r');
+    set(ph,'color',[0.7 0.7 0.7],'linewidth',1.5,'linestyle','-.')
+
+    % plot empirical counts with error bars (lower length first, then upper)
+    [logbins meanval seval] = logbinmean(counts,typecountrecordmean,20,20);
+    errorbar(log10(logbins),log10(meanval),log10(meanval)-log10(meanval-seval),log10(meanval+seval)-log10(meanval),'k.');
+end
+
+set(gca,'xtick',log10([1:10 20:10:100 200:100:1000 2000:1000:5000]))
+set(gca,'ytick',log10([1:10 20:10:100 200:100:1000 2000:1000:5000]))
+set(gca,'xlim',[-0.1 3.5])
+set(gca,'ylim',[-.1 2.5])
+set(gca,'FontSize',14)
+set(gca,'xticklabel', {'1',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ...
+    '10',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '100', ...
+    ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '1000', ...
+    ' ', ' ', ' ', ' '});
+set(gca,'yticklabel', {...%'0.1',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ...
+    '1',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ...
+    '10',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '100', ...
+    ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '1000', ...
+    ' ', ' ', ' ', ' '});
+%title('Chinese restaurant process adaptor')
+ylabel('Mean number of lexical entries')
+xlabel('Word frequency (n_w)')
+legend('Expectation','GGJ07 approx.','Empirical','Location','NorthWest')
+box on
+%axis square \ No newline at end of file
diff --git a/report/pyp_clustering/acl09-short/code/wsjplots_acl_talk0.m b/report/pyp_clustering/acl09-short/code/wsjplots_acl_talk0.m
new file mode 100644
index 00000000..dc54dea4
--- /dev/null
+++ b/report/pyp_clustering/acl09-short/code/wsjplots_acl_talk0.m
@@ -0,0 +1,54 @@
+% wsjplots_acl_talk0: one log-log figure showing only the sampled mean number
+% of tables against word frequency, one coloured marker series per b, drawn
+% from b = 10^5 down to b = 10^2 so the series match the legend order.
+% `counts` is not defined in this script; it must come from the loaded .mat
+% files.  logbinmean is defined elsewhere.
+%wsj_lengths = load([ 'wsj_lengths.dat']);
+%save([ 'wsj_lengths.mat'],'wsj_lengths');
+load wsj
+load wsj_lengths
+
+figure(1)
+clf
+
+hold on
+
+%colors = [0 0 0; 0 0 1; 1 0 0; 0 1 0]; %pure black, red, blue, green
+colors = [0 0 0; 1 .4 .2; .4 .4 1; 0 .7 .5]; %same but less garish
+%colors = [0 0 0; .6 .4 .4; .9 .6 .6; 1 .8 .8]; %shades of pink
+%colors = [0 0 0; .3 .3 1; .4 .8 1; .5 1 .8]; %blue/green
+
+% 9-[3:6] is [6 5 4 3]: largest b first.
+for i = 9-[3:6]
+
+    b = 10^(i-1)
+
+    disp(['Loading results for b = ' num2str(b) ]);
+    %%% uncomment these lines if .mat file is not yet generated. %%%
+    %typecountrecord= load([ 'outputs/typecountrecordwsjflat0.0.' num2str(b) '.0.dat']);
+    %typecountrecordmean = mean(typecountrecord(:,:));
+    %save([ 'outputs/typecountrecordmeanwsjflat0.0.' num2str(b) '.0.mat'],'typecountrecordmean');
+    load([ 'outputs/typecountrecordmeanwsjflat0.0.' num2str(b) '.0.mat']);
+
+    % plot empirical means as circle markers, one colour per b
+    [logbins meanval seval] = logbinmean(counts,typecountrecordmean,20,20);
+    ph = plot(log10(logbins),log10(meanval));
+    % 'o' is a Marker value, not a LineStyle value: the original passed it as
+    % 'linestyle'.  Turn the connecting line off and set the marker explicitly.
+    set(ph,'color',colors(i-2,:),'linestyle','none','marker','o','linewidth',2,'markersize',10);
+    %errorbar(log10(logbins),log10(meanval),log10(meanval+seval)-log10(meanval),log10(meanval-seval)-log10(meanval),'k.');
+
+end
+
+% Log-spaced ticks; only the powers of ten are labelled.
+set(gca,'xtick',log10([1:10 20:10:100 200:100:1000 2000:1000:5000]))
+set(gca,'ytick',log10([.1:.1:1 2:10 20:10:100 200:100:1000 2000:1000:5000]))
+set(gca,'xlim',[-0.1 3.5])
+set(gca,'ylim',[-.1 2])
+set(gca,'FontSize',16)
+set(gca,'xticklabel', {'1',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ...
+    '10',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '100', ...
+    ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '1000', ...
+    ' ', ' ', ' ', ' '});
+set(gca,'yticklabel', {'0.1',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ...
+    '1',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ...
+    '10',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '100', ...
+    ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '1000', ...
+    ' ', ' ', ' ', ' '});
+%title('Chinese restaurant process adaptor')
+ylabel('Mean number of lexical entries (tables)')
+xlabel('Word frequency (n_w)')
+labs = {'\alpha = 100000','\alpha = 10000','\alpha = 1000','\alpha = 100'};
+legend(labs,'Location','NorthWest')
+box on
diff --git a/report/pyp_clustering/acl09-short/code/wsjplots_acl_talk1.m b/report/pyp_clustering/acl09-short/code/wsjplots_acl_talk1.m
new file mode 100644
index 00000000..dd3615ac
--- /dev/null
+++ b/report/pyp_clustering/acl09-short/code/wsjplots_acl_talk1.m
@@ -0,0 +1,59 @@
+% Log-log plot of the empirical mean number of tables per word-frequency bin
+% (markers) against the crppred expectation (lines), for b = 100 ... 100000.
+% Needs logbinmean.m and crppred.m; 'counts' presumably comes from wsj.mat (confirm).
+%wsj_lengths = load([ 'wsj_lengths.dat']);
+%save([ 'wsj_lengths.mat'],'wsj_lengths');
+load wsj
+load wsj_lengths
+
+figure(1)
+clf
+hold on
+
+%colors = [0 0 0; 0 0 1; 1 0 0; 0 1 0]; %pure black, blue, red, green
+colors = [0 0 0; 1 .4 .2; .4 .4 1; 0 .7 .5]; %black, red, blue, green, less garish
+%colors = [0 0 0; .6 .4 .4; .9 .6 .6; 1 .8 .8]; %shades of pink
+%colors = [0 0 0; .3 .3 1; .4 .8 1; .5 1 .8]; %blue/green
+
+for i = 3:6
+    b = 10^(i-1); % echo suppressed; the disp below reports b
+    disp(['Loading results for b = ' num2str(b) ]);
+    %%% uncomment these lines if .mat file is not yet generated. %%%
+    %typecountrecord= load([ 'outputs/typecountrecordwsjflat0.0.' num2str(b) '.0.dat']);
+    %typecountrecordmean = mean(typecountrecord(:,:));
+    %save([ 'outputs/typecountrecordmeanwsjflat0.0.' num2str(b) '.0.mat'],'typecountrecordmean');
+    load([ 'outputs/typecountrecordmeanwsjflat0.0.' num2str(b) '.0.mat']);
+
+    % plot empirical counts as markers only (error bars are disabled below);
+    % 'o' is a Marker value, not a LineStyle, so it is set through 'marker'
+    [logbins meanval seval] = logbinmean(counts,typecountrecordmean,20,20);
+    ph = plot(log10(logbins),log10(meanval));
+    set(ph,'color',colors(i-2,:),'linestyle','none','marker','o','linewidth',2,'markersize',8);
+    %errorbar(log10(logbins),log10(meanval),log10(meanval+seval)-log10(meanval),log10(meanval-seval)-log10(meanval),'k.');
+
+    % plot lines for CRP exact prediction using summation
+    [logbins predicted dummy] = logbinmean(counts, crppred(counts,b),20,20);
+    ph = plot(log10(logbins),log10(predicted),'r');
+    set(ph,'color',colors(i-2,:),'linewidth',2);
+end
+
+set(gca,'xtick',log10([1:10 20:10:100 200:100:1000 2000:1000:5000]))
+set(gca,'ytick',log10([.1:.1:1 2:10 20:10:100 200:100:1000 2000:1000:5000]))
+set(gca,'xlim',[-0.1 3.5])
+set(gca,'ylim',[-.1 2])
+set(gca,'FontSize',16)
+set(gca,'xticklabel', {'1',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ...
+    '10',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '100', ...
+    ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '1000', ...
+    ' ', ' ', ' ', ' '});
+set(gca,'yticklabel', {'0.1',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ...
+    '1',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ...
+    '10',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '100', ...
+    ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '1000', ...
+    ' ', ' ', ' ', ' '});
+%title('Chinese restaurant process adaptor')
+ylabel('Mean number of lexical entries (tables)')
+xlabel('Word frequency (n_w)')
+labs = {'Empirical','Expectation'};
+legend(labs,'Location','NorthWest')
+box on
diff --git a/report/pyp_clustering/acl09-short/code/wsjplots_acl_talk2.m b/report/pyp_clustering/acl09-short/code/wsjplots_acl_talk2.m
new file mode 100644
index 00000000..dd039289
--- /dev/null
+++ b/report/pyp_clustering/acl09-short/code/wsjplots_acl_talk2.m
@@ -0,0 +1,58 @@
+% Log-log plot comparing two predictions of the mean number of tables per
+% word-frequency bin: crppred (solid) and antoniakpred (dashed), for
+% b = 100, 1000, 10000, 100000. Needs logbinmean.m, crppred.m, antoniakpred.m;
+% 'counts' presumably comes from wsj.mat (confirm).
+%wsj_lengths = load([ 'wsj_lengths.dat']);
+%save([ 'wsj_lengths.mat'],'wsj_lengths');
+load wsj
+load wsj_lengths
+
+figure(1)
+clf
+hold on
+
+%colors = [0 0 0; 0 0 1; 1 0 0; 0 1 0]; %pure black, blue, red, green
+colors = [0 0 0; 1 .4 .2; .4 .4 1; 0 .7 .5]; %black, red, blue, green, less garish
+%colors = [0 0 0; .6 .4 .4; .9 .6 .6; 1 .8 .8]; %shades of pink
+%colors = [0 0 0; .3 .3 1; .4 .8 1; .5 1 .8]; %blue/green
+
+for i = 3:6
+    b = 10^(i-1);
+    % No per-b outputs/typecountrecordmean*.mat file is loaded here: this
+    % figure plots no empirical data, so such a load would only make the
+    % script fail when the simulation output files are absent.
+    disp(['Plotting predictions for b = ' num2str(b) ]);
+
+    % plot lines for CRP exact prediction using summation
+    [logbins predicted dummy] = logbinmean(counts, crppred(counts,b),20,20);
+    ph = plot(log10(logbins),log10(predicted),'r');
+    set(ph,'color',colors(i-2,:),'linewidth',2);
+
+    % plot lines for CRP Antoniak prediction
+    [logbins predicted dummy] = logbinmean(counts, antoniakpred(counts,b),20,20);
+    ph = plot(log10(logbins),log10(predicted),'r');
+    set(ph,'color',colors(i-2,:),'linewidth',2,'linestyle','--')
+end
+
+% ticks at every 1..9 multiple within each decade; only powers of ten get a label
+set(gca,'xtick',log10([1:10 20:10:100 200:100:1000 2000:1000:5000]))
+set(gca,'ytick',log10([.1:.1:1 2:10 20:10:100 200:100:1000 2000:1000:5000]))
+set(gca,'xlim',[-0.1 3.5])
+set(gca,'ylim',[-1.1 2])
+set(gca,'FontSize',16)
+set(gca,'xticklabel', {'1',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ...
+    '10',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '100', ...
+    ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '1000', ...
+    ' ', ' ', ' ', ' '});
+set(gca,'yticklabel', {'0.1',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ...
+    '1',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ...
+    '10',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '100', ...
+    ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '1000', ...
+    ' ', ' ', ' ', ' '});
+%title('Chinese restaurant process adaptor')
+ylabel('Mean number of lexical entries (tables)')
+xlabel('Word frequency (n_w)')
+% the two labels attach to the first two lines drawn (solid/dashed pair for b = 100)
+labs = {'Expectation','Antoniak approximation'};
+legend(labs,'Location','NorthWest')
+box on
diff --git a/report/pyp_clustering/acl09-short/code/wsjplots_acl_talk3.m b/report/pyp_clustering/acl09-short/code/wsjplots_acl_talk3.m
new file mode 100644
index 00000000..8d570b7a
--- /dev/null
+++ b/report/pyp_clustering/acl09-short/code/wsjplots_acl_talk3.m
@@ -0,0 +1,74 @@
+% Log-log plot of the crppred expectation (lines) against empirical mean table
+% counts loaded from the wsjflat ('o') and wsjpeak ('^') result files, for
+% b = 100 ... 100000. Needs logbinmean.m, crppred.m; 'counts' presumably from wsj.mat.
+%wsj_lengths = load([ 'wsj_lengths.dat']);
+%save([ 'wsj_lengths.mat'],'wsj_lengths');
+load wsj
+load wsj_lengths
+
+figure(1)
+clf
+hold on
+
+%colors = [0 0 0; 0 0 1; 1 0 0; 0 1 0]; %pure black, blue, red, green
+colors = [0 0 0; 1 .4 .2; .4 .4 1; 0 .7 .5]; %black, red, blue, green, less garish
+%colors = [0 0 0; .6 .4 .4; .9 .6 .6; 1 .8 .8]; %shades of pink
+%colors = [0 0 0; .3 .3 1; .4 .8 1; .5 1 .8]; %blue/green
+
+for i = 3:6
+    col = colors(i-2,:);
+    b = 10^(i-1); % echo suppressed; the disp calls below report b
+    % plot lines for CRP exact prediction using summation
+    [logbins predicted dummy] = logbinmean(counts, crppred(counts,b),20,20);
+    ph = plot(log10(logbins),log10(predicted));
+    set(ph,'color',col,'linewidth',2);
+
+    disp(['Loading results for b = ' num2str(b) ]);
+    %%% uncomment these lines if .mat file is not yet generated. %%%
+    %typecountrecord= load([ 'outputs/typecountrecordwsjflat0.0.' num2str(b) '.0.dat']);
+    %typecountrecordmean = mean(typecountrecord(:,:));
+    %save([ 'outputs/typecountrecordmeanwsjflat0.0.' num2str(b) '.0.mat'],'typecountrecordmean');
+    load([ 'outputs/typecountrecordmeanwsjflat0.0.' num2str(b) '.0.mat']);
+
+    % wsjflat empirical counts, markers only ('o' is a Marker, not a LineStyle)
+    [logbins meanval seval] = logbinmean(counts,typecountrecordmean,20,20);
+    ph = plot(log10(logbins),log10(meanval));
+    %set(ph,'color',col,'linestyle','none','marker','o','markerfacecolor',col,'markersize',8);
+    set(ph,'color',col,'linestyle','none','marker','o','linewidth',2,'markersize',8);
+    %errorbar(log10(logbins),log10(meanval),log10(meanval+seval)-log10(meanval),log10(meanval-seval)-log10(meanval),'k.');
+
+    disp(['Loading results for b = ' num2str(b) ]);
+    %%% uncomment these lines if .mat file is not yet generated. %%%
+    %typecountrecord= load([ 'outputs/typecountrecordwsjpeak0.0.' num2str(b) '.0.dat']);
+    %typecountrecordmean = mean(typecountrecord(:,:));
+    %save([ 'outputs/typecountrecordmeanwsjpeak0.0.' num2str(b) '.0.mat'],'typecountrecordmean');
+    load([ 'outputs/typecountrecordmeanwsjpeak0.0.' num2str(b) '.0.mat']);
+
+    % wsjpeak empirical counts, markers only ('^' is a Marker, not a LineStyle)
+    [logbins meanval seval] = logbinmean(counts,typecountrecordmean,20,20);
+    ph = plot(log10(logbins),log10(meanval));
+    %set(ph,'color',col,'linestyle','none','marker','^','markerfacecolor',col,'markersize',8);
+    set(ph,'color',col,'linestyle','none','marker','^','linewidth',2,'markersize',8);
+    %errorbar(log10(logbins),log10(meanval),log10(meanval+seval)-log10(meanval),log10(meanval-seval)-log10(meanval),'ko');
+end
+
+set(gca,'xtick',log10([1:10 20:10:100 200:100:1000 2000:1000:5000]))
+set(gca,'ytick',log10([.1:.1:1 2:10 20:10:100 200:100:1000 2000:1000:5000]))
+set(gca,'xlim',[-0.1 3.5])
+set(gca,'ylim',[-.1 2.5])
+set(gca,'FontSize',16)
+set(gca,'xticklabel', {'1',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ...
+    '10',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '100', ...
+    ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '1000', ...
+    ' ', ' ', ' ', ' '});
+set(gca,'yticklabel', {'0.1',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ...
+    '1',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ...
+    '10',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '100', ...
+    ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '1000', ...
+    ' ', ' ', ' ', ' '});
+%title('Chinese restaurant process adaptor')
+ylabel('Mean number of lexical entries (tables)')
+xlabel('Word frequency (n_w)')
+labs = {'Expectation','Empirical, fixed base','Empirical, inferred base'};
+legend(labs,'Location','NorthWest')
+box on
diff --git a/report/pyp_clustering/acl09-short/code/wsjplots_cl.m b/report/pyp_clustering/acl09-short/code/wsjplots_cl.m
new file mode 100644
index 00000000..eed41846
--- /dev/null
+++ b/report/pyp_clustering/acl09-short/code/wsjplots_cl.m
@@ -0,0 +1,99 @@
+
+% Two-panel log-log figure of the mean number of lexical entries per
+% word-frequency bin. Right panel: counts.^a curves with empirical means for
+% a = 0.1 ... 0.9, plus the a = 1 curve. Left panel: crppred curves with
+% empirical means for b = 1 ... 10000. Reads typecountrecordwsjflat*.dat and
+% saves the row-averaged typecountrecordmeanwsjflat*.mat; 'counts' presumably from wsj.mat.
+load wsj
+
+figure(1)
+clf
+subplot(1,2,2)
+hold on
+
+for i = 1:9
+    a = i/10;
+    [logbins predicted dummy] = logbinmean(counts,counts.^a,20,20);
+    ph = plot(log10(logbins),log10(predicted),'k');
+    set(ph,'color',[0.7 0.7 0.7],'linewidth',1.5)
+end
+
+for i = 1:9
+    a = i/10;
+    disp(['Loading results for a = ' num2str(a) ]);
+    typecountrecord= load([ 'typecountrecordwsjflat' num2str(a) '.1.0.dat']);
+    typecountrecordmean = mean(typecountrecord(500:1000,:));
+    save([ 'typecountrecordmeanwsjflat' num2str(a) '.1.0.mat'],'typecountrecordmean');
+
+    [logbins meanval seval] = logbinmean(counts,typecountrecordmean,20,20);
+    % errorbar(x,y,neg,pos): neg/pos are the bar lengths below/above the point
+    errorbar(log10(logbins),log10(meanval), ...
+        log10(meanval)-log10(meanval-seval),log10(meanval+seval)-log10(meanval),'k.');
+    drawnow
+end
+
+% a = 1 curve: counts binned against themselves
+[logbins meanval seval] = logbinmean(counts,counts,20,20);
+[logbins predicted dummy] = logbinmean(counts,counts,20,20);
+ph = plot(log10(logbins),log10(predicted),'r');
+hold on
+errorbar(log10(logbins),log10(meanval), ...
+    log10(meanval)-log10(meanval-seval),log10(meanval+seval)-log10(meanval),'k.');
+
+set(ph,'color',[0.7 0.7 0.7],'linewidth',1.5)
+
+set(gca,'xtick',log10([1:10 20:10:100 200:100:1000 2000:1000:5000]))
+set(gca,'ytick',log10([1:10 20:10:100 200:100:1000 2000:1000:5000]))
+set(gca,'xlim',[-0.1 3.5])
+set(gca,'ylim',[-0.1 3.5])
+set(gca,'xticklabel', {'1',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ...
+    '10',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '100', ...
+    ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '1000', ...
+    ' ', ' ', ' ', ' '});
+set(gca,'yticklabel', {'1',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ...
+    '10',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '100', ...
+    ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '1000', ...
+    ' ', ' ', ' ', ' '});
+
+title('Pitman-Yor process adaptor')
+ylabel('Mean number of lexical entries')
+xlabel('Word frequency (n_w)')
+box on
+
+subplot(1,2,1)
+for i = 1:5
+    b = 10^(i-1); % echo suppressed; the disp below reports b
+    disp(['Loading results for b = ' num2str(b) ]);
+    typecountrecord= load([ 'typecountrecordwsjflat0.0.' num2str(b) '.0.dat']);
+    typecountrecordmean = mean(typecountrecord(500:1000,:));
+    save([ 'typecountrecordmeanwsjflat0.0.' num2str(b) '.0.mat'],'typecountrecordmean');
+
+    [logbins meanval seval] = logbinmean(counts,typecountrecordmean,20,20);
+    [logbins predicted dummy] = logbinmean(counts,crppred(counts,b),20,20);
+    % errorbar(log10(logbins),meanval,seval,'k.');
+    hold on
+    ph = plot(log10(logbins),log10(predicted),'r');
+    % ph = plot(log10(logbins),predicted,'r');
+    set(ph,'color',[0.7 0.7 0.7],'linewidth',1.5)
+    errorbar(log10(logbins),log10(meanval), ...
+        log10(meanval)-log10(meanval-seval),log10(meanval+seval)-log10(meanval),'k.');
+end
+
+set(gca,'xtick',log10([1:10 20:10:100 200:100:1000 2000:1000:5000]))
+set(gca,'ytick',log10([1:10 20:10:100 200:100:1000 2000:1000:5000]))
+set(gca,'xlim',[-0.1 3.5])
+set(gca,'ylim',[-0.1 1.5])
+set(gca,'xticklabel', {'1',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ...
+    '10',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '100', ...
+    ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '1000', ...
+    ' ', ' ', ' ', ' '});
+set(gca,'yticklabel', {'1',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ...
+    '10',' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '100', ...
+    ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '1000', ...
+    ' ', ' ', ' ', ' '});
+title('Chinese restaurant process adaptor')
+ylabel('Mean number of lexical entries')
+xlabel('Word frequency (n_w)')
+box on
+
+