Psst.. new poll here.
Psst.. new forums here.
Microsoft is blocking us again (TY IP Reputation!) so just use oauth login instead. :)
Paste
Pasted as C by Tim Menzies ( 16 years ago )
#naive bayes classifier in gawk
#usage: gawk -F, -f nbc.awk Pass=1 train.csv Pass=2 test.csv
# Dispatch every input record on the Pass variable, which is assigned on the
# command line between the file names.
Pass == 1 {
    # Pass 1: fold this labelled row into the count tables.
    train()
}
Pass == 2 {
    # Pass 2: print "<last field>|<predicted class>" for this row.
    print $NF "|" classify()
}
# train: add the current record to the global frequency tables.
#
# Globals updated:
#   Total          - number of records trained on
#   H[c]           - number of records whose last field (the class) is c
#   Freq[c,k,v]    - number of class-c records with value v in column k
#   Seen[k,v]      - number of records with value v in column k
#   Attr[k]        - number of distinct values observed in column k
# Fields equal to "?" are treated as unknown and are not counted.
# The extra names in the parameter list are awk-style local variables.
function train(    col, label) {
    label = $NF          # the hypothesis is in the last column
    Total += 1
    H[label] += 1        # remember how often this class has been seen
    col = 0
    while (++col <= NF) {
        if ($col != "?") {
            Freq[label, col, $col] += 1
            Seen[col, $col] += 1
            if (Seen[col, $col] == 1)
                Attr[col] += 1   # first sighting: one more unique value
        }
    }
}
# classify: return the most likely class label for the current record.
#
# Reads the global tables Total, H, Freq and Attr:
#   Total        - number of training records
#   H[h]         - number of training records of class h
#   Freq[h,i,v]  - number of class-h records with value v in column i
#   Attr[i]      - number of distinct values seen in column i
#
# For each class h the score is
#   log(H[h]/Total) + sum over columns i of log((Freq[h,i,$i]+1)/(H[h]+Attr[i]))
# i.e. the prior plus Laplace-smoothed likelihoods, where the smoothing term
# is the number of distinct values of the attribute being scored.
# The last column (the class) and fields equal to "?" are not scored.
#
# Returns the label with the highest score; on ties the label visited later
# in the `for (h in H)` traversal wins. Returns the uninitialised value ("")
# when H is empty.
# The extra names in the parameter list are awk-style local variables.
function classify(    i, temp, what, like, h, found) {
    found = 0                        # no candidate yet; avoids a magic "minus infinity"
    for (h in H) {                   # for every hypothesis, do...
        temp = log(H[h] / Total)     # logs stop numeric underflow
        for (i = 1; i < NF; i++) {
            if ($i == "?")
                continue             # skip unknown values
            temp += log((Freq[h, i, $i] + 1) / (H[h] + Attr[i]))
        }
        if (!found || temp >= like) {    # first or better hypothesis
            like = temp
            what = h
            found = 1
        }
    }
    return what
}
Revise this Paste