@@ -35,6 +35,21 @@ if (!is.null(plot.filename)) {
 ggsave(plot.filename, p);
 }

+# Computes the shared standard error, as used in the Welch t-test
+welch.sd = function(old.rate, new.rate) {
+  old.se.squared = var(old.rate) / length(old.rate)
+  new.se.squared = var(new.rate) / length(new.rate)
+  return(sqrt(old.se.squared + new.se.squared))
+}
+
+# Calculate the improvement confidence interval. The improvement is calculated
+# by dividing by old.mu and not new.mu, because old.mu is what the mean
+# improvement is calculated relative to.
+confidence.interval = function(shared.se, old.mu, w, risk) {
+  interval = qt(1 - (risk / 2), w$parameter) * shared.se;
+  return(sprintf("±%.2f%%", (interval / old.mu) * 100))
+}
+
 # Print a table with results
 statistics = ddply(dat, "name", function(subdat) {
   old.rate = subset(subdat, binary == "old")$rate;
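As a sanity check on the new helper, here is a small standalone sketch (made-up rates, not output from the benchmark suite): `welch.sd` computes exactly the standard error that appears in the denominator of the Welch t statistic reported by `t.test`.

```r
# Standalone sketch with made-up rates (not real benchmark data).
old.rate <- c(100.2, 101.5, 99.8, 100.9, 100.4)
new.rate <- c(103.1, 104.0, 102.7, 103.5, 103.2)

# Same computation as the welch.sd helper above.
shared.se <- sqrt(var(old.rate) / length(old.rate) +
                  var(new.rate) / length(new.rate))

# t.test defaults to the Welch (unequal-variance) test; its t statistic
# is the mean difference divided by exactly this standard error.
w <- t.test(old.rate, new.rate)
all.equal(unname(abs(w$statistic)),
          abs(mean(old.rate) - mean(new.rate)) / shared.se)  # TRUE
```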
@@ -45,33 +60,42 @@ statistics = ddply(dat, "name", function(subdat) {
   new.mu = mean(new.rate);
   improvement = sprintf("%.2f %%", ((new.mu - old.mu) / old.mu) * 100);

-  p.value = NA;
-  confidence = 'NA';
+  r = list(
+    confidence = "NA",
+    improvement = improvement,
+    "accuracy (*)" = "NA",
+    "(**)" = "NA",
+    "(***)" = "NA"
+  );
+
   # Check if there is enough data to calculate the p-value
   if (length(old.rate) > 1 && length(new.rate) > 1) {
     # Perform a statistical test to see if there actually is a difference in
     # performance.
     w = t.test(rate ~ binary, data = subdat);
-    p.value = w$p.value;
+    shared.se = welch.sd(old.rate, new.rate)

     # Add user-friendly stars to the table. There should be at least one star
     # before you can say that there is an improvement.
     confidence = ''
-    if (p.value < 0.001) {
+    if (w$p.value < 0.001) {
       confidence = '***'
-    } else if (p.value < 0.01) {
+    } else if (w$p.value < 0.01) {
       confidence = '**'
-    } else if (p.value < 0.05) {
+    } else if (w$p.value < 0.05) {
       confidence = '*'
     }
+
+    r = list(
+      confidence = confidence,
+      improvement = improvement,
+      "accuracy (*)" = confidence.interval(shared.se, old.mu, w, 0.05),
+      "(**)" = confidence.interval(shared.se, old.mu, w, 0.01),
+      "(***)" = confidence.interval(shared.se, old.mu, w, 0.001)
+    );
   }

-  r = list(
-    improvement = improvement,
-    confidence = confidence,
-    p.value = p.value
-  );
-  return(data.frame(r));
+  return(data.frame(r, check.names = FALSE));
 });


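A note on reading the new accuracy columns: because the test and the interval share the same standard error and Welch degrees of freedom, a star appears at a given level exactly when the observed improvement exceeds the corresponding ± margin. A sketch on the same made-up data as above (the `margin` helper name is invented for illustration):

```r
# Sketch (reusing old.rate, new.rate, shared.se, w from above): the stars
# and the accuracy columns agree by construction. The Welch test rejects at
# risk level `risk` exactly when the mean difference exceeds the half-width.
margin <- function(shared.se, w, risk) {
  qt(1 - (risk / 2), w$parameter) * shared.se
}

diff.mu <- abs(mean(new.rate) - mean(old.rate))
(w$p.value < 0.05) == (diff.mu > margin(shared.se, w, 0.05))
# [1] TRUE -- and likewise at the 0.01 and 0.001 levels
```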
@@ -81,3 +105,16 @@ statistics$name = NULL;

 options(width = 200);
 print(statistics);
+cat("\n")
+cat(sprintf(
+"Be aware that when doing many comparisons the risk of a false-positive
+result increases. In this case there are %d comparisons; you can thus
+expect the following number of false-positive results:
+  %.2f false positives, when considering a   5%% risk acceptance (*, **, ***),
+  %.2f false positives, when considering a   1%% risk acceptance (**, ***),
+  %.2f false positives, when considering a 0.1%% risk acceptance (***)
+",
+nrow(statistics),
+nrow(statistics) * 0.05,
+nrow(statistics) * 0.01,
+nrow(statistics) * 0.001))
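The expected false-positive count in that message is just the number of comparisons multiplied by the accepted risk; for instance, with a hypothetical 40 comparisons:

```r
# Expected false positives under the null = tests * risk; for a
# hypothetical 40 benchmark comparisons:
n.tests <- 40
n.tests * c(0.05, 0.01, 0.001)
# [1] 2.00 0.40 0.04
```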