library(DAAG)
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 3.3.2
data(frogs)
plot(northing ~ easting, data=frogs, pch=c(1,16)[frogs$pres.abs+1],
     xlab="Meters east of reference point", ylab="Meters north")

plot of chunk unnamed-chunk-1

library(DAAG)
data(spam7)
head(spam7)
##   crl.tot dollar  bang money n000 make yesno
## 1     278  0.000 0.778  0.00 0.00 0.00     y
## 2    1028  0.180 0.372  0.43 0.43 0.21     y
## 3    2259  0.184 0.276  0.06 1.16 0.06     y
## 4     191  0.000 0.137  0.00 0.00 0.00     y
## 5     191  0.000 0.135  0.00 0.00 0.00     y
## 6      54  0.000 0.000  0.00 0.00 0.00     y
spam = spam7
m1 = glm(pres.abs ~ altitude + distance + NoOfPools + NoOfSites + avrain + meanmin + meanmax, family = binomial, data = frogs)
summary(m1)
## 
## Call:
## glm(formula = pres.abs ~ altitude + distance + NoOfPools + NoOfSites + 
##     avrain + meanmin + meanmax, family = binomial, data = frogs)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7215  -0.7590  -0.2237   0.8320   2.6789  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  1.105e+02  1.388e+02   0.796  0.42587   
## altitude    -3.086e-02  4.076e-02  -0.757  0.44901   
## distance    -4.800e-04  2.055e-04  -2.336  0.01949 * 
## NoOfPools    2.986e-02  9.276e-03   3.219  0.00129 **
## NoOfSites    4.364e-02  1.061e-01   0.411  0.68077   
## avrain      -1.140e-02  5.995e-02  -0.190  0.84920   
## meanmin      4.899e+00  1.564e+00   3.133  0.00173 **
## meanmax     -5.660e+00  5.049e+00  -1.121  0.26224   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 279.99  on 211  degrees of freedom
## Residual deviance: 198.74  on 204  degrees of freedom
## AIC: 214.74
## 
## Number of Fisher Scoring iterations: 6
x0 = c(1, 1700, 400, 30, 8, 150, 4, 16)
sum(x0 * m1$coefficients)
## [1] -13.58643
exp(sum(x0 * m1$coefficients))
## [1] 1.257443e-06
exp(sum(x0 * m1$coefficients))/(1 + exp(sum(x0 * m1$coefficients)))
## [1] 1.257441e-06
m1$fitted.values
##            2            3            4            5            6 
## 9.994421e-01 9.391188e-01 8.683363e-01 7.443973e-01 9.427198e-01 
##            7            8            9           10           11 
## 7.107780e-01 7.940785e-01 7.134759e-01 6.791648e-01 7.487912e-01 
##           12           13           14           15           16 
## 7.031597e-01 7.081588e-01 7.051631e-01 7.895711e-01 7.724424e-01 
##           17           18           19           20           21 
## 5.410053e-01 4.002440e-01 5.520545e-01 6.308968e-01 6.015028e-01 
##           22           23           24           25           26 
## 5.282532e-01 4.096861e-01 5.839915e-01 7.693920e-01 5.815080e-01 
##           27           28           29           30           31 
## 4.784519e-01 6.059715e-01 7.600451e-01 7.538528e-01 7.706771e-01 
##           32           33           34           35           36 
## 6.434393e-01 6.079763e-01 3.708934e-01 6.463018e-01 5.586491e-01 
##           37           38           39           40           41 
## 5.183736e-01 8.117926e-01 6.602267e-01 5.538921e-01 5.806614e-01 
##           42           43           44           45           46 
## 7.341727e-01 6.316477e-01 9.355365e-01 7.829397e-01 5.807378e-01 
##           47           48           49           50           51 
## 6.410415e-01 7.011278e-01 6.990184e-01 7.945711e-01 5.983744e-01 
##           52           53           54           55           56 
## 5.464413e-01 6.694340e-01 9.160177e-01 6.497045e-01 7.793881e-01 
##           57           58           59           60           61 
## 5.989858e-01 8.742394e-01 4.269678e-01 5.230343e-01 4.696082e-01 
##           62           63           64           65           66 
## 9.058378e-01 8.310582e-01 2.065644e-01 6.630455e-01 2.304770e-01 
##           67           68           69           70           71 
## 2.055509e-01 4.103215e-01 3.107167e-01 2.173230e-01 6.849344e-02 
##           72           73           74           75           76 
## 4.689979e-01 1.964217e-01 3.039997e-01 3.140598e-02 2.960040e-02 
##           77           78           79           80           81 
## 2.764354e-02 9.572332e-02 2.708695e-02 3.894189e-01 4.384454e-01 
##           82           83           84           85           86 
## 1.801431e-02 2.958477e-02 2.245579e-02 1.716036e-02 3.869230e-01 
##           87           88           89           90           91 
## 1.422145e-01 8.723242e-02 1.258070e-01 2.402273e-01 4.425048e-02 
##           92           93           94           95           96 
## 1.245244e-01 6.044002e-02 8.657481e-02 3.093076e-01 7.608919e-01 
##           97           98           99          100          101 
## 7.581414e-02 2.519808e-01 2.299453e-01 1.333157e-01 3.224717e-01 
##          102          103          104          105          106 
## 1.382807e-01 6.569506e-01 2.173365e-02 1.894038e-02 1.502382e-02 
##          107          108          109          110          111 
## 1.980114e-02 1.768114e-02 1.294026e-02 1.940694e-01 9.026678e-02 
##          112          113          114          115          116 
## 2.401582e-01 1.801129e-01 1.950594e-01 1.012234e-01 1.100011e-01 
##          117          118          119          120          121 
## 7.922404e-02 4.383750e-01 1.381195e-01 1.202370e-03 7.190976e-03 
##          122          123          124          125          126 
## 5.285983e-03 1.501354e-02 6.798822e-03 5.557685e-01 3.414967e-01 
##          127          128          129          130          131 
## 3.538713e-01 1.215839e-02 3.587468e-03 4.820373e-01 6.801676e-01 
##          132          133          134          135          136 
## 4.905071e-01 5.450916e-01 6.139790e-01 5.265536e-01 5.113742e-01 
##          137          138          139          140          141 
## 1.487969e-03 1.537802e-03 9.498472e-02 1.584109e-01 5.019242e-01 
##          142          143          144          145          146 
## 2.217258e-01 5.499423e-01 8.072697e-02 2.867668e-01 5.892181e-01 
##          147          148          149          150          151 
## 4.793328e-01 3.172925e-01 2.456935e-01 3.648957e-01 4.426478e-01 
##          152          153          154          155          156 
## 3.659800e-01 2.737078e-02 1.261571e-03 8.908085e-04 8.917574e-04 
##          157          158          159          160          161 
## 1.564091e-03 4.022912e-01 3.554192e-01 3.771886e-01 6.108751e-01 
##          162          163          164          165          166 
## 1.135652e-01 8.305509e-02 3.248273e-01 2.027767e-01 1.320571e-01 
##          167          168          169          170          171 
## 2.998112e-01 4.705620e-01 4.664971e-01 1.893874e-01 3.202811e-01 
##          172          173          174          175          176 
## 2.350189e-01 2.264083e-01 1.720698e-01 3.624017e-01 2.497301e-01 
##          177          178          179          180          181 
## 7.719488e-01 7.008708e-01 7.727554e-01 1.249298e-01 7.968601e-04 
##          182          183          184          185          186 
## 9.646891e-05 6.891946e-01 3.317216e-01 6.494793e-01 6.489840e-01 
##          187          188          189          190          191 
## 6.574312e-01 5.476705e-01 4.789413e-02 7.693637e-02 6.853117e-02 
##          192          193          194          195          196 
## 1.346704e-01 4.074532e-01 3.981571e-02 7.631064e-02 1.221353e-01 
##          197          198          199          200          201 
## 6.273025e-01 5.747749e-01 4.664689e-01 2.265998e-01 1.467408e-01 
##          202          203          204          205          206 
## 1.589058e-01 4.219191e-01 3.164506e-01 2.998835e-01 1.540243e-01 
##          207          208          209          210          211 
## 6.556601e-02 9.745198e-02 2.134439e-01 8.078311e-04 9.752315e-04 
##          212          213 
## 2.536105e-03 3.889125e-01
i = 45
rrg = c(1, frogs$altitude[i], frogs$distance[i], frogs$NoOfPools[i], frogs$NoOfSites[i], frogs$avrain[i], frogs$meanmin[i], frogs$meanmax[i])
#or use model.matrix(m1)[i,]
eta = sum(rrg*m1$coefficients)
prr = exp(eta)/(1 + exp(eta))
#This agrees with m1$fitted.values[i]
c(prr, m1$fitted.values[i])
##                  46 
## 0.5807378 0.5807378
plot(frogs$pres.abs, m1$fitted.values)

plot of chunk unnamed-chunk-1

Checking the unusual points:

sel <- frogs$pres.abs==0 & m1$fitted >.7
plot(northing ~ easting, data=frogs, pch=c(1,16)[frogs$pres.abs+1], col=sel+1, xlab="Meters east of reference point", ylab="Meters north") 

plot of chunk unnamed-chunk-2

sel <- frogs$pres.abs==1 & m1$fitted <.2
plot(northing ~ easting, data=frogs, pch=c(1,16)[frogs$pres.abs+1], col=sel+1, xlab="Meters east of reference point", ylab="Meters north") 

plot of chunk unnamed-chunk-3

thr = 0.5
y = frogs$pres.abs
yhat = m1$fitted.values
a <- sum((!y) & (yhat<=thr))
b <- sum((!y) & (yhat>thr))
c <- sum((y) & (yhat<=thr)) 
d <- sum((y) & (yhat>thr))
c(a, b, c, d)
## [1] 112  21  21  58
thr = 0.3
y = frogs$pres.abs
yhat = m1$fitted.values
a <- sum((!y) & (yhat<=thr))
b <- sum((!y) & (yhat>thr))
c <- sum((y) & (yhat<=thr)) 
d <- sum((y) & (yhat>thr))
c(a, b, c, d)
## [1] 84 49 10 69
conf <- matrix(0,nrow=21,ncol=5)
colnames(conf) <- c("thr","a","b","c","d")
conf[,1] <- seq(0,1,by=.05)
y <- frogs$pres.abs
yhat <- m1$fitted.values
for ( i in 1:21)
  {
    a <- sum((!y) & (yhat<=conf[i,1]))
    b <- sum((!y) & (yhat>conf[i,1]))
    c <- sum((y) & (yhat<=conf[i,1])) 
    d <- sum((y) & (yhat>conf[i,1]))
    conf[i,2:5] <- c(a,b,c,d)
  }
conf
##        thr   a   b  c  d
##  [1,] 0.00   0 133  0 79
##  [2,] 0.05  33 100  3 76
##  [3,] 0.10  47  86  5 74
##  [4,] 0.15  61  72  5 74
##  [5,] 0.20  69  64  6 73
##  [6,] 0.25  80  53 10 69
##  [7,] 0.30  84  49 10 69
##  [8,] 0.35  91  42 13 66
##  [9,] 0.40 100  33 14 65
## [10,] 0.45 106  27 18 61
## [11,] 0.50 112  21 21 58
## [12,] 0.55 118  15 26 53
## [13,] 0.60 121  12 35 44
## [14,] 0.65 126   7 44 35
## [15,] 0.70 129   4 50 29
## [16,] 0.75 130   3 59 20
## [17,] 0.80 133   0 69 10
## [18,] 0.85 133   0 71  8
## [19,] 0.90 133   0 73  6
## [20,] 0.95 133   0 78  1
## [21,] 1.00 133   0 79  0
plot(conf[,1],conf[,3]+conf[,4], xlab="threshold", ylab="b+c")

plot of chunk unnamed-chunk-3

library(DAAG)
data(spam7)
head(spam7)
##   crl.tot dollar  bang money n000 make yesno
## 1     278  0.000 0.778  0.00 0.00 0.00     y
## 2    1028  0.180 0.372  0.43 0.43 0.21     y
## 3    2259  0.184 0.276  0.06 1.16 0.06     y
## 4     191  0.000 0.137  0.00 0.00 0.00     y
## 5     191  0.000 0.135  0.00 0.00 0.00     y
## 6      54  0.000 0.000  0.00 0.00 0.00     y
summary(spam)
##     crl.tot            dollar             bang             money         
##  Min.   :    1.0   Min.   :0.00000   Min.   : 0.0000   Min.   : 0.00000  
##  1st Qu.:   35.0   1st Qu.:0.00000   1st Qu.: 0.0000   1st Qu.: 0.00000  
##  Median :   95.0   Median :0.00000   Median : 0.0000   Median : 0.00000  
##  Mean   :  283.3   Mean   :0.07581   Mean   : 0.2691   Mean   : 0.09427  
##  3rd Qu.:  266.0   3rd Qu.:0.05200   3rd Qu.: 0.3150   3rd Qu.: 0.00000  
##  Max.   :15841.0   Max.   :6.00300   Max.   :32.4780   Max.   :12.50000  
##       n000             make        yesno   
##  Min.   :0.0000   Min.   :0.0000   n:2788  
##  1st Qu.:0.0000   1st Qu.:0.0000   y:1813  
##  Median :0.0000   Median :0.0000           
##  Mean   :0.1016   Mean   :0.1046           
##  3rd Qu.:0.0000   3rd Qu.:0.0000           
##  Max.   :5.4500   Max.   :4.5400
par(mfrow=c(3,2))
for (i in 1:5)
  hist(spam[,i],main="",xlab=names(spam)[i],breaks=10000)
par(mfrow=c(1,1))

plot of chunk unnamed-chunk-3

pairs(spam, cex=0.5)

plot of chunk unnamed-chunk-3

s = 0.001 #correction for zero as log(0) is infinity. 
pairs(~log(crl.tot)+log(dollar+s) +log(bang+s)+log(money+s)+log(n000+s)+log(make+s) + yesno,       data=spam, cex=.5)

plot of chunk unnamed-chunk-3

spam.glm <- glm(yesno~ log(crl.tot) + log(dollar+s) + log(bang+s)
                +log(money+s) + log(n000+s) + log(make+s),
                family=binomial, data=spam)
summary(spam.glm)
## 
## Call:
## glm(formula = yesno ~ log(crl.tot) + log(dollar + s) + log(bang + 
##     s) + log(money + s) + log(n000 + s) + log(make + s), family = binomial, 
##     data = spam)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.1657  -0.4367  -0.2863   0.3609   2.7152  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      4.11947    0.36342  11.335  < 2e-16 ***
## log(crl.tot)     0.30228    0.03693   8.185 2.71e-16 ***
## log(dollar + s)  0.32586    0.02365  13.777  < 2e-16 ***
## log(bang + s)    0.40984    0.01597  25.661  < 2e-16 ***
## log(money + s)   0.34563    0.02800  12.345  < 2e-16 ***
## log(n000 + s)    0.18947    0.02931   6.463 1.02e-10 ***
## log(make + s)   -0.11418    0.02206  -5.177 2.25e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6170.2  on 4600  degrees of freedom
## Residual deviance: 3245.1  on 4594  degrees of freedom
## AIC: 3259.1
## 
## Number of Fisher Scoring iterations: 6
spam.lm <-  lm(as.numeric(yesno=="y")~ log(crl.tot) + log(dollar+s)
               + log(bang+s) +log(money+s) + log(n000+s) + log(make+s)
               ,data=spam)
summary(spam.lm)
## 
## Call:
## lm(formula = as.numeric(yesno == "y") ~ log(crl.tot) + log(dollar + 
##     s) + log(bang + s) + log(money + s) + log(n000 + s) + log(make + 
##     s), data = spam)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.10937 -0.13830 -0.05674  0.15262  1.05619 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      1.078531   0.034188  31.547  < 2e-16 ***
## log(crl.tot)     0.028611   0.003978   7.193 7.38e-13 ***
## log(dollar + s)  0.054878   0.002934  18.703  < 2e-16 ***
## log(bang + s)    0.064522   0.001919  33.619  < 2e-16 ***
## log(money + s)   0.039776   0.002751  14.457  < 2e-16 ***
## log(n000 + s)    0.018530   0.002815   6.582 5.16e-11 ***
## log(make + s)   -0.017380   0.002370  -7.335 2.61e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3391 on 4594 degrees of freedom
## Multiple R-squared:  0.5193, Adjusted R-squared:  0.5186 
## F-statistic: 827.1 on 6 and 4594 DF,  p-value: < 2.2e-16
plot(spam.lm$fitted.values, spam.glm$fitted.values)
abline(c(0, 1), col = "red")

plot of chunk unnamed-chunk-3

confusion <- function (y, yhat, thres)
  {
    n <- length(thres)
    conf <- matrix(0,length(thres),ncol=4)
    colnames(conf) <- c("a","b","c","d")
    for ( i in 1:n)
      {
        a <- sum((!y) & (yhat<=thres[i]))
        b <- sum((!y) & (yhat>thres[i]))
        c <- sum((y) & (yhat<=thres[i])) 
        d <- sum((y) & (yhat>thres[i]))
        conf[i,] <- c(a,b,c,d)
      }
    return(conf)
  }
v <- seq(0.05,0.95,by=0.05)
y <- as.numeric(spam$yesno=="y")
glm.conf <- confusion(y,spam.glm$fitted,v)
glm.conf
##          a    b    c    d
##  [1,]  769 2019   32 1781
##  [2,] 1709 1079  151 1662
##  [3,] 1825  963  164 1649
##  [4,] 1896  892  174 1639
##  [5,] 1996  792  189 1624
##  [6,] 2079  709  210 1603
##  [7,] 2231  557  247 1566
##  [8,] 2382  406  295 1518
##  [9,] 2536  252  351 1462
## [10,] 2607  181  406 1407
## [11,] 2642  146  478 1335
## [12,] 2671  117  546 1267
## [13,] 2685  103  623 1190
## [14,] 2700   88  667 1146
## [15,] 2714   74  704 1109
## [16,] 2735   53  771 1042
## [17,] 2747   41  901  912
## [18,] 2767   21 1027  786
## [19,] 2775   13 1235  578
lm.conf <- confusion(y,spam.lm$fitted,v)
lm.conf
##          a    b    c    d
##  [1,]  683 2105   22 1791
##  [2,] 1400 1388   97 1716
##  [3,] 1798  990  163 1650
##  [4,] 1833  955  166 1647
##  [5,] 1876  912  171 1642
##  [6,] 1972  816  178 1635
##  [7,] 2058  730  195 1618
##  [8,] 2235  553  257 1556
##  [9,] 2450  338  330 1483
## [10,] 2633  155  441 1372
## [11,] 2676  112  565 1248
## [12,] 2697   91  673 1140
## [13,] 2722   66  752 1061
## [14,] 2738   50  844  969
## [15,] 2755   33  928  885
## [16,] 2767   21 1057  756
## [17,] 2774   14 1158  655
## [18,] 2780    8 1249  564
## [19,] 2782    6 1361  452
matplot(v,cbind((glm.conf[,"b"]+glm.conf[,"c"])/4601, (lm.conf[,"b"]+lm.conf[,"c"])/4601), xlab="threshold", ylab="b+c",type="l")
legend(.8, .4, lty=1:2, col=1:2,c("glm","lm"))

plot of chunk unnamed-chunk-3

spam.lm1 = lm(as.numeric(yesno=="y")~ crl.tot + dollar
               + bang + money + n000 + make, data=spam)
summary(spam.lm1)
## 
## Call:
## lm(formula = as.numeric(yesno == "y") ~ crl.tot + dollar + bang + 
##     money + n000 + make, data = spam)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.8650 -0.2758 -0.2519  0.4459  0.7499 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2.498e-01  7.488e-03  33.365   <2e-16 ***
## crl.tot     1.241e-04  1.058e-05  11.734   <2e-16 ***
## dollar      3.481e-01  2.733e-02  12.740   <2e-16 ***
## bang        1.113e-01  7.725e-03  14.407   <2e-16 ***
## money       1.765e-01  1.440e-02  12.262   <2e-16 ***
## n000        3.218e-01  1.891e-02  17.014   <2e-16 ***
## make        3.212e-02  2.101e-02   1.529    0.126    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4223 on 4594 degrees of freedom
## Multiple R-squared:  0.2543, Adjusted R-squared:  0.2533 
## F-statistic: 261.1 on 6 and 4594 DF,  p-value: < 2.2e-16
summary(spam.lm)
## 
## Call:
## lm(formula = as.numeric(yesno == "y") ~ log(crl.tot) + log(dollar + 
##     s) + log(bang + s) + log(money + s) + log(n000 + s) + log(make + 
##     s), data = spam)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.10937 -0.13830 -0.05674  0.15262  1.05619 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      1.078531   0.034188  31.547  < 2e-16 ***
## log(crl.tot)     0.028611   0.003978   7.193 7.38e-13 ***
## log(dollar + s)  0.054878   0.002934  18.703  < 2e-16 ***
## log(bang + s)    0.064522   0.001919  33.619  < 2e-16 ***
## log(money + s)   0.039776   0.002751  14.457  < 2e-16 ***
## log(n000 + s)    0.018530   0.002815   6.582 5.16e-11 ***
## log(make + s)   -0.017380   0.002370  -7.335 2.61e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3391 on 4594 degrees of freedom
## Multiple R-squared:  0.5193, Adjusted R-squared:  0.5186 
## F-statistic: 827.1 on 6 and 4594 DF,  p-value: < 2.2e-16

Precision and recall for the GLM:

cbind(glm.conf[,"d"]/(glm.conf[,"b"]+glm.conf[,"d"]),glm.conf[,"d"]/(glm.conf[,"c"]+glm.conf[,"d"]) )
##            [,1]      [,2]
##  [1,] 0.4686842 0.9823497
##  [2,] 0.6063480 0.9167126
##  [3,] 0.6313170 0.9095422
##  [4,] 0.6475701 0.9040265
##  [5,] 0.6721854 0.8957529
##  [6,] 0.6933391 0.8841699
##  [7,] 0.7376354 0.8637617
##  [8,] 0.7889813 0.8372863
##  [9,] 0.8529755 0.8063982
## [10,] 0.8860202 0.7760618
## [11,] 0.9014180 0.7363486
## [12,] 0.9154624 0.6988417
## [13,] 0.9203403 0.6563707
## [14,] 0.9286872 0.6321015
## [15,] 0.9374472 0.6116933
## [16,] 0.9515982 0.5747380
## [17,] 0.9569780 0.5030336
## [18,] 0.9739777 0.4335356
## [19,] 0.9780034 0.3188086
plot(glm.conf[,"d"]/(glm.conf[,"b"]+glm.conf[,"d"]),glm.conf[,"d"]/(glm.conf[,"c"]+glm.conf[,"d"]) )

plot of chunk unnamed-chunk-4

matplot(x = cbind(glm.conf[,"d"]/(glm.conf[,"b"]+glm.conf[,"d"]),
              lm.conf[,"d"]/(lm.conf[,"b"]+lm.conf[,"d"])),
        y = cbind(glm.conf[,"d"]/(glm.conf[,"c"]+glm.conf[,"d"]),
              lm.conf[,"d"]/(lm.conf[,"c"]+lm.conf[,"d"])),
        xlab="Precision", ylab="Recall",type="l")
legend(.8, .4, lty=1:2, col=1:2,c("glm","lm"))

plot of chunk unnamed-chunk-4

library(DAAG)
data(frogs)
m1 = glm(pres.abs ~ altitude + distance + NoOfPools + NoOfSites + avrain + meanmin + meanmax, family = binomial, data = frogs)
summary(m1)
## 
## Call:
## glm(formula = pres.abs ~ altitude + distance + NoOfPools + NoOfSites + 
##     avrain + meanmin + meanmax, family = binomial, data = frogs)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7215  -0.7590  -0.2237   0.8320   2.6789  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  1.105e+02  1.388e+02   0.796  0.42587   
## altitude    -3.086e-02  4.076e-02  -0.757  0.44901   
## distance    -4.800e-04  2.055e-04  -2.336  0.01949 * 
## NoOfPools    2.986e-02  9.276e-03   3.219  0.00129 **
## NoOfSites    4.364e-02  1.061e-01   0.411  0.68077   
## avrain      -1.140e-02  5.995e-02  -0.190  0.84920   
## meanmin      4.899e+00  1.564e+00   3.133  0.00173 **
## meanmax     -5.660e+00  5.049e+00  -1.121  0.26224   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 279.99  on 211  degrees of freedom
## Residual deviance: 198.74  on 204  degrees of freedom
## AIC: 214.74
## 
## Number of Fisher Scoring iterations: 6
deviance(m1)
## [1] 198.7384
rd = sum(frogs$pres.abs * (-2) * log(m1$fitted.values)) + sum((1 - frogs$pres.abs)*(-2) * log(1 - m1$fitted.values))  
rd
## [1] 198.7384
m2 = glm(pres.abs ~ altitude + distance + NoOfSites + avrain + meanmin + meanmax, family = binomial, data = frogs) 
summary(m2)
## 
## Call:
## glm(formula = pres.abs ~ altitude + distance + NoOfSites + avrain + 
##     meanmin + meanmax, family = binomial, data = frogs)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7814  -0.7902  -0.2938   0.9300   3.3822  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  3.496e+01  1.376e+02   0.254  0.79938   
## altitude    -8.375e-03  4.034e-02  -0.208  0.83552   
## distance    -5.350e-04  2.066e-04  -2.589  0.00962 **
## NoOfSites    5.748e-02  1.025e-01   0.561  0.57481   
## avrain       1.506e-02  5.993e-02   0.251  0.80154   
## meanmin      4.358e+00  1.491e+00   2.922  0.00347 **
## meanmax     -2.782e+00  4.986e+00  -0.558  0.57680   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 279.99  on 211  degrees of freedom
## Residual deviance: 210.84  on 205  degrees of freedom
## AIC: 224.84
## 
## Number of Fisher Scoring iterations: 6

AIC

m1$aic
## [1] 214.7384
m1$deviance + 2*(7+1)
## [1] 214.7384
step(m1)
## Start:  AIC=214.74
## pres.abs ~ altitude + distance + NoOfPools + NoOfSites + avrain + 
##     meanmin + meanmax
## 
##             Df Deviance    AIC
## - avrain     1   198.78 212.78
## - NoOfSites  1   198.91 212.91
## - altitude   1   199.30 213.30
## - meanmax    1   199.97 213.97
## <none>           198.74 214.74
## - distance   1   206.39 220.39
## - meanmin    1   209.60 223.60
## - NoOfPools  1   210.84 224.84
## 
## Step:  AIC=212.77
## pres.abs ~ altitude + distance + NoOfPools + NoOfSites + meanmin + 
##     meanmax
## 
##             Df Deviance    AIC
## - NoOfSites  1   198.94 210.94
## - altitude   1   199.58 211.58
## <none>           198.78 212.78
## - meanmax    1   201.76 213.76
## - distance   1   206.39 218.39
## - meanmin    1   210.75 222.75
## - NoOfPools  1   210.90 222.90
## 
## Step:  AIC=210.94
## pres.abs ~ altitude + distance + NoOfPools + meanmin + meanmax
## 
##             Df Deviance    AIC
## - altitude   1   199.63 209.63
## <none>           198.94 210.94
## - meanmax    1   201.76 211.76
## - distance   1   209.68 219.68
## - meanmin    1   210.83 220.83
## - NoOfPools  1   211.22 221.22
## 
## Step:  AIC=209.63
## pres.abs ~ distance + NoOfPools + meanmin + meanmax
## 
##             Df Deviance    AIC
## <none>           199.63 209.63
## - distance   1   209.73 217.73
## - NoOfPools  1   211.43 219.43
## - meanmax    1   216.10 224.10
## - meanmin    1   226.94 234.94
## 
## Call:  glm(formula = pres.abs ~ distance + NoOfPools + meanmin + meanmax, 
##     family = binomial, data = frogs)
## 
## Coefficients:
## (Intercept)     distance    NoOfPools      meanmin      meanmax  
##  14.0074032   -0.0005138    0.0285643    5.6230647   -2.3717579  
## 
## Degrees of Freedom: 211 Total (i.e. Null);  207 Residual
## Null Deviance:       280 
## Residual Deviance: 199.6     AIC: 209.6
step(m1, direction = "both")
## Start:  AIC=214.74
## pres.abs ~ altitude + distance + NoOfPools + NoOfSites + avrain + 
##     meanmin + meanmax
## 
##             Df Deviance    AIC
## - avrain     1   198.78 212.78
## - NoOfSites  1   198.91 212.91
## - altitude   1   199.30 213.30
## - meanmax    1   199.97 213.97
## <none>           198.74 214.74
## - distance   1   206.39 220.39
## - meanmin    1   209.60 223.60
## - NoOfPools  1   210.84 224.84
## 
## Step:  AIC=212.77
## pres.abs ~ altitude + distance + NoOfPools + NoOfSites + meanmin + 
##     meanmax
## 
##             Df Deviance    AIC
## - NoOfSites  1   198.94 210.94
## - altitude   1   199.58 211.58
## <none>           198.78 212.78
## - meanmax    1   201.76 213.76
## + avrain     1   198.74 214.74
## - distance   1   206.39 218.39
## - meanmin    1   210.75 222.75
## - NoOfPools  1   210.90 222.90
## 
## Step:  AIC=210.94
## pres.abs ~ altitude + distance + NoOfPools + meanmin + meanmax
## 
##             Df Deviance    AIC
## - altitude   1   199.63 209.63
## <none>           198.94 210.94
## - meanmax    1   201.76 211.76
## + NoOfSites  1   198.78 212.78
## + avrain     1   198.91 212.91
## - distance   1   209.68 219.68
## - meanmin    1   210.83 220.83
## - NoOfPools  1   211.22 221.22
## 
## Step:  AIC=209.63
## pres.abs ~ distance + NoOfPools + meanmin + meanmax
## 
##             Df Deviance    AIC
## <none>           199.63 209.63
## + altitude   1   198.94 210.94
## + avrain     1   199.39 211.39
## + NoOfSites  1   199.58 211.58
## - distance   1   209.73 217.73
## - NoOfPools  1   211.43 219.43
## - meanmax    1   216.10 224.10
## - meanmin    1   226.94 234.94
## 
## Call:  glm(formula = pres.abs ~ distance + NoOfPools + meanmin + meanmax, 
##     family = binomial, data = frogs)
## 
## Coefficients:
## (Intercept)     distance    NoOfPools      meanmin      meanmax  
##  14.0074032   -0.0005138    0.0285643    5.6230647   -2.3717579  
## 
## Degrees of Freedom: 211 Total (i.e. Null);  207 Residual
## Null Deviance:       280 
## Residual Deviance: 199.6     AIC: 209.6