# Relative path of the directory that holds the data sets.
dataDir <- "../../finalDataSets"
# Body-fat data; the first row of the CSV supplies the column names.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
body <- read.csv(file.path(dataDir, "bodyfat_short.csv"), header = TRUE)

Regression trees. The function we will use is called rpart, which stands for recursive partitioning. It is available in the R package rpart.

library(rpart)
## Warning: package 'rpart' was built under R version 3.3.2
rt = rpart(BODYFAT ~ AGE + WEIGHT + HEIGHT + CHEST + ABDOMEN + HIP + THIGH, data = body)
rt
## n= 252 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 252 17578.99000 19.150790  
##    2) ABDOMEN< 91.9 132  4698.25500 13.606060  
##      4) ABDOMEN< 85.45 66  1303.62400 10.054550  
##        8) ABDOMEN< 75.5 7   113.54860  5.314286 *
##        9) ABDOMEN>=75.5 59  1014.12300 10.616950 *
##      5) ABDOMEN>=85.45 66  1729.68100 17.157580  
##       10) HEIGHT>=71.875 19   407.33790 13.189470 *
##       11) HEIGHT< 71.875 47   902.23110 18.761700 *
##    3) ABDOMEN>=91.9 120  4358.48000 25.250000  
##      6) ABDOMEN< 103 81  1752.42000 22.788890 *
##      7) ABDOMEN>=103 39  1096.45200 30.361540  
##       14) ABDOMEN< 112.3 28   413.60000 28.300000  
##         28) HEIGHT>=72.125 8    89.39875 23.937500 *
##         29) HEIGHT< 72.125 20   111.04950 30.045000 *
##       15) ABDOMEN>=112.3 11   260.94910 35.609090 *
plot(rt)
text(rt)

plot of chunk unnamed-chunk-2

# Relative error after the first split: the deviances of the two child nodes
# (ABDOMEN < 91.9 and ABDOMEN >= 91.9, copied from the printed tree) summed and
# divided by the deviance of the root node.
(4698.255 + 4358.48)/17578.99
## [1] 0.5152022
scorecard = read.csv(file.path(dataDir, "college_short.csv"))

Variable Names

names(scorecard)
## [1] "SAT_AVG_ALL"    "AVGFACSAL"      "TUITIONFEE_IN"  "TUITIONFEE_OUT"
## [5] "UGDS"           "RET_FT4"        "PCTFLOAN"       "PFTFAC"        
## [9] "TYPE"
req = rpart(RET_FT4 ~ TUITIONFEE_OUT + as.factor(TYPE), data = scorecard)
plot(req)
text(req)

plot of chunk unnamed-chunk-3

req
## n= 1241 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 1241 16.8642000 0.7609155  
##    2) TUITIONFEE_OUT< 34290 1005 11.1054700 0.7312472  
##      4) TUITIONFEE_OUT< 26855 731  8.1311670 0.7128963  
##        8) as.factor(TYPE)=2,3 345  4.0915040 0.6787971  
##         16) TUITIONFEE_OUT< 18785 100  1.3470310 0.6421740 *
##         17) TUITIONFEE_OUT>=18785 245  2.5556030 0.6937453 *
##        9) as.factor(TYPE)=1 386  3.2799700 0.7433736  
##         18) TUITIONFEE_OUT< 16097.5 118  1.0737510 0.7004661 *
##         19) TUITIONFEE_OUT>=16097.5 268  1.8933230 0.7622657 *
##      5) TUITIONFEE_OUT>=26855 274  2.0713890 0.7802051  
##       10) as.factor(TYPE)=2 225  1.4400930 0.7613084 *
##       11) as.factor(TYPE)=1,3 49  0.1820255 0.8669755 *
##    3) TUITIONFEE_OUT>=34290 236  1.1070330 0.8872572  
##      6) TUITIONFEE_OUT< 41516 142  0.5717404 0.8537951 *
##      7) TUITIONFEE_OUT>=41516 94  0.1361027 0.9378064 *
# Fit the regression tree for BODYFAT on the seven body measurements.
rt <- rpart(
  BODYFAT ~ AGE + WEIGHT + HEIGHT + CHEST + ABDOMEN + HIP + THIGH,
  data = body
)
# "R^2" for the regression tree: one minus the residual sum of squares divided
# by the total sum of squares of BODYFAT about its mean.
1 - sum(residuals(rt)^2) / with(body, sum((BODYFAT - mean(BODYFAT))^2))
## [1] 0.7354195
printcp(rt)
## 
## Regression tree:
## rpart(formula = BODYFAT ~ AGE + WEIGHT + HEIGHT + CHEST + ABDOMEN + 
##     HIP + THIGH, data = body)
## 
## Variables actually used in tree construction:
## [1] ABDOMEN HEIGHT 
## 
## Root node error: 17579/252 = 69.758
## 
## n= 252 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.484798      0   1.00000 1.00090 0.081056
## 2 0.094713      1   0.51520 0.58529 0.050030
## 3 0.085876      2   0.42049 0.49520 0.045670
## 4 0.024000      3   0.33461 0.40791 0.035493
## 5 0.023899      4   0.31061 0.40456 0.033840
## 6 0.012125      5   0.28672 0.37648 0.031886
## 7 0.010009      6   0.27459 0.37376 0.028851
## 8 0.010000      7   0.26458 0.37867 0.028838
rt = rpart(BODYFAT ~ AGE + WEIGHT + HEIGHT + CHEST + ABDOMEN + HIP + THIGH, data = body, cp = 0.0122)
plot(rt)
text(rt)
library(DAAG)
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 3.3.2

plot of chunk unnamed-chunk-3

data(spam7)
spam = spam7
library(rpart)
sprt = rpart(yesno ~ crl.tot + dollar + bang + money + n000 + make, method = "class", data = spam)
plot(sprt)
text(sprt)

plot of chunk unnamed-chunk-3

sprt
## n= 4601 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 4601 1813 n (0.6059552 0.3940448)  
##    2) dollar< 0.0555 3471  816 n (0.7649092 0.2350908)  
##      4) bang< 0.0915 2420  246 n (0.8983471 0.1016529) *
##      5) bang>=0.0915 1051  481 y (0.4576594 0.5423406)  
##       10) crl.tot< 85.5 535  175 n (0.6728972 0.3271028)  
##         20) bang< 0.7735 418  106 n (0.7464115 0.2535885) *
##         21) bang>=0.7735 117   48 y (0.4102564 0.5897436)  
##           42) crl.tot< 17 43   12 n (0.7209302 0.2790698) *
##           43) crl.tot>=17 74   17 y (0.2297297 0.7702703) *
##       11) crl.tot>=85.5 516  121 y (0.2344961 0.7655039) *
##    3) dollar>=0.0555 1130  133 y (0.1176991 0.8823009) *
printcp(sprt)
## 
## Classification tree:
## rpart(formula = yesno ~ crl.tot + dollar + bang + money + n000 + 
##     make, data = spam, method = "class")
## 
## Variables actually used in tree construction:
## [1] bang    crl.tot dollar 
## 
## Root node error: 1813/4601 = 0.39404
## 
## n= 4601 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.476558      0   1.00000 1.00000 0.018282
## 2 0.075565      1   0.52344 0.55323 0.015447
## 3 0.011583      3   0.37231 0.38720 0.013453
## 4 0.010480      4   0.36073 0.37728 0.013310
## 5 0.010000      5   0.35025 0.37617 0.013294
sprt = rpart(yesno ~ crl.tot + dollar + bang + money + n000 + make, method = "class", cp = 0.001, data = spam)
printcp(sprt)
## 
## Classification tree:
## rpart(formula = yesno ~ crl.tot + dollar + bang + money + n000 + 
##     make, data = spam, method = "class", cp = 0.001)
## 
## Variables actually used in tree construction:
## [1] bang    crl.tot dollar  money   n000   
## 
## Root node error: 1813/4601 = 0.39404
## 
## n= 4601 
## 
##           CP nsplit rel error  xerror     xstd
## 1  0.4765582      0   1.00000 1.00000 0.018282
## 2  0.0755654      1   0.52344 0.55985 0.015514
## 3  0.0115830      3   0.37231 0.38776 0.013461
## 4  0.0104799      4   0.36073 0.38224 0.013382
## 5  0.0063431      5   0.35025 0.36073 0.013065
## 6  0.0055157     10   0.31660 0.34859 0.012879
## 7  0.0044126     11   0.31109 0.34197 0.012775
## 8  0.0038610     12   0.30667 0.33370 0.012643
## 9  0.0027579     16   0.29123 0.32543 0.012509
## 10 0.0022063     17   0.28847 0.31991 0.012418
## 11 0.0019305     18   0.28627 0.32212 0.012455
## 12 0.0016547     20   0.28240 0.32377 0.012482
## 13 0.0010000     25   0.27413 0.32432 0.012491
sprt = rpart(yesno ~ crl.tot + dollar + bang + money + n000 + make, method = "class", cp = 0.0028, data = spam)
plot(sprt)
text(sprt)

plot of chunk unnamed-chunk-3

x0 = data.frame(crl.tot = 100, dollar = 3, bang = 0.33, money = 1.2, n000 = 0, make = .3)
predict(sprt, x0)
##            n        y
## 1 0.04916201 0.950838
# Fitted probability of class "y" for every row of spam. The column is chosen
# by its class label rather than by position, so the result does not depend on
# the ordering of the factor levels.
y.tr <- predict(sprt, spam)[, "y"]
#We can form the confusion matrix with these predicted probabilities: 
# Tabulate confusion-matrix counts at each threshold.
#
# Args:
#   y:     true labels, 0/1 or logical (any non-zero value counts as positive).
#   yhat:  predicted scores, one per element of y.
#   thres: numeric vector of cut-offs; an observation is predicted positive
#          when yhat > threshold.
#
# Returns:
#   A length(thres) x 4 numeric matrix with columns
#     a = negatives predicted negative, b = negatives predicted positive,
#     c = positives predicted negative, d = positives predicted positive.
#   An empty thres yields a 0 x 4 matrix.
confusion <- function(y, yhat, thres) {
  # The label masks do not depend on the threshold, so compute them once.
  is_neg <- !y
  is_pos <- !is_neg
  conf <- matrix(0, nrow = length(thres), ncol = 4)
  colnames(conf) <- c("a", "b", "c", "d")
  # seq_along() instead of 1:n: with an empty thres the loop is skipped rather
  # than trying to fill row 1 of a zero-row matrix.
  for (i in seq_along(thres)) {
    above <- yhat > thres[i]
    below <- yhat <= thres[i]
    conf[i, ] <- c(
      sum(is_neg & below),
      sum(is_neg & above),
      sum(is_pos & below),
      sum(is_pos & above)
    )
  }
  conf
}
# Candidate cut-offs from 0.05 to 0.95 in steps of 0.05.
v <- seq(0.05, 0.95, by = 0.05)
# Recode the response as 1 where yesno is "y" and 0 otherwise.
y <- as.numeric(spam$yesno == "y")
tree.conf <- confusion(y, y.tr, v)
# Total misclassification count (columns "b" and "c") against the cut-off.
plot(
  v, tree.conf[, "b"] + tree.conf[, "c"],
  xlab = "threshold", ylab = "b+c", type = "l"
)

plot of chunk unnamed-chunk-3

#Choose cut-off = 0.5
# Read the counts from tree.conf instead of hard-coding them, so precision and
# recall stay in step with the fitted tree. The row is located by nearest value
# because the elements of seq() need not compare exactly equal to 0.5.
at_half <- tree.conf[which.min(abs(v - 0.5)), ]
precision <- at_half[["d"]] / (at_half[["b"]] + at_half[["d"]]) #d/(b+d)
recall <- at_half[["d"]] / (at_half[["c"]] + at_half[["d"]]) #d/(c+d)
c(precision, recall)
## [1] 0.8424419 0.7992278
# Reload the body-fat data before fitting the random forest.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
body <- read.csv(file.path(dataDir, "bodyfat_short.csv"), header = TRUE)
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
ft = randomForest(BODYFAT ~ AGE + WEIGHT + HEIGHT + CHEST + ABDOMEN + HIP + THIGH, data = body, importance = TRUE)
ft
## 
## Call:
##  randomForest(formula = BODYFAT ~ AGE + WEIGHT + HEIGHT + CHEST +      ABDOMEN + HIP + THIGH, data = body, importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##           Mean of squared residuals: 22.67328
##                     % Var explained: 67.5
importance(ft)
##           %IncMSE IncNodePurity
## AGE      9.163072      1081.715
## WEIGHT  13.375800      1960.025
## HEIGHT  11.878048      1238.542
## CHEST   16.350402      2941.289
## ABDOMEN 36.497968      6184.717
## HIP     12.958616      2095.318
## THIGH   12.658238      1519.375
x0 = data.frame(AGE = 40, WEIGHT = 170, HEIGHT = 76, CHEST = 120, ABDOMEN = 100, HIP = 101, THIGH = 60)
predict(ft, x0)
##        1 
## 23.90053
library(DAAG)
data(spam7)
spam = spam7
sprf = randomForest(as.factor(yesno) ~ crl.tot + dollar + bang + money + n000 + make, data = spam)
sprf
## 
## Call:
##  randomForest(formula = as.factor(yesno) ~ crl.tot + dollar +      bang + money + n000 + make, data = spam) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 11.82%
## Confusion matrix:
##      n    y class.error
## n 2645  143  0.05129125
## y  401 1412  0.22118036
#Prediction:
x0 = data.frame(crl.tot = 100, dollar = 3, bang = 0.33, money = 1.2, n000 = 0, make = 0.3)
predict(sprf, x0)
## 1 
## y 
## Levels: n y