dataDir <- "../../finalDataSets"
body = read.csv(file.path(dataDir, "bodyfat_short.csv"), header = T)
Regression Tree. The function we will use is rpart(), short for recursive partitioning; it is available in the R package rpart.
library(rpart)
## Warning: package 'rpart' was built under R version 3.3.2
rt = rpart(BODYFAT ~ AGE + WEIGHT + HEIGHT + CHEST + ABDOMEN + HIP + THIGH, data = body)
rt
## n= 252
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 252 17578.99000 19.150790
## 2) ABDOMEN< 91.9 132 4698.25500 13.606060
## 4) ABDOMEN< 85.45 66 1303.62400 10.054550
## 8) ABDOMEN< 75.5 7 113.54860 5.314286 *
## 9) ABDOMEN>=75.5 59 1014.12300 10.616950 *
## 5) ABDOMEN>=85.45 66 1729.68100 17.157580
## 10) HEIGHT>=71.875 19 407.33790 13.189470 *
## 11) HEIGHT< 71.875 47 902.23110 18.761700 *
## 3) ABDOMEN>=91.9 120 4358.48000 25.250000
## 6) ABDOMEN< 103 81 1752.42000 22.788890 *
## 7) ABDOMEN>=103 39 1096.45200 30.361540
## 14) ABDOMEN< 112.3 28 413.60000 28.300000
## 28) HEIGHT>=72.125 8 89.39875 23.937500 *
## 29) HEIGHT< 72.125 20 111.04950 30.045000 *
## 15) ABDOMEN>=112.3 11 260.94910 35.609090 *
plot(rt)
text(rt)
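The default tree plot can be cramped; a sketch of a somewhat more readable version, using optional arguments of plot.rpart() and text.rpart() (uniform vertical spacing, a margin around the plot, and node sample sizes):
plot(rt, uniform = TRUE, margin = 0.1)
text(rt, use.n = TRUE, cex = 0.8)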
# Deviance at the two children of the root, as a fraction of the root deviance
# (the relative error after one split):
(4698.255 + 4358.48)/17578.99
## [1] 0.5152022
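The two deviances added here are those of nodes 2 and 3, the children of the root, so the ratio is the fraction of the root deviance remaining after the first split. A sketch pulling the same numbers out of the fitted object (the rownames of rt$frame are the node numbers):
dev = rt$frame$dev            # deviance at every node of the tree
names(dev) = rownames(rt$frame)
unname((dev["2"] + dev["3"]) / dev["1"])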
scorecard = read.csv(file.path(dataDir, "college_short.csv"))
Variable Names
names(scorecard)
## [1] "SAT_AVG_ALL" "AVGFACSAL" "TUITIONFEE_IN" "TUITIONFEE_OUT"
## [5] "UGDS" "RET_FT4" "PCTFLOAN" "PFTFAC"
## [9] "TYPE"
req = rpart(RET_FT4 ~ TUITIONFEE_OUT + as.factor(TYPE), data = scorecard)
plot(req)
text(req)
req
## n= 1241
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 1241 16.8642000 0.7609155
## 2) TUITIONFEE_OUT< 34290 1005 11.1054700 0.7312472
## 4) TUITIONFEE_OUT< 26855 731 8.1311670 0.7128963
## 8) as.factor(TYPE)=2,3 345 4.0915040 0.6787971
## 16) TUITIONFEE_OUT< 18785 100 1.3470310 0.6421740 *
## 17) TUITIONFEE_OUT>=18785 245 2.5556030 0.6937453 *
## 9) as.factor(TYPE)=1 386 3.2799700 0.7433736
## 18) TUITIONFEE_OUT< 16097.5 118 1.0737510 0.7004661 *
## 19) TUITIONFEE_OUT>=16097.5 268 1.8933230 0.7622657 *
## 5) TUITIONFEE_OUT>=26855 274 2.0713890 0.7802051
## 10) as.factor(TYPE)=2 225 1.4400930 0.7613084 *
## 11) as.factor(TYPE)=1,3 49 0.1820255 0.8669755 *
## 3) TUITIONFEE_OUT>=34290 236 1.1070330 0.8872572
## 6) TUITIONFEE_OUT< 41516 142 0.5717404 0.8537951 *
## 7) TUITIONFEE_OUT>=41516 94 0.1361027 0.9378064 *
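A split such as as.factor(TYPE)=2,3 sends the schools whose TYPE is 2 or 3 down that branch; rpart splits on a factor by grouping its levels. A quick look at how the levels are distributed (a sketch):
table(scorecard$TYPE)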
# Back to the body fat data: refit the full regression tree.
rt = rpart(BODYFAT ~ AGE + WEIGHT + HEIGHT + CHEST + ABDOMEN + HIP + THIGH, data = body)
#"R^2" for regression tree:
1 - (sum(residuals(rt)^2))/(sum((body$BODYFAT- mean(body$BODYFAT))^2))
## [1] 0.7354195
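This "R^2" equals one minus the residual deviance of the tree as a fraction of the root deviance, which is the final rel error in the printcp() output below; a sketch of that check:
1 - rt$cptable[nrow(rt$cptable), "rel error"]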
printcp(rt)
##
## Regression tree:
## rpart(formula = BODYFAT ~ AGE + WEIGHT + HEIGHT + CHEST + ABDOMEN +
## HIP + THIGH, data = body)
##
## Variables actually used in tree construction:
## [1] ABDOMEN HEIGHT
##
## Root node error: 17579/252 = 69.758
##
## n= 252
##
## CP nsplit rel error xerror xstd
## 1 0.484798 0 1.00000 1.00090 0.081056
## 2 0.094713 1 0.51520 0.58529 0.050030
## 3 0.085876 2 0.42049 0.49520 0.045670
## 4 0.024000 3 0.33461 0.40791 0.035493
## 5 0.023899 4 0.31061 0.40456 0.033840
## 6 0.012125 5 0.28672 0.37648 0.031886
## 7 0.010009 6 0.27459 0.37376 0.028851
## 8 0.010000 7 0.26458 0.37867 0.028838
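Here rel error is the training error relative to the root, and xerror is its cross-validated counterpart. Instead of refitting with a chosen cp, one can also prune the tree already grown; a sketch of the usual recipe, taking the cp value with the smallest cross-validated error (variable names here are just for illustration):
cp.min = rt$cptable[which.min(rt$cptable[, "xerror"]), "CP"]
rt.pruned = prune(rt, cp = cp.min)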
rt = rpart(BODYFAT ~ AGE + WEIGHT + HEIGHT + CHEST + ABDOMEN + HIP + THIGH, data = body, cp = 0.0122)
plot(rt)
text(rt)
Classification Tree. For classification we use the spam7 data from the R package DAAG.
library(DAAG)
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 3.3.2
data(spam7)
spam = spam7
library(rpart)
sprt = rpart(yesno ~ crl.tot + dollar + bang + money + n000 + make, method = "class", data = spam)
plot(sprt)
text(sprt)
sprt
## n= 4601
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 4601 1813 n (0.6059552 0.3940448)
## 2) dollar< 0.0555 3471 816 n (0.7649092 0.2350908)
## 4) bang< 0.0915 2420 246 n (0.8983471 0.1016529) *
## 5) bang>=0.0915 1051 481 y (0.4576594 0.5423406)
## 10) crl.tot< 85.5 535 175 n (0.6728972 0.3271028)
## 20) bang< 0.7735 418 106 n (0.7464115 0.2535885) *
## 21) bang>=0.7735 117 48 y (0.4102564 0.5897436)
## 42) crl.tot< 17 43 12 n (0.7209302 0.2790698) *
## 43) crl.tot>=17 74 17 y (0.2297297 0.7702703) *
## 11) crl.tot>=85.5 516 121 y (0.2344961 0.7655039) *
## 3) dollar>=0.0555 1130 133 y (0.1176991 0.8823009) *
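In this printout, loss is the number of observations that the node's majority label misclassifies; at the root, labelling every message n misclassifies the 1813 spam messages. A quick check of the class counts (a sketch):
table(spam$yesno)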
printcp(sprt)
##
## Classification tree:
## rpart(formula = yesno ~ crl.tot + dollar + bang + money + n000 +
## make, data = spam, method = "class")
##
## Variables actually used in tree construction:
## [1] bang crl.tot dollar
##
## Root node error: 1813/4601 = 0.39404
##
## n= 4601
##
## CP nsplit rel error xerror xstd
## 1 0.476558 0 1.00000 1.00000 0.018282
## 2 0.075565 1 0.52344 0.55323 0.015447
## 3 0.011583 3 0.37231 0.38720 0.013453
## 4 0.010480 4 0.36073 0.37728 0.013310
## 5 0.010000 5 0.35025 0.37617 0.013294
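The cross-validated errors can also be inspected graphically with plotcp(), which plots xerror against cp (a sketch, assuming the fit above):
plotcp(sprt)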
sprt = rpart(yesno ~ crl.tot + dollar + bang + money + n000 + make, method = "class", cp = 0.001, data = spam)
printcp(sprt)
##
## Classification tree:
## rpart(formula = yesno ~ crl.tot + dollar + bang + money + n000 +
## make, data = spam, method = "class", cp = 0.001)
##
## Variables actually used in tree construction:
## [1] bang crl.tot dollar money n000
##
## Root node error: 1813/4601 = 0.39404
##
## n= 4601
##
## CP nsplit rel error xerror xstd
## 1 0.4765582 0 1.00000 1.00000 0.018282
## 2 0.0755654 1 0.52344 0.55985 0.015514
## 3 0.0115830 3 0.37231 0.38776 0.013461
## 4 0.0104799 4 0.36073 0.38224 0.013382
## 5 0.0063431 5 0.35025 0.36073 0.013065
## 6 0.0055157 10 0.31660 0.34859 0.012879
## 7 0.0044126 11 0.31109 0.34197 0.012775
## 8 0.0038610 12 0.30667 0.33370 0.012643
## 9 0.0027579 16 0.29123 0.32543 0.012509
## 10 0.0022063 17 0.28847 0.31991 0.012418
## 11 0.0019305 18 0.28627 0.32212 0.012455
## 12 0.0016547 20 0.28240 0.32377 0.012482
## 13 0.0010000 25 0.27413 0.32432 0.012491
sprt = rpart(yesno ~ crl.tot + dollar + bang + money + n000 + make, method = "class", cp = 0.0028, data = spam)
plot(sprt)
text(sprt)
x0 = data.frame(crl.tot = 100, dollar = 3, bang = 0.33, money = 1.2, n000 = 0, make = .3)
predict(sprt, x0)
## n y
## 1 0.04916201 0.950838
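By default, predict() on a classification tree returns the class probabilities; asking for type = "class" gives the predicted label directly (a sketch, with the same x0):
predict(sprt, x0, type = "class")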
y.tr = predict(sprt, spam)[,2]
#We can form the confusion matrix with these predicted probabilities:
# Confusion counts over a grid of thresholds. For each threshold:
#   a = true negatives, b = false positives, c = false negatives, d = true positives
confusion <- function (y, yhat, thres)
{
  n <- length(thres)
  conf <- matrix(0, nrow = n, ncol = 4)
  colnames(conf) <- c("a", "b", "c", "d")
  for (i in 1:n)
  {
    a <- sum((!y) & (yhat <= thres[i]))  # truly not spam, predicted not spam
    b <- sum((!y) & (yhat > thres[i]))   # truly not spam, predicted spam
    c <- sum((y) & (yhat <= thres[i]))   # truly spam, predicted not spam
    d <- sum((y) & (yhat > thres[i]))    # truly spam, predicted spam
    conf[i, ] <- c(a, b, c, d)
  }
  return(conf)
}
v = seq(0.05, 0.95, by = 0.05)
y = as.numeric(spam$yesno == "y")
tree.conf = confusion(y, y.tr, v)
plot(v,tree.conf[,2]+tree.conf[,3], xlab="threshold", ylab="b+c", type = "l")
# Choose cut-off = 0.5; the counts below are read off the confusion matrix at this threshold
precision = 1449/(271 + 1449) # d/(b + d)
recall = 1449/(364 + 1449) # d/(c + d)
c(precision, recall)
## [1] 0.8424419 0.7992278
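The counts above were typed in by hand; a sketch computing the same precision and recall directly from the confusion matrix at threshold 0.5 (conf50 is just an illustrative name):
conf50 = confusion(y, y.tr, 0.5)
precision = conf50[1, "d"] / (conf50[1, "b"] + conf50[1, "d"])
recall = conf50[1, "d"] / (conf50[1, "c"] + conf50[1, "d"])
c(precision, recall)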
Random Forest. The function we will use is randomForest(), from the R package randomForest.
body = read.csv(file.path(dataDir, "bodyfat_short.csv"), header = T)
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
ft = randomForest(BODYFAT ~ AGE + WEIGHT + HEIGHT + CHEST + ABDOMEN + HIP + THIGH, data = body, importance = TRUE)
ft
##
## Call:
## randomForest(formula = BODYFAT ~ AGE + WEIGHT + HEIGHT + CHEST + ABDOMEN + HIP + THIGH, data = body, importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 2
##
## Mean of squared residuals: 22.67328
## % Var explained: 67.5
importance(ft)
## %IncMSE IncNodePurity
## AGE 9.163072 1081.715
## WEIGHT 13.375800 1960.025
## HEIGHT 11.878048 1238.542
## CHEST 16.350402 2941.289
## ABDOMEN 36.497968 6184.717
## HIP 12.958616 2095.318
## THIGH 12.658238 1519.375
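The same importance measures can be shown as dotcharts with varImpPlot() (a sketch, assuming the fit above):
varImpPlot(ft)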
x0 = data.frame(AGE = 40, WEIGHT = 170, HEIGHT = 76, CHEST = 120, ABDOMEN = 100, HIP = 101, THIGH = 60)
predict(ft, x0)
## 1
## 23.90053
library(DAAG)
data(spam7)
spam = spam7
sprf = randomForest(as.factor(yesno) ~ crl.tot + dollar + bang + money + n000 + make, data = spam)
sprf
##
## Call:
## randomForest(formula = as.factor(yesno) ~ crl.tot + dollar + bang + money + n000 + make, data = spam)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 11.82%
## Confusion matrix:
## n y class.error
## n 2645 143 0.05129125
## y 401 1412 0.22118036
#Prediction:
x0 = data.frame(crl.tot = 100, dollar = 3, bang = 0.33, money = 1.2, n000 = 0, make = 0.3)
predict(sprf, x0)
## 1
## y
## Levels: n y
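predict() on a classification forest returns the majority-vote label by default; asking for type = "prob" shows the estimated class probabilities instead (a sketch, with the same x0):
predict(sprf, x0, type = "prob")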