Housing price data from Ames, Iowa. The full dataset is available at https://ww2.amstat.org/publications/jse/v19n3/decock/AmesHousing.txt A description of the full dataset is at https://ww2.amstat.org/publications/jse/v19n3/decock/DataDocumentation.txt We will be working with a slightly smaller version of the full dataset. Set the right working directory (you need to change the directory to the local directory in your machine containing the data)

dataDir<-"../../finalDataSets"

Read the dataset:

dd = read.csv(file.path(dataDir, "Ames_Short.csv"), header = T)

The size of the dataset (number of rows and columns) can be obtained by

dim(dd)

## [1] 1314    7

Variable names:

names(dd)

## [1] "Lot.Area"      "Total.Bsmt.SF" "Gr.Liv.Area"   "Garage.Cars"  
## [5] "Fireplaces.YN" "Year.Built"    "SalePrice"

head(dd)

##   Lot.Area Total.Bsmt.SF Gr.Liv.Area Garage.Cars Fireplaces.YN Year.Built
## 1    11622           882         896           1             N       1961
## 2    14267          1329        1329           1             N       1958
## 3     4920          1338        1338           2             N       2001
## 4     5005          1280        1280           2             N       1992
## 5     7980          1168        1187           2             N       1992
## 6     8402           789        1465           2             Y       1998
##   SalePrice
## 1    105000
## 2    172000
## 3    213500
## 4    191500
## 5    185000
## 6    180400

summary(dd)

##     Lot.Area     Total.Bsmt.SF     Gr.Liv.Area    Garage.Cars  
##  Min.   : 1300   Min.   : 105.0   Min.   : 438   Min.   :0.00  
##  1st Qu.: 6346   1st Qu.: 732.0   1st Qu.: 984   1st Qu.:1.00  
##  Median : 8472   Median : 923.0   Median :1151   Median :2.00  
##  Mean   : 8515   Mean   : 938.1   Mean   :1149   Mean   :1.49  
##  3rd Qu.:10200   3rd Qu.:1138.0   3rd Qu.:1344   3rd Qu.:2.00  
##  Max.   :41600   Max.   :1645.0   Max.   :1500   Max.   :5.00  
##  Fireplaces.YN   Year.Built     SalePrice     
##  N:830         Min.   :1875   Min.   : 35000  
##  Y:484         1st Qu.:1950   1st Qu.:120000  
##                Median :1966   Median :138750  
##                Mean   :1964   Mean   :141447  
##                3rd Qu.:1981   3rd Qu.:160500  
##                Max.   :2010   Max.   :290000

Dataset can also be found here \url{https://archive.ics.uci.edu/ml/datasets/Bike+Sharing+Dataset} Read the data

bike <- read.csv(file.path(dataDir, "BikeSharingDataset.csv"))

dim(bike)

## [1] 17379    17

names(bike)

##  [1] "instant"    "dteday"     "season"     "yr"         "mnth"      
##  [6] "hr"         "holiday"    "weekday"    "workingday" "weathersit"
## [11] "temp"       "atemp"      "hum"        "windspeed"  "casual"    
## [16] "registered" "cnt"

head(bike)

##   instant     dteday season yr mnth hr holiday weekday workingday
## 1       1 2011-01-01      1  0    1  0       0       6          0
## 2       2 2011-01-01      1  0    1  1       0       6          0
## 3       3 2011-01-01      1  0    1  2       0       6          0
## 4       4 2011-01-01      1  0    1  3       0       6          0
## 5       5 2011-01-01      1  0    1  4       0       6          0
## 6       6 2011-01-01      1  0    1  5       0       6          0
##   weathersit temp  atemp  hum windspeed casual registered cnt
## 1          1 0.24 0.2879 0.81    0.0000      3         13  16
## 2          1 0.22 0.2727 0.80    0.0000      8         32  40
## 3          1 0.22 0.2727 0.80    0.0000      5         27  32
## 4          1 0.24 0.2879 0.75    0.0000      3         10  13
## 5          1 0.24 0.2879 0.75    0.0000      0          1   1
## 6          2 0.24 0.2576 0.75    0.0896      0          1   1

summary(bike)

##     instant             dteday          season            yr        
##  Min.   :    1   2011-01-01:   24   Min.   :1.000   Min.   :0.0000  
##  1st Qu.: 4346   2011-01-08:   24   1st Qu.:2.000   1st Qu.:0.0000  
##  Median : 8690   2011-01-09:   24   Median :3.000   Median :1.0000  
##  Mean   : 8690   2011-01-10:   24   Mean   :2.502   Mean   :0.5026  
##  3rd Qu.:13034   2011-01-13:   24   3rd Qu.:3.000   3rd Qu.:1.0000  
##  Max.   :17379   2011-01-15:   24   Max.   :4.000   Max.   :1.0000  
##                  (Other)   :17235                                   
##       mnth              hr           holiday           weekday     
##  Min.   : 1.000   Min.   : 0.00   Min.   :0.00000   Min.   :0.000  
##  1st Qu.: 4.000   1st Qu.: 6.00   1st Qu.:0.00000   1st Qu.:1.000  
##  Median : 7.000   Median :12.00   Median :0.00000   Median :3.000  
##  Mean   : 6.538   Mean   :11.55   Mean   :0.02877   Mean   :3.004  
##  3rd Qu.:10.000   3rd Qu.:18.00   3rd Qu.:0.00000   3rd Qu.:5.000  
##  Max.   :12.000   Max.   :23.00   Max.   :1.00000   Max.   :6.000  
##                                                                    
##    workingday       weathersit         temp           atemp       
##  Min.   :0.0000   Min.   :1.000   Min.   :0.020   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:1.000   1st Qu.:0.340   1st Qu.:0.3333  
##  Median :1.0000   Median :1.000   Median :0.500   Median :0.4848  
##  Mean   :0.6827   Mean   :1.425   Mean   :0.497   Mean   :0.4758  
##  3rd Qu.:1.0000   3rd Qu.:2.000   3rd Qu.:0.660   3rd Qu.:0.6212  
##  Max.   :1.0000   Max.   :4.000   Max.   :1.000   Max.   :1.0000  
##                                                                   
##       hum           windspeed          casual         registered   
##  Min.   :0.0000   Min.   :0.0000   Min.   :  0.00   Min.   :  0.0  
##  1st Qu.:0.4800   1st Qu.:0.1045   1st Qu.:  4.00   1st Qu.: 34.0  
##  Median :0.6300   Median :0.1940   Median : 17.00   Median :115.0  
##  Mean   :0.6272   Mean   :0.1901   Mean   : 35.68   Mean   :153.8  
##  3rd Qu.:0.7800   3rd Qu.:0.2537   3rd Qu.: 48.00   3rd Qu.:220.0  
##  Max.   :1.0000   Max.   :0.8507   Max.   :367.00   Max.   :886.0  
##                                                                    
##       cnt       
##  Min.   :  1.0  
##  1st Qu.: 40.0  
##  Median :142.0  
##  Mean   :189.5  
##  3rd Qu.:281.0  
##  Max.   :977.0  
##

https://collegescorecard.ed.gov/assets/CollegeScorecardDataDictionary-10-26-2016.xlsx https://collegescorecard.ed.gov/assets/FullDataDocumentation.pdf

scorecard = read.csv(file.path(dataDir, "college_short.csv"))

dim(scorecard)

## [1] 1241    9

names(scorecard)

## [1] "SAT_AVG_ALL"    "AVGFACSAL"      "TUITIONFEE_IN"  "TUITIONFEE_OUT"
## [5] "UGDS"           "RET_FT4"        "PCTFLOAN"       "PFTFAC"        
## [9] "TYPE"

head(scorecard)

##   SAT_AVG_ALL AVGFACSAL TUITIONFEE_IN TUITIONFEE_OUT  UGDS RET_FT4
## 1         823      7079          7182          12774  4051  0.6314
## 2        1146     10170          7206          16398 11200  0.8016
## 3        1180      9341          9192          21506  5525  0.8098
## 4         830      6557          8720          15656  5354  0.6219
## 5        1171      9605          9450          23950 28692  0.8700
## 6        1215      9429          9852          26364 19761  0.8946
##   PCTFLOAN PFTFAC TYPE
## 1   0.8204 0.8856    1
## 2   0.5397 0.9106    1
## 3   0.4728 0.6555    1
## 4   0.8735 0.6641    1
## 5   0.4148 0.7109    1
## 6   0.3610 0.8780    1

summary(scorecard)

##   SAT_AVG_ALL     AVGFACSAL     TUITIONFEE_IN   TUITIONFEE_OUT 
##  Min.   : 666   Min.   : 1476   Min.   : 2082   Min.   : 3850  
##  1st Qu.: 980   1st Qu.: 6152   1st Qu.: 9356   1st Qu.:18395  
##  Median :1050   Median : 7280   Median :22290   Median :24566  
##  Mean   :1067   Mean   : 7594   Mean   :21708   Mean   :25651  
##  3rd Qu.:1137   3rd Qu.: 8626   3rd Qu.:30990   3rd Qu.:31500  
##  Max.   :1534   Max.   :19862   Max.   :49138   Max.   :49138  
##       UGDS          RET_FT4          PCTFLOAN          PFTFAC      
##  Min.   :   82   Min.   :0.0000   Min.   :0.0000   Min.   :0.0403  
##  1st Qu.: 1313   1st Qu.:0.6852   1st Qu.:0.4922   1st Qu.:0.5281  
##  Median : 2466   Median :0.7640   Median :0.6133   Median :0.7111  
##  Mean   : 5545   Mean   :0.7609   Mean   :0.5990   Mean   :0.7037  
##  3rd Qu.: 6308   3rd Qu.:0.8457   3rd Qu.:0.7209   3rd Qu.:0.9363  
##  Max.   :50919   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##       TYPE      
##  Min.   :1.000  
##  1st Qu.:1.000  
##  Median :2.000  
##  Mean   :1.646  
##  3rd Qu.:2.000  
##  Max.   :3.000

Explanation of the dataset can be found at http://lib.stat.cmu.edu/datasets/bodyfat

body = read.csv(file.path(dataDir, "bodyfat_short.csv"), header = T)

dim(body)

## [1] 252   8

names(body)

## [1] "BODYFAT" "AGE"     "WEIGHT"  "HEIGHT"  "CHEST"   "ABDOMEN" "HIP"    
## [8] "THIGH"

head(body)

##   BODYFAT AGE WEIGHT HEIGHT CHEST ABDOMEN   HIP THIGH
## 1    12.3  23 154.25  67.75  93.1    85.2  94.5  59.0
## 2     6.1  22 173.25  72.25  93.6    83.0  98.7  58.7
## 3    25.3  22 154.00  66.25  95.8    87.9  99.2  59.6
## 4    10.4  26 184.75  72.25 101.8    86.4 101.2  60.1
## 5    28.7  24 184.25  71.25  97.3   100.0 101.9  63.2
## 6    20.9  24 210.25  74.75 104.5    94.4 107.8  66.0

summary(body)

##     BODYFAT           AGE            WEIGHT          HEIGHT     
##  Min.   : 0.00   Min.   :22.00   Min.   :118.5   Min.   :29.50  
##  1st Qu.:12.47   1st Qu.:35.75   1st Qu.:159.0   1st Qu.:68.25  
##  Median :19.20   Median :43.00   Median :176.5   Median :70.00  
##  Mean   :19.15   Mean   :44.88   Mean   :178.9   Mean   :70.15  
##  3rd Qu.:25.30   3rd Qu.:54.00   3rd Qu.:197.0   3rd Qu.:72.25  
##  Max.   :47.50   Max.   :81.00   Max.   :363.1   Max.   :77.75  
##      CHEST           ABDOMEN            HIP            THIGH      
##  Min.   : 79.30   Min.   : 69.40   Min.   : 85.0   Min.   :47.20  
##  1st Qu.: 94.35   1st Qu.: 84.58   1st Qu.: 95.5   1st Qu.:56.00  
##  Median : 99.65   Median : 90.95   Median : 99.3   Median :59.00  
##  Mean   :100.82   Mean   : 92.56   Mean   : 99.9   Mean   :59.41  
##  3rd Qu.:105.38   3rd Qu.: 99.33   3rd Qu.:103.5   3rd Qu.:62.35  
##  Max.   :136.20   Max.   :148.10   Max.   :147.7   Max.   :87.30

This is a dataset from the book “Introductory Econometrics: A Modern Approach” by Wooldridge.

load(file.path(dataDir, "campus.Rdata"))

campus = data

campus.desc = desc

This gives data for 97 colleges and universities for the year 1992.

#campus

summary(campus)

##      enroll           priv            police          crime       
##  Min.   : 1799   Min.   :0.0000   Min.   : 1.00   Min.   :   1.0  
##  1st Qu.: 6485   1st Qu.:0.0000   1st Qu.: 9.00   1st Qu.:  85.0  
##  Median :11990   Median :0.0000   Median :16.00   Median : 187.0  
##  Mean   :16076   Mean   :0.1237   Mean   :20.49   Mean   : 394.5  
##  3rd Qu.:21836   3rd Qu.:0.0000   3rd Qu.:27.00   3rd Qu.: 491.0  
##  Max.   :56350   Max.   :1.0000   Max.   :74.00   Max.   :2052.0  
##      lcrime         lenroll          lpolice     
##  Min.   :0.000   Min.   : 7.495   Min.   :0.000  
##  1st Qu.:4.443   1st Qu.: 8.777   1st Qu.:2.197  
##  Median :5.231   Median : 9.392   Median :2.773  
##  Mean   :5.277   Mean   : 9.379   Mean   :2.731  
##  3rd Qu.:6.196   3rd Qu.: 9.991   3rd Qu.:3.296  
##  Max.   :7.627   Max.   :10.939   Max.   :4.304

plot(crime ~ police, data = campus)

plot of chunk unnamed-chunk-27

m1 = lm(crime ~ police, data = campus)
summary(m1)

## 
## Call:
## lm(formula = crime ~ police, data = campus)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1024.79  -152.96   -35.38    89.48  1540.16 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -42.557     53.730  -0.792     0.43    
## police        21.323      2.089  10.210   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 319.9 on 95 degrees of freedom
## Multiple R-squared:  0.5232, Adjusted R-squared:  0.5182 
## F-statistic: 104.2 on 1 and 95 DF,  p-value: < 2.2e-16

plot(crime ~ police, data = campus)
abline(m1)

plot of chunk unnamed-chunk-28

m2 = lm(lcrime ~ lpolice, data = campus)
summary(m2)

## 
## Call:
## lm(formula = lcrime ~ lpolice, data = campus)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.0819 -0.3726  0.0857  0.6388  2.0967 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.0643     0.3641   5.670 1.53e-07 ***
## lpolice       1.1765     0.1279   9.197 8.62e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.01 on 95 degrees of freedom
## Multiple R-squared:  0.471,  Adjusted R-squared:  0.4654 
## F-statistic: 84.58 on 1 and 95 DF,  p-value: 8.621e-15

plot(lcrime ~ lpolice, data = campus)
abline(m2)

plot of chunk unnamed-chunk-29

y = campus$lcrime
x = campus$lpolice
nrep = 100
bslope = rep(-999, nrep)
bint = rep(-999, nrep)
for(i in 1:nrep)
    {

        sampled = sample(1:length(y), size = length(y), replace = T)
        bslope[i] = coef(lm(y[sampled] ~ x[sampled]))[2]
        bint[i] = coef(lm(y[sampled] ~ x[sampled]))[1]
        }
#quartz()
#par(mfrow = c(1, 2))
#hist(bslope, breaks = 500)
#abline(v = coef(m2)[2], col = "red")
#hist(bint, breaks = 500)
#abline(v = coef(m2)[1], col = "red")
#par(mfrow = c(1, 1))

The bootstrap standard errors are given by:

c(sd(bint), sd(bslope))

## [1] 0.4434089 0.1500200

These seem slightly larger than the standard errors outputted by the lm() function:

summary(m2)

## 
## Call:
## lm(formula = lcrime ~ lpolice, data = campus)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.0819 -0.3726  0.0857  0.6388  2.0967 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.0643     0.3641   5.670 1.53e-07 ***
## lpolice       1.1765     0.1279   9.197 8.62e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.01 on 95 degrees of freedom
## Multiple R-squared:  0.471,  Adjusted R-squared:  0.4654 
## F-statistic: 84.58 on 1 and 95 DF,  p-value: 8.621e-15

Explanation of the dataset can be found at http://lib.stat.cmu.edu/datasets/bodyfat

body <- read.csv(file.path(dataDir, "bodyfat_short.csv"), header = T)

dim(body)

## [1] 252   8

names(body)

## [1] "BODYFAT" "AGE"     "WEIGHT"  "HEIGHT"  "CHEST"   "ABDOMEN" "HIP"    
## [8] "THIGH"

head(body)

##   BODYFAT AGE WEIGHT HEIGHT CHEST ABDOMEN   HIP THIGH
## 1    12.3  23 154.25  67.75  93.1    85.2  94.5  59.0
## 2     6.1  22 173.25  72.25  93.6    83.0  98.7  58.7
## 3    25.3  22 154.00  66.25  95.8    87.9  99.2  59.6
## 4    10.4  26 184.75  72.25 101.8    86.4 101.2  60.1
## 5    28.7  24 184.25  71.25  97.3   100.0 101.9  63.2
## 6    20.9  24 210.25  74.75 104.5    94.4 107.8  66.0

summary(body)

##     BODYFAT           AGE            WEIGHT          HEIGHT     
##  Min.   : 0.00   Min.   :22.00   Min.   :118.5   Min.   :29.50  
##  1st Qu.:12.47   1st Qu.:35.75   1st Qu.:159.0   1st Qu.:68.25  
##  Median :19.20   Median :43.00   Median :176.5   Median :70.00  
##  Mean   :19.15   Mean   :44.88   Mean   :178.9   Mean   :70.15  
##  3rd Qu.:25.30   3rd Qu.:54.00   3rd Qu.:197.0   3rd Qu.:72.25  
##  Max.   :47.50   Max.   :81.00   Max.   :363.1   Max.   :77.75  
##      CHEST           ABDOMEN            HIP            THIGH      
##  Min.   : 79.30   Min.   : 69.40   Min.   : 85.0   Min.   :47.20  
##  1st Qu.: 94.35   1st Qu.: 84.58   1st Qu.: 95.5   1st Qu.:56.00  
##  Median : 99.65   Median : 90.95   Median : 99.3   Median :59.00  
##  Mean   :100.82   Mean   : 92.56   Mean   : 99.9   Mean   :59.41  
##  3rd Qu.:105.38   3rd Qu.: 99.33   3rd Qu.:103.5   3rd Qu.:62.35  
##  Max.   :136.20   Max.   :148.10   Max.   :147.7   Max.   :87.30

Notice the unusual values in the bodyfat, age, weight and height variables.

body[body$HEIGHT < 30,]

##    BODYFAT AGE WEIGHT HEIGHT CHEST ABDOMEN   HIP THIGH
## 42    32.9  44    205   29.5   106   104.3 115.5  70.6

par(mfrow = c(3, 3))
for (i in 1:8)
    {
        hist(body[,i], xlab = "", main = names(body)[i], breaks = 500)
        }
par(mfrow = c(1, 1))

plot of chunk unnamed-chunk-39

Again notice the presence of outliers.

body[body$HIP > 140,]

##    BODYFAT AGE WEIGHT HEIGHT CHEST ABDOMEN   HIP THIGH
## 39    35.2  46 363.15  72.25 136.2   148.1 147.7  87.3

Subject 39 seems to have large values in many variables.

pairs(body)

ou1 = which(body$HEIGHT < 30)
ou2 = which(body$WEIGHT > 300)
ou3 = which(body$HIP > 120)
ou = c(ou1, ou2, ou3)
pairs(body[-ou,])

plot of chunk unnamed-chunk-42

par(mfrow = c(3, 3))
for(i in 2:8)
{
    plot(body[,i], body[,1], xlab = names(body)[i], ylab = "BODYFAT")
    }
par(mfrow = c(1, 1))

plot of chunk unnamed-chunk-43

par(mfrow = c(3, 3))
for(i in 2:8)
{
    plot(body[-ou,i], body[-ou,1], xlab = names(body)[i], ylab = "BODYFAT")
    }
par(mfrow = c(1, 1))

plot of chunk unnamed-chunk-44

Multiple Regression

ft = lm(BODYFAT ~ AGE + WEIGHT + HEIGHT + CHEST + ABDOMEN + HIP + THIGH, data = body)
summary(ft)

## 
## Call:
## lm(formula = BODYFAT ~ AGE + WEIGHT + HEIGHT + CHEST + ABDOMEN + 
##     HIP + THIGH, data = body)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.0729  -3.2387  -0.0782   3.0623  10.3611 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.748e+01  1.449e+01  -2.585  0.01031 *  
## AGE          1.202e-02  2.934e-02   0.410  0.68246    
## WEIGHT      -1.392e-01  4.509e-02  -3.087  0.00225 ** 
## HEIGHT      -1.028e-01  9.787e-02  -1.051  0.29438    
## CHEST       -8.312e-04  9.989e-02  -0.008  0.99337    
## ABDOMEN      9.685e-01  8.531e-02  11.352  < 2e-16 ***
## HIP         -1.834e-01  1.448e-01  -1.267  0.20648    
## THIGH        2.857e-01  1.362e-01   2.098  0.03693 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.438 on 244 degrees of freedom
## Multiple R-squared:  0.7266, Adjusted R-squared:  0.7187 
## F-statistic: 92.62 on 7 and 244 DF,  p-value: < 2.2e-16

Sum of squares of the regression equation given by R:

eqn = -37.48 + 0.012*body$AGE - 0.139*body$WEIGHT - 0.102*body$HEIGHT - 0.0008*body$CHEST + 0.968*body$ABDOMEN - 0.183*body$HIP + 0.286*body$THIGH 
sum.sq = sum((body$BODYFAT - eqn)^2)
sum.sq

## [1] 4809.504

Sum of squares of my arbitrary equation:

my.eqn = -30 + 0.05*body$AGE - 0.013*body$WEIGHT - 0.2*body$HEIGHT - 0.8*body$CHEST + 0.0009*body$ABDOMEN - 0.03*body$HIP + 0.6*body$THIGH 
my.sum.sq = sum((body$BODYFAT - my.eqn)^2)
my.sum.sq

## [1] 3153233

Correlation between chest circumeference and abdomen circumference

cor(body$CHEST, body$ABDOMEN)

## [1] 0.9158277

Another regression for the bodyfat dataset (we dropped the variables Abdomen and Hip)

ft1 = lm(BODYFAT ~ AGE + WEIGHT + HEIGHT + CHEST + THIGH, data = body) 
summary(ft1)

## 
## Call:
## lm(formula = BODYFAT ~ AGE + WEIGHT + HEIGHT + CHEST + THIGH, 
##     data = body)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.4106  -3.8409  -0.1898   3.6800  15.0222 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -45.22628   13.79574  -3.278 0.001195 ** 
## AGE           0.15899    0.03271   4.860 2.09e-06 ***
## WEIGHT       -0.02991    0.04384  -0.682 0.495714    
## HEIGHT       -0.31266    0.11466  -2.727 0.006854 ** 
## CHEST         0.52668    0.10763   4.893 1.79e-06 ***
## THIGH         0.52895    0.15701   3.369 0.000876 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.503 on 246 degrees of freedom
## Multiple R-squared:  0.5762, Adjusted R-squared:  0.5676 
## F-statistic: 66.89 on 5 and 246 DF,  p-value: < 2.2e-16

Regression for bodyfat percentage using only age and chest circumference

ft2 = lm(BODYFAT ~ AGE + CHEST, data = body) 
summary(ft2)

## 
## Call:
## lm(formula = BODYFAT ~ AGE + CHEST, data = body)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.4657  -4.3271   0.1406   3.9607  14.9866 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -53.27135    4.43061 -12.023  < 2e-16 ***
## AGE           0.11479    0.02954   3.886 0.000131 ***
## CHEST         0.66720    0.04416  15.109  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.805 on 249 degrees of freedom
## Multiple R-squared:  0.5226, Adjusted R-squared:  0.5188 
## F-statistic: 136.3 on 2 and 249 DF,  p-value: < 2.2e-16

Plotting the 3d scatterplot along with the fitted regression PLANE

library(scatterplot3d)

## Warning: package 'scatterplot3d' was built under R version 3.3.2

sp = scatterplot3d(body$AGE, body$CHEST, body$BODYFAT)
sp$plane3d(ft2, lty.box = "solid", draw_lines = TRUE, draw_polygon = TRUE, col = "red")

plot of chunk unnamed-chunk-51

Prediction using the regression equation

bf.pred = -37.48 + 0.01202*30 - 0.1392*180 - 0.1028*70 - 0.0008312*90 + 0.9685*86 - 0.1834*97 + 0.2857*60
bf.pred

## [1] 13.19699

First observation (row) of the bodyfat dataset

obs1 = body[1,]
obs1

##   BODYFAT AGE WEIGHT HEIGHT CHEST ABDOMEN  HIP THIGH
## 1    12.3  23 154.25  67.75  93.1    85.2 94.5    59

Fitted value for the first obsevation

-37.48 + 0.01202*body[1,"AGE"] - 0.1392*body[1,"WEIGHT"] - 0.1028*body[1,"HEIGHT"] - 0.0008312*body[1, "CHEST"] + 0.9685*body[1,"ABDOMEN"] - 0.1834*body[1, "HIP"] + 0.2857*body[1, "THIGH"]

## [1] 16.32398

Fitted values in R

ft = lm(BODYFAT ~ AGE + WEIGHT + HEIGHT + CHEST + ABDOMEN + HIP + THIGH, data = body)
names(ft)

##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "xlevels"       "call"          "terms"         "model"

head(ft$fitted.values)

##        1        2        3        4        5        6 
## 16.32670 10.22019 18.42600 11.89502 25.97564 16.28529

Plotting fitted values against response

plot(ft$fitted.values, body$BODYFAT, xlab = "Fitted Values", ylab = "Bodyfat Percentage")

plot of chunk unnamed-chunk-56

Coefficient of Determination (or Multiple R²⁾

ft = lm(BODYFAT ~ AGE + WEIGHT + HEIGHT + CHEST + ABDOMEN + HIP + THIGH, data = body)
Rsquared = (cor(body$BODYFAT, ft$fitted.values))^2
Rsquared

## [1] 0.7265596

R² in lm summary

ft = lm(BODYFAT ~ AGE + WEIGHT + HEIGHT + CHEST + ABDOMEN + HIP + THIGH, data = body)
summary(ft)

## 
## Call:
## lm(formula = BODYFAT ~ AGE + WEIGHT + HEIGHT + CHEST + ABDOMEN + 
##     HIP + THIGH, data = body)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.0729  -3.2387  -0.0782   3.0623  10.3611 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.748e+01  1.449e+01  -2.585  0.01031 *  
## AGE          1.202e-02  2.934e-02   0.410  0.68246    
## WEIGHT      -1.392e-01  4.509e-02  -3.087  0.00225 ** 
## HEIGHT      -1.028e-01  9.787e-02  -1.051  0.29438    
## CHEST       -8.312e-04  9.989e-02  -0.008  0.99337    
## ABDOMEN      9.685e-01  8.531e-02  11.352  < 2e-16 ***
## HIP         -1.834e-01  1.448e-01  -1.267  0.20648    
## THIGH        2.857e-01  1.362e-01   2.098  0.03693 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.438 on 244 degrees of freedom
## Multiple R-squared:  0.7266, Adjusted R-squared:  0.7187 
## F-statistic: 92.62 on 7 and 244 DF,  p-value: < 2.2e-16

Residual for the first observation

ft = lm(BODYFAT ~ AGE + WEIGHT + HEIGHT + CHEST + ABDOMEN + HIP + THIGH, data = body)
res.1 = body[1,"BODYFAT"] - ft$fitted.values[1]
res.1

##         1 
## -4.026695

Residuals via lm

ft = lm(BODYFAT ~ AGE + WEIGHT + HEIGHT + CHEST + ABDOMEN + HIP + THIGH, data = body)
names(ft)

##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "xlevels"       "call"          "terms"         "model"

head(ft$residuals)

##         1         2         3         4         5         6 
## -4.026695 -4.120189  6.874004 -1.495017  2.724355  4.614712

Residuals plotted against the fitted values

ft = lm(BODYFAT ~ AGE + WEIGHT + HEIGHT + CHEST + ABDOMEN + HIP + THIGH, data = body)
plot(ft$fitted.values, ft$residuals, xlab = "Fitted Values", ylab = "Residuals")

plot of chunk unnamed-chunk-61

Residuals plotted against the explanatory variables

par(mfrow = c(3, 3))
for(i in 2:8)
{
    plot(body[,i], ft$residuals, xlab = names(body)[i], ylab = "Residuals")
    }
par(mfrow = c(1, 1))

plot of chunk unnamed-chunk-62

# Note here the presence of the outliers.

Residuals are uncorrelated with every explanatory variable

ft = lm(BODYFAT ~ AGE + WEIGHT + HEIGHT + CHEST + ABDOMEN + HIP + THIGH, data = body)
cor(ft$residuals, body$AGE)

## [1] -1.754044e-17

cor(ft$residuals, body$WEIGHT)

## [1] 4.71057e-17

cor(ft$residuals, body$HEIGHT)

## [1] -1.720483e-15

cor(ft$residuals, body$CHEST)

## [1] -4.672628e-16

cor(ft$residuals, body$ABDOMEN)

## [1] -7.012368e-16

cor(ft$residuals, body$HIP)

## [1] -8.493675e-16

cor(ft$residuals, body$THIGH)

## [1] -5.509094e-16

Residuals have mean zero.

ft = lm(BODYFAT ~ AGE + WEIGHT + HEIGHT + CHEST + ABDOMEN + HIP + THIGH, data = body)
mean(ft$residuals)

## [1] 2.467747e-16

Fitting a linear model to residuals:

ft = lm(BODYFAT ~ AGE + WEIGHT + HEIGHT + CHEST + ABDOMEN + HIP + THIGH, data = body)
m.res = lm(ft$residuals ~ body$AGE + body$WEIGHT + body$HEIGHT + body$CHEST + body$ABDOMEN + body$HIP + body$THIGH)
summary(m.res)

## 
## Call:
## lm(formula = ft$residuals ~ body$AGE + body$WEIGHT + body$HEIGHT + 
##     body$CHEST + body$ABDOMEN + body$HIP + body$THIGH)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.0729  -3.2387  -0.0782   3.0623  10.3611 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)
## (Intercept)   2.154e-14  1.449e+01       0        1
## body$AGE      1.282e-17  2.934e-02       0        1
## body$WEIGHT   1.057e-16  4.509e-02       0        1
## body$HEIGHT  -1.509e-16  9.787e-02       0        1
## body$CHEST    1.180e-16  9.989e-02       0        1
## body$ABDOMEN -2.452e-16  8.531e-02       0        1
## body$HIP     -1.284e-16  1.448e-01       0        1
## body$THIGH   -1.090e-16  1.362e-01       0        1
## 
## Residual standard error: 4.438 on 244 degrees of freedom
## Multiple R-squared:  6.384e-32,  Adjusted R-squared:  -0.02869 
## F-statistic: 2.225e-30 on 7 and 244 DF,  p-value: 1

Residual Sum of Squares

ft = lm(BODYFAT ~ AGE + WEIGHT + HEIGHT + CHEST + ABDOMEN + HIP + THIGH, data = body)
rss.ft = sum((ft$residuals)^2)
rss.ft

## [1] 4806.806

Residual Sum of Squares and R²

ft = lm(BODYFAT ~ AGE + WEIGHT + HEIGHT + CHEST + ABDOMEN + HIP + THIGH, data = body)
rss.ft = sum((ft$residuals)^2)
rss.ft

## [1] 4806.806

tss = sum(((body$BODYFAT) - mean(body$BODYFAT))^2)
1 - (rss.ft/tss)

## [1] 0.7265596

summary(ft)

## 
## Call:
## lm(formula = BODYFAT ~ AGE + WEIGHT + HEIGHT + CHEST + ABDOMEN + 
##     HIP + THIGH, data = body)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.0729  -3.2387  -0.0782   3.0623  10.3611 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.748e+01  1.449e+01  -2.585  0.01031 *  
## AGE          1.202e-02  2.934e-02   0.410  0.68246    
## WEIGHT      -1.392e-01  4.509e-02  -3.087  0.00225 ** 
## HEIGHT      -1.028e-01  9.787e-02  -1.051  0.29438    
## CHEST       -8.312e-04  9.989e-02  -0.008  0.99337    
## ABDOMEN      9.685e-01  8.531e-02  11.352  < 2e-16 ***
## HIP         -1.834e-01  1.448e-01  -1.267  0.20648    
## THIGH        2.857e-01  1.362e-01   2.098  0.03693 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.438 on 244 degrees of freedom
## Multiple R-squared:  0.7266, Adjusted R-squared:  0.7187 
## F-statistic: 92.62 on 7 and 244 DF,  p-value: < 2.2e-16

RSS after removing the variable Abdomen

ft.1 = lm(BODYFAT ~ AGE + WEIGHT + HEIGHT + CHEST + HIP + THIGH, data = body)
rss.ft1 = sum((ft.1$residuals)^2)
rss.ft1

## [1] 7345.724

Compare this to the RSS of the full regression.

rss.ft

## [1] 4806.806

RSS after removing the variable CHEST

ft.2 = lm(BODYFAT ~ AGE + WEIGHT + HEIGHT + ABDOMEN + HIP + THIGH, data = body)
rss.ft2 = sum((ft.2$residuals)^2)
rss.ft2

## [1] 4806.808

Compare this to the RSS of the full regression.

rss.ft

## [1] 4806.806

R2 after removing the variable Abdomen

ft.1 = lm(BODYFAT ~ AGE + WEIGHT + HEIGHT + CHEST + HIP + THIGH, data = body)
R2.ft1 = (cor(body$BODYFAT, ft.1$fitted.values))^2
R2.ft1

## [1] 0.5821305

Compare this to the RSS of the full regression.

R2.ft  = (cor(body$BODYFAT, ft$fitted.values))^2
R2.ft

## [1] 0.7265596

R2 after removing the variable chest

ft.2 = lm(BODYFAT ~ AGE + WEIGHT + HEIGHT + ABDOMEN + HIP + THIGH, data = body)
R2.ft2 = (cor(body$BODYFAT, ft.2$fitted.values))^2
R2.ft2

## [1] 0.7265595

Compare this to the RSS of the full regression.

R2.ft

## [1] 0.7265596

Residual Degrees of Freedom

ft = lm(BODYFAT ~ AGE + WEIGHT + HEIGHT + CHEST + ABDOMEN + HIP + THIGH, data = body)
n = nrow(body)
p = 7
rs.df = n-p-1
rs.df

## [1] 244

Residual Standard Error

ft = lm(BODYFAT ~ AGE + WEIGHT + HEIGHT + CHEST + ABDOMEN + HIP + THIGH, data = body)
rss = sum((ft$residuals)^2)
rse = sqrt(rss/rs.df)
rse

## [1] 4.438471

Summary (look for residual degrees of freedom and residual sum of squares)

summary(ft)

## 
## Call:
## lm(formula = BODYFAT ~ AGE + WEIGHT + HEIGHT + CHEST + ABDOMEN + 
##     HIP + THIGH, data = body)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.0729  -3.2387  -0.0782   3.0623  10.3611 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.748e+01  1.449e+01  -2.585  0.01031 *  
## AGE          1.202e-02  2.934e-02   0.410  0.68246    
## WEIGHT      -1.392e-01  4.509e-02  -3.087  0.00225 ** 
## HEIGHT      -1.028e-01  9.787e-02  -1.051  0.29438    
## CHEST       -8.312e-04  9.989e-02  -0.008  0.99337    
## ABDOMEN      9.685e-01  8.531e-02  11.352  < 2e-16 ***
## HIP         -1.834e-01  1.448e-01  -1.267  0.20648    
## THIGH        2.857e-01  1.362e-01   2.098  0.03693 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.438 on 244 degrees of freedom
## Multiple R-squared:  0.7266, Adjusted R-squared:  0.7187 
## F-statistic: 92.62 on 7 and 244 DF,  p-value: < 2.2e-16

https://collegescorecard.ed.gov/assets/CollegeScorecardDataDictionary-10-26-2016.xlsx https://collegescorecard.ed.gov/assets/FullDataDocumentation.pdf Set the right working directory

scorecard <- read.csv(file.path(dataDir, "college_short.csv"))

Number of observations and Variables

dim(scorecard)

## [1] 1241    9

Variable Names

names(scorecard)

## [1] "SAT_AVG_ALL"    "AVGFACSAL"      "TUITIONFEE_IN"  "TUITIONFEE_OUT"
## [5] "UGDS"           "RET_FT4"        "PCTFLOAN"       "PFTFAC"        
## [9] "TYPE"

Fitting a regression equation in the usual way.

req.bad = lm(RET_FT4~ TUITIONFEE_OUT + TYPE, data = scorecard)
summary(req.bad)

## 
## Call:
## lm(formula = RET_FT4 ~ TUITIONFEE_OUT + TYPE, data = scorecard)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.69041 -0.04915  0.00516  0.05554  0.33165 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     6.661e-01  9.265e-03   71.90   <2e-16 ***
## TUITIONFEE_OUT  9.405e-06  3.022e-07   31.12   <2e-16 ***
## TYPE           -8.898e-02  5.741e-03  -15.50   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08741 on 1238 degrees of freedom
## Multiple R-squared:  0.4391, Adjusted R-squared:  0.4382 
## F-statistic: 484.5 on 2 and 1238 DF,  p-value: < 2.2e-16

How can the coefficients be interpreted here? Is type being treated as a numeric variable by R?

is.numeric(scorecard$TYPE)

## [1] TRUE

Regression equation:

req = lm(RET_FT4~ TUITIONFEE_OUT + as.factor(TYPE), data = scorecard)
summary(req)

## 
## Call:
## lm(formula = RET_FT4 ~ TUITIONFEE_OUT + as.factor(TYPE), data = scorecard)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.68856 -0.04910  0.00505  0.05568  0.33150 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       5.765e-01  7.257e-03  79.434  < 2e-16 ***
## TUITIONFEE_OUT    9.494e-06  3.054e-07  31.090  < 2e-16 ***
## as.factor(TYPE)2 -9.204e-02  5.948e-03 -15.474  < 2e-16 ***
## as.factor(TYPE)3 -1.218e-01  3.116e-02  -3.909 9.75e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08732 on 1237 degrees of freedom
## Multiple R-squared:  0.4408, Adjusted R-squared:  0.4394 
## F-statistic:   325 on 3 and 1237 DF,  p-value: < 2.2e-16

Plotting the three regression lines in one plot:

plot(RET_FT4 ~ TUITIONFEE_OUT, data = scorecard, xlab = "Tuition Fee (out of state)", ylab = "Retention Rate", type = "n") 
w1 = (scorecard$TYPE == 1)
points(RET_FT4[w1] ~ TUITIONFEE_OUT[w1], data = scorecard, col = "blue")
abline(c(req$coefficients[1], req$coefficients[2]), col = "blue")
w2 = (scorecard$TYPE == 2)
points(RET_FT4[w2] ~ TUITIONFEE_OUT[w2], data = scorecard, col = "red")
abline(c(req$coefficients[1] + req$coefficients[3], req$coefficients[2]), col = "red")
w3 = (scorecard$TYPE == 3)
points(RET_FT4[w3] ~ TUITIONFEE_OUT[w3], data = scorecard, col = "green")
abline(c(req$coefficients[1] + req$coefficients[4], req$coefficients[2]), col = "green")

plot of chunk unnamed-chunk-85

Regression equation with interaction:

req.1 = lm(RET_FT4~ TUITIONFEE_OUT + as.factor(TYPE) + TUITIONFEE_OUT : as.factor(TYPE), data = scorecard) 
summary(req.1)

## 
## Call:
## lm(formula = RET_FT4 ~ TUITIONFEE_OUT + as.factor(TYPE) + TUITIONFEE_OUT:as.factor(TYPE), 
##     data = scorecard)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.68822 -0.04982  0.00491  0.05555  0.32900 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      5.814e-01  1.405e-02  41.372  < 2e-16 ***
## TUITIONFEE_OUT                   9.240e-06  6.874e-07  13.441  < 2e-16 ***
## as.factor(TYPE)2                -9.830e-02  1.750e-02  -5.617  2.4e-08 ***
## as.factor(TYPE)3                -2.863e-01  1.568e-01  -1.826   0.0681 .  
## TUITIONFEE_OUT:as.factor(TYPE)2  2.988e-07  7.676e-07   0.389   0.6971    
## TUITIONFEE_OUT:as.factor(TYPE)3  7.215e-06  6.716e-06   1.074   0.2829    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08734 on 1235 degrees of freedom
## Multiple R-squared:  0.4413, Adjusted R-squared:  0.4391 
## F-statistic: 195.1 on 5 and 1235 DF,  p-value: < 2.2e-16

Regression equation with interaction (another equivalent way):

req.2 = lm(RET_FT4~ TUITIONFEE_OUT * as.factor(TYPE), data = scorecard) 
summary(req.2)

## 
## Call:
## lm(formula = RET_FT4 ~ TUITIONFEE_OUT * as.factor(TYPE), data = scorecard)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.68822 -0.04982  0.00491  0.05555  0.32900 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      5.814e-01  1.405e-02  41.372  < 2e-16 ***
## TUITIONFEE_OUT                   9.240e-06  6.874e-07  13.441  < 2e-16 ***
## as.factor(TYPE)2                -9.830e-02  1.750e-02  -5.617  2.4e-08 ***
## as.factor(TYPE)3                -2.863e-01  1.568e-01  -1.826   0.0681 .  
## TUITIONFEE_OUT:as.factor(TYPE)2  2.988e-07  7.676e-07   0.389   0.6971    
## TUITIONFEE_OUT:as.factor(TYPE)3  7.215e-06  6.716e-06   1.074   0.2829    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08734 on 1235 degrees of freedom
## Multiple R-squared:  0.4413, Adjusted R-squared:  0.4391 
## F-statistic: 195.1 on 5 and 1235 DF,  p-value: < 2.2e-16

Plotting the three regression lines in one plot:

plot(RET_FT4 ~ TUITIONFEE_OUT, data = scorecard, xlab = "Tuition Fee (out of state)", ylab = "Retention Rate", type = "n") 
w1 = (scorecard$TYPE == 1)
points(RET_FT4[w1] ~ TUITIONFEE_OUT[w1], data = scorecard, col = "blue")
abline(c(req.1$coefficients[1], req.1$coefficients[2]), col = "blue")
w2 = (scorecard$TYPE == 2)
points(RET_FT4[w2] ~ TUITIONFEE_OUT[w2], data = scorecard, col = "red")
abline(c(req.1$coefficients[1] + req.1$coefficients[3], req.1$coefficients[2] + req.1$coefficients[5]), col = "red")
w3 = (scorecard$TYPE == 3)
points(RET_FT4[w3] ~ TUITIONFEE_OUT[w3], data = scorecard, col = "green")
abline(c(req.1$coefficients[1] + req.1$coefficients[4], req.1$coefficients[2] + req.1$coefficients[6]), col = "green")

plot of chunk unnamed-chunk-88

Dataset can also be found here \url{https://archive.ics.uci.edu/ml/datasets/Bike+Sharing+Dataset} Set the right working directory Read the data

bike = read.csv(file.path(dataDir, "BikeSharingDataset.csv"))

dim(bike)

## [1] 17379    17

names(bike)

##  [1] "instant"    "dteday"     "season"     "yr"         "mnth"      
##  [6] "hr"         "holiday"    "weekday"    "workingday" "weathersit"
## [11] "temp"       "atemp"      "hum"        "windspeed"  "casual"    
## [16] "registered" "cnt"

head(bike)

##   instant     dteday season yr mnth hr holiday weekday workingday
## 1       1 2011-01-01      1  0    1  0       0       6          0
## 2       2 2011-01-01      1  0    1  1       0       6          0
## 3       3 2011-01-01      1  0    1  2       0       6          0
## 4       4 2011-01-01      1  0    1  3       0       6          0
## 5       5 2011-01-01      1  0    1  4       0       6          0
## 6       6 2011-01-01      1  0    1  5       0       6          0
##   weathersit temp  atemp  hum windspeed casual registered cnt
## 1          1 0.24 0.2879 0.81    0.0000      3         13  16
## 2          1 0.22 0.2727 0.80    0.0000      8         32  40
## 3          1 0.22 0.2727 0.80    0.0000      5         27  32
## 4          1 0.24 0.2879 0.75    0.0000      3         10  13
## 5          1 0.24 0.2879 0.75    0.0000      0          1   1
## 6          2 0.24 0.2576 0.75    0.0896      0          1   1

Summary Statistics

summary(bike$atemp)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.3333  0.4848  0.4758  0.6212  1.0000

summary(as.factor(bike$workingday))

##     0     1 
##  5514 11865

summary(as.factor(bike$weathersit))

##     1     2     3     4 
## 11413  4544  1419     3

#Regression Equation
md1 = lm(casual ~ atemp + as.factor(workingday) +
as.factor(weathersit), data = bike)
summary(md1)

## 
## Call:
## lm(formula = casual ~ atemp + as.factor(workingday) + as.factor(weathersit), 
##     data = bike)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -100.104  -24.209   -3.655   14.379  292.104 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             -1.4416     1.0366  -1.391    0.164    
## atemp                  132.7107     1.8076  73.418  < 2e-16 ***
## as.factor(workingday)1 -34.1359     0.6642 -51.393  < 2e-16 ***
## as.factor(weathersit)2  -5.5858     0.7156  -7.805 6.27e-15 ***
## as.factor(weathersit)3 -15.3972     1.1488 -13.403  < 2e-16 ***
## as.factor(weathersit)4   2.0619    23.4727   0.088    0.930    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 40.64 on 17373 degrees of freedom
## Multiple R-squared:  0.3208, Adjusted R-squared:  0.3206 
## F-statistic:  1641 on 5 and 17373 DF,  p-value: < 2.2e-16

#Regression Equation
md2 = lm(casual ~ atemp + as.factor(workingday) +
as.factor(weathersit) + as.factor(workingday) : as.factor(weathersit),
data = bike) 
summary(md2)

## 
## Call:
## lm(formula = casual ~ atemp + as.factor(workingday) + as.factor(weathersit) + 
##     as.factor(workingday):as.factor(weathersit), data = bike)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -102.67  -24.25   -3.42   14.37  289.49 
## 
## Coefficients:
##                                               Estimate Std. Error t value
## (Intercept)                                     1.2457     1.0820   1.151
## atemp                                         132.5508     1.8039  73.480
## as.factor(workingday)1                        -38.0378     0.8073 -47.119
## as.factor(weathersit)2                        -12.8922     1.2866 -10.021
## as.factor(weathersit)3                        -27.2829     2.1876 -12.471
## as.factor(weathersit)4                        -18.3256    40.5639  -0.452
## as.factor(workingday)1:as.factor(weathersit)2  10.5834     1.5432   6.858
## as.factor(workingday)1:as.factor(weathersit)3  16.5474     2.5635   6.455
## as.factor(workingday)1:as.factor(weathersit)4  30.4970    49.6749   0.614
##                                               Pr(>|t|)    
## (Intercept)                                      0.250    
## atemp                                          < 2e-16 ***
## as.factor(workingday)1                         < 2e-16 ***
## as.factor(weathersit)2                         < 2e-16 ***
## as.factor(weathersit)3                         < 2e-16 ***
## as.factor(weathersit)4                           0.651    
## as.factor(workingday)1:as.factor(weathersit)2 7.22e-12 ***
## as.factor(workingday)1:as.factor(weathersit)3 1.11e-10 ***
## as.factor(workingday)1:as.factor(weathersit)4    0.539    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 40.55 on 17370 degrees of freedom
## Multiple R-squared:  0.3238, Adjusted R-squared:  0.3235 
## F-statistic:  1040 on 8 and 17370 DF,  p-value: < 2.2e-16

#Regression Equation
md3 = lm(casual ~ atemp + as.factor(workingday) +
as.factor(workingday) : atemp + as.factor(weathersit), data = bike)  
summary(md3)

## 
## Call:
## lm(formula = casual ~ atemp + as.factor(workingday) + as.factor(workingday):atemp + 
##     as.factor(weathersit), data = bike)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -126.311  -19.432   -3.966   12.335  290.198 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   -40.4566     1.5124 -26.751  < 2e-16 ***
## atemp                         217.1180     3.0094  72.147  < 2e-16 ***
## as.factor(workingday)1         25.3616     1.8420  13.768  < 2e-16 ***
## as.factor(weathersit)2         -5.4654     0.6924  -7.894 3.11e-15 ***
## as.factor(weathersit)3        -15.5520     1.1115 -13.992  < 2e-16 ***
## as.factor(weathersit)4          3.5877    22.7098   0.158    0.874    
## atemp:as.factor(workingday)1 -126.9260     3.6827 -34.465  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 39.32 on 17372 degrees of freedom
## Multiple R-squared:  0.3643, Adjusted R-squared:  0.3641 
## F-statistic:  1659 on 6 and 17372 DF,  p-value: < 2.2e-16

This is a dataset from the book “Introductory Econometrics: A Modern Approach” by Wooldridge.

load(file.path(dataDir, "wage1.Rdata"))
wages = data
wages.desc = desc

This gives data on wage (hourly earning) and various other variables; data collected from 1976.

head(wages)

##   wage educ exper tenure nonwhite female married numdep smsa northcen
## 1 3.10   11     2      0        0      1       0      2    1        0
## 2 3.24   12    22      2        0      1       1      3    1        0
## 3 3.00   11     2      0        0      0       0      2    0        0
## 4 6.00    8    44     28        0      0       1      0    1        0
## 5 5.30   12     7      2        0      0       1      1    0        0
## 6 8.75   16     9      8        0      0       1      0    1        0
##   south west construc ndurman trcommpu trade services profserv profocc
## 1     0    1        0       0        0     0        0        0       0
## 2     0    1        0       0        0     0        1        0       0
## 3     0    1        0       0        0     1        0        0       0
## 4     0    1        0       0        0     0        0        0       0
## 5     0    1        0       0        0     0        0        0       0
## 6     0    1        0       0        0     0        0        1       1
##   clerocc servocc    lwage expersq tenursq
## 1       0       0 1.131402       4       0
## 2       0       1 1.175573     484       4
## 3       0       0 1.098612       4       0
## 4       1       0 1.791759    1936     784
## 5       0       0 1.667707      49       4
## 6       0       0 2.169054      81      64

dim(wages)

## [1] 526  24

wages.desc

##    variable                           label
## 1      wage         average hourly earnings
## 2      educ              years of education
## 3     exper      years potential experience
## 4    tenure     years with current employer
## 5  nonwhite                  =1 if nonwhite
## 6    female                    =1 if female
## 7   married                   =1 if married
## 8    numdep            number of dependents
## 9      smsa              =1 if live in SMSA
## 10 northcen =1 if live in north central U.S
## 11    south   =1 if live in southern region
## 12     west    =1 if live in western region
## 13 construc  =1 if work in construc. indus.
## 14  ndurman  =1 if in nondur. manuf. indus.
## 15 trcommpu  =1 if in trans, commun, pub ut
## 16    trade    =1 if in wholesale or retail
## 17 services        =1 if in services indus.
## 18 profserv     =1 if in prof. serv. indus.
## 19  profocc    =1 if in profess. occupation
## 20  clerocc    =1 if in clerical occupation
## 21  servocc     =1 if in service occupation
## 22    lwage                       log(wage)
## 23  expersq                         exper^2
## 24  tenursq                        tenure^2

This is a dataset from the book “Introductory Econometrics: A Modern Approach” by Wooldridge.

load(file.path(dataDir, "campus.Rdata"))
campus = data
campus.desc = desc

This gives data for 97 colleges and universities for the year 1992.

summary(campus)

##      enroll           priv            police          crime       
##  Min.   : 1799   Min.   :0.0000   Min.   : 1.00   Min.   :   1.0  
##  1st Qu.: 6485   1st Qu.:0.0000   1st Qu.: 9.00   1st Qu.:  85.0  
##  Median :11990   Median :0.0000   Median :16.00   Median : 187.0  
##  Mean   :16076   Mean   :0.1237   Mean   :20.49   Mean   : 394.5  
##  3rd Qu.:21836   3rd Qu.:0.0000   3rd Qu.:27.00   3rd Qu.: 491.0  
##  Max.   :56350   Max.   :1.0000   Max.   :74.00   Max.   :2052.0  
##      lcrime         lenroll          lpolice     
##  Min.   :0.000   Min.   : 7.495   Min.   :0.000  
##  1st Qu.:4.443   1st Qu.: 8.777   1st Qu.:2.197  
##  Median :5.231   Median : 9.392   Median :2.773  
##  Mean   :5.277   Mean   : 9.379   Mean   :2.731  
##  3rd Qu.:6.196   3rd Qu.: 9.991   3rd Qu.:3.296  
##  Max.   :7.627   Max.   :10.939   Max.   :4.304

This is a dataset from the book “Introductory Econometrics: A Modern Approach” by Wooldridge.

load(file.path(dataDir, "twoyear.Rdata"))
ty = data
ty.desc = desc
head(ty)

##   female phsrank BA AA black hispanic  id exper        jc     univ
## 1      1      65  0  0     0        0  19   161 0.0000000 0.000000
## 2      1      97  0  0     0        0  93   119 0.0000000 7.033333
## 3      1      44  0  0     0        0  96    81 0.0000000 0.000000
## 4      1      34  0  0     0        1 119    39 0.2666667 0.000000
## 5      1      80  0  0     0        0 132   141 0.0000000 0.000000
## 6      0      59  0  0     0        0 156   165 0.0000000 0.000000
##      lwage     stotal smcity medcity submed lgcity sublg vlgcity subvlg ne
## 1 1.925291 -0.4417497      0       0      0      0     1       0      0  1
## 2 2.796494  0.0000000      1       0      0      0     0       0      0  0
## 3 1.625600 -1.3570027      0       0      0      0     1       0      0  1
## 4 2.223312 -0.1900551      1       0      0      0     0       0      0  0
## 5 1.642083  0.0000000      0       0      0      0     0       0      0  0
## 6 2.079442  1.3887565      1       0      0      0     0       0      0  0
##   nc south   totcoll
## 1  0     0 0.0000000
## 2  1     0 7.0333333
## 3  0     0 0.0000000
## 4  0     0 0.2666667
## 5  0     1 0.0000000
## 6  0     1 0.0000000

Data is collected for a sample of working people with a high school degree. One objective of analyzing this data is to comare the returns to education at junior colleges an four-year colleges (universities). This is a dataset from the book “Introductory Econometrics: A Modern Approach” by Wooldridge.

load(file.path(dataDir, "mlb1.Rdata"))
bb = data
bb.desc = desc
bb.desc

##    variable                      label
## 1    salary         1993 season salary
## 2   teamsal               team payroll
## 3        nl      =1 if national league
## 4     years     years in major leagues
## 5     games        career games played
## 6    atbats             career at bats
## 7      runs         career runs scored
## 8      hits                career hits
## 9   doubles             career doubles
## 10  triples             career triples
## 11    hruns           career home runs
## 12     rbis      career runs batted in
## 13     bavg     career batting average
## 14       bb               career walks
## 15       so         career strike outs
## 16   sbases        career stolen bases
## 17  fldperc       career fielding perc
## 18 frstbase          = 1 if first base
## 19 scndbase          =1 if second base
## 20 shrtstop            =1 if shortstop
## 21 thrdbase           =1 if third base
## 22 outfield             =1 if outfield
## 23  catcher              =1 if catcher
## 24 yrsallst          years as all-star
## 25   hispan             =1 if hispanic
## 26    black                =1 if black
## 27 whitepop         white pop. in city
## 28 blackpop         black pop. in city
## 29  hisppop      hispanic pop. in city
## 30    pcinc     city per capita income
## 31  gamesyr   games per year in league
## 32  hrunsyr         home runs per year
## 33 atbatsyr           at bats per year
## 34  allstar perc. of years an all-star
## 35  slugavg    career slugging average
## 36   rbisyr              rbis per year
## 37 sbasesyr      stolen bases per year
## 38   runsyr       runs scored per year
## 39 percwhte      percent white in city
## 40 percblck      percent black in city
## 41 perchisp   percent hispanic in city
## 42   blckpb             black*percblck
## 43   hispph            hispan*perchisp
## 44   whtepw             white*percwhte
## 45   blckph             black*perchisp
## 46   hisppb            hispan*percblck
## 47  lsalary                log(salary)

head(bb)

##    salary  teamsal nl years games atbats runs hits doubles triples hruns
## 1 6329213 38407380  1    12  1705   6705 1076 1939     320      67   231
## 2 3375000 38407380  1     8   918   3333  407  863     156      38    73
## 3 3100000 38407380  1     5   751   2807  370  840     148      18    46
## 4 2900000 38407380  1     8  1056   3337  405  816     143      18   107
## 5 1650000 38407380  1    12  1196   3603  437  928      19      16   124
## 6  700000 38407380  1    17  2032   7489 1136 2145     270     142    40
##   rbis bavg  bb   so sbases fldperc frstbase scndbase shrtstop thrdbase
## 1  836  289 619  948    314     989        0        1        0        0
## 2  342  259 137  582    133     968        0        0        1        0
## 3  355  299 341  228     41     994        1        0        0        0
## 4  421  245 306  653     15     971        0        0        0        1
## 5  541  258 316  725     32     977        0        0        0        0
## 6  574  286 416 1098    660     987        0        0        0        0
##   outfield catcher yrsallst hispan black whitepop blackpop hisppop pcinc
## 1        0       0        9      0     0  5772110  1547725  893422 18840
## 2        0       0        2      0     1  5772110  1547725  893422 18840
## 3        0       0        0      0     0  5772110  1547725  893422 18840
## 4        0       0        0      0     0  5772110  1547725  893422 18840
## 5        1       0        0      0     1  5772110  1547725  893422 18840
## 6        1       0        2      0     1  5772110  1547725  893422 18840
##     gamesyr   hrunsyr atbatsyr  allstar  slugavg   rbisyr  sbasesyr
## 1 142.08333 19.250000 558.7500 75.00000 46.02535 69.66666 26.166666
## 2 114.75000  9.125000 416.6250 25.00000 39.42394 42.75000 16.625000
## 3 150.20000  9.200000 561.4000  0.00000 41.39651 71.00000  8.200000
## 4 132.00000 13.375000 417.1250  0.00000 39.43662 52.62500  1.875000
## 5  99.66666 10.333333 300.2500  0.00000 37.49653 45.08333  2.666667
## 6 119.52941  2.352941 440.5294 11.76471 37.64188 33.76471 38.823528
##     runsyr percwhte percblck perchisp   blckpb hispph   whtepw  blckph
## 1 89.66666 70.27797 18.84423  10.8778  0.00000      0 70.27797  0.0000
## 2 50.87500 70.27797 18.84423  10.8778 18.84423      0  0.00000 10.8778
## 3 74.00000 70.27797 18.84423  10.8778  0.00000      0 70.27797  0.0000
## 4 50.62500 70.27797 18.84423  10.8778  0.00000      0 70.27797  0.0000
## 5 36.41667 70.27797 18.84423  10.8778 18.84423      0  0.00000 10.8778
## 6 66.82353 70.27797 18.84423  10.8778 18.84423      0  0.00000 10.8778
##   hisppb  lsalary
## 1      0 15.66069
## 2      0 15.03191
## 3      0 14.94691
## 4      0 14.88022
## 5      0 14.31629
## 6      0 13.45884

This is a dataset from the book “Introductory Econometrics: A Modern Approach” by Wooldridge.

load(file.path(dataDir, "wage1.Rdata"))
wages = data
wages.desc = desc

This gives data on wage (hourly earning) and various other variables; data collected from 1976.

The model M

M = lm(lwage ~ educ + exper + tenure, data = wages)
summary(M)

## 
## Call:
## lm(formula = lwage ~ educ + exper + tenure, data = wages)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.05802 -0.29645 -0.03265  0.28788  1.42809 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.284360   0.104190   2.729  0.00656 ** 
## educ        0.092029   0.007330  12.555  < 2e-16 ***
## exper       0.004121   0.001723   2.391  0.01714 *  
## tenure      0.022067   0.003094   7.133 3.29e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4409 on 522 degrees of freedom
## Multiple R-squared:  0.316,  Adjusted R-squared:  0.3121 
## F-statistic: 80.39 on 3 and 522 DF,  p-value: < 2.2e-16

The model m

m = lm(lwage ~ educ + tenure, data = wages)
summary(m)

## 
## Call:
## lm(formula = lwage ~ educ + tenure, data = wages)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.10350 -0.29287 -0.04081  0.28672  1.44967 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.404474   0.091696   4.411 1.25e-05 ***
## educ        0.086528   0.006991  12.377  < 2e-16 ***
## tenure      0.025814   0.002680   9.634  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4428 on 523 degrees of freedom
## Multiple R-squared:  0.3085, Adjusted R-squared:  0.3059 
## F-statistic: 116.7 on 2 and 523 DF,  p-value: < 2.2e-16

This is a dataset from the book “Introductory Econometrics: A Modern Approach” by Wooldridge.

load(file.path(dataDir, "campus.Rdata"))
campus = data
campus.desc = desc

This gives data for 97 colleges and universities for the year 1992.

The model M

M = lm(lcrime ~ lenroll + lpolice, data = campus)
summary(M)

## 
## Call:
## lm(formula = lcrime ~ lenroll + lpolice, data = campus)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.6362 -0.2223  0.1047  0.4480  1.7519 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -4.7938     1.1120  -4.311 4.01e-05 ***
## lenroll       0.9235     0.1440   6.414 5.67e-09 ***
## lpolice       0.5164     0.1487   3.473 0.000779 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8467 on 94 degrees of freedom
## Multiple R-squared:  0.632,  Adjusted R-squared:  0.6242 
## F-statistic: 80.72 on 2 and 94 DF,  p-value: < 2.2e-16

The model m

m = lm(lcrime ~ offset(1*lenroll) + lpolice, data = campus)
summary(m)

## 
## Call:
## lm(formula = lcrime ~ offset(1 * lenroll) + lpolice, data = campus)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.5992 -0.2191  0.0912  0.4321  1.7575 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -5.3621     0.3041  -17.63  < 2e-16 ***
## lpolice       0.4617     0.1069    4.32 3.83e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8435 on 95 degrees of freedom
## Multiple R-squared:  0.6413, Adjusted R-squared:  0.6375 
## F-statistic: 169.8 on 1 and 95 DF,  p-value: < 2.2e-16

This is a dataset from the book “Introductory Econometrics: A Modern Approach” by Wooldridge.

load(file.path(dataDir, "twoyear.Rdata"))
ty = data
ty.desc = desc

M = lm(lwage ~ jc + univ + exper, data = ty)
summary(M)

## 
## Call:
## lm(formula = lwage ~ jc + univ + exper, data = ty)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.10362 -0.28132  0.00551  0.28518  1.78167 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.4723256  0.0210602  69.910   <2e-16 ***
## jc          0.0666967  0.0068288   9.767   <2e-16 ***
## univ        0.0768762  0.0023087  33.298   <2e-16 ***
## exper       0.0049442  0.0001575  31.397   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4301 on 6759 degrees of freedom
## Multiple R-squared:  0.2224, Adjusted R-squared:  0.2221 
## F-statistic: 644.5 on 3 and 6759 DF,  p-value: < 2.2e-16

The model m

m = lm(lwage ~ I(jc + univ) + exper, data = ty)
summary(m)

## 
## Call:
## lm(formula = lwage ~ I(jc + univ) + exper, data = ty)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.09708 -0.28069  0.00532  0.28324  1.78332 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.4719702  0.0210606   69.89   <2e-16 ***
## I(jc + univ) 0.0761563  0.0022562   33.75   <2e-16 ***
## exper        0.0049323  0.0001573   31.36   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4302 on 6760 degrees of freedom
## Multiple R-squared:  0.2222, Adjusted R-squared:  0.222 
## F-statistic: 965.6 on 2 and 6760 DF,  p-value: < 2.2e-16

This is a dataset from the book “Introductory Econometrics: A Modern Approach” by Wooldridge.

load(file.path(dataDir, "mlb1.Rdata"))
bb = data
bb.desc = desc

The model M

M = lm(lsalary ~ years + gamesyr + bavg + hrunsyr + rbisyr, data = bb)
summary(M)

## 
## Call:
## lm(formula = lsalary ~ years + gamesyr + bavg + hrunsyr + rbisyr, 
##     data = bb)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -3.02508 -0.45034 -0.04013  0.47014  2.68924 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.119e+01  2.888e-01  38.752  < 2e-16 ***
## years       6.886e-02  1.211e-02   5.684 2.79e-08 ***
## gamesyr     1.255e-02  2.647e-03   4.742 3.09e-06 ***
## bavg        9.786e-04  1.104e-03   0.887    0.376    
## hrunsyr     1.443e-02  1.606e-02   0.899    0.369    
## rbisyr      1.077e-02  7.175e-03   1.500    0.134    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7266 on 347 degrees of freedom
## Multiple R-squared:  0.6278, Adjusted R-squared:  0.6224 
## F-statistic: 117.1 on 5 and 347 DF,  p-value: < 2.2e-16

The model m

m = lm(lsalary ~ years + gamesyr, data = bb)
summary(m)

## 
## Call:
## lm(formula = lsalary ~ years + gamesyr, data = bb)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.66858 -0.46412 -0.01176  0.49219  2.68829 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 11.223804   0.108312 103.625  < 2e-16 ***
## years        0.071318   0.012505   5.703  2.5e-08 ***
## gamesyr      0.020174   0.001343  15.023  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7527 on 350 degrees of freedom
## Multiple R-squared:  0.5971, Adjusted R-squared:  0.5948 
## F-statistic: 259.3 on 2 and 350 DF,  p-value: < 2.2e-16

Plotting the density of the \(F\)-distribution Choose first the degrees of freedom:

d1 = 5
d2 = 30
xx = seq(0, 10, 0.01)
plot(xx, df(xx, d1, d2), type = "l", xlab = "x", ylab = "F-density")

plot of chunk unnamed-chunk-115

This is a dataset from the book “Introductory Econometrics: A Modern Approach” by Wooldridge.

load(file.path(dataDir, "wage1.Rdata"))
wages = data
wages.desc = desc

This gives data on wage (hourly earning) and various other variables; data collected from 1976.

The model M

M = lm(lwage ~ educ + exper + tenure, data = wages)
summary(M)

## 
## Call:
## lm(formula = lwage ~ educ + exper + tenure, data = wages)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.05802 -0.29645 -0.03265  0.28788  1.42809 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.284360   0.104190   2.729  0.00656 ** 
## educ        0.092029   0.007330  12.555  < 2e-16 ***
## exper       0.004121   0.001723   2.391  0.01714 *  
## tenure      0.022067   0.003094   7.133 3.29e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4409 on 522 degrees of freedom
## Multiple R-squared:  0.316,  Adjusted R-squared:  0.3121 
## F-statistic: 80.39 on 3 and 522 DF,  p-value: < 2.2e-16

The model m

m = lm(lwage ~ educ + tenure, data = wages)
summary(m)

## 
## Call:
## lm(formula = lwage ~ educ + tenure, data = wages)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.10350 -0.29287 -0.04081  0.28672  1.44967 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.404474   0.091696   4.411 1.25e-05 ***
## educ        0.086528   0.006991  12.377  < 2e-16 ***
## tenure      0.025814   0.002680   9.634  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4428 on 523 degrees of freedom
## Multiple R-squared:  0.3085, Adjusted R-squared:  0.3059 
## F-statistic: 116.7 on 2 and 523 DF,  p-value: < 2.2e-16

Calculating the F-statistic.

rss.M = sum((M$residuals)^2)
rss.M

## [1] 101.4556

p = 3
rss.m = sum((m$residuals)^2)
rss.m

## [1] 102.5671

q = 2
n = nrow(wages)
F = ((rss.m - rss.M)/(p-q))/(rss.M/(n-p-1))
F

## [1] 5.718971

Plotting the density of the \(F\)-distribution

d1 = p-q
d2 = n-p-1
xx = seq(0, 10, 0.01)
plot(xx, df(xx, d1, d2), type = "l", xlab = "x", ylab = "F-density")
abline(v = F)

plot of chunk unnamed-chunk-120

Computing the p-value

1 - pf(F, d1, d2)

## [1] 0.01713562

p-value for testing H_0: beta_2 = 0 from summary:

summary(M)

## 
## Call:
## lm(formula = lwage ~ educ + exper + tenure, data = wages)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.05802 -0.29645 -0.03265  0.28788  1.42809 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.284360   0.104190   2.729  0.00656 ** 
## educ        0.092029   0.007330  12.555  < 2e-16 ***
## exper       0.004121   0.001723   2.391  0.01714 *  
## tenure      0.022067   0.003094   7.133 3.29e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4409 on 522 degrees of freedom
## Multiple R-squared:  0.316,  Adjusted R-squared:  0.3121 
## F-statistic: 80.39 on 3 and 522 DF,  p-value: < 2.2e-16

Testing via the anova function:

anova(m, M)

## Analysis of Variance Table
## 
## Model 1: lwage ~ educ + tenure
## Model 2: lwage ~ educ + exper + tenure
##   Res.Df    RSS Df Sum of Sq     F  Pr(>F)  
## 1    523 102.57                             
## 2    522 101.46  1    1.1115 5.719 0.01714 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The model M

M = lm(lcrime ~ lenroll + lpolice, data = campus)

The model m

m = lm(lcrime ~ offset(1*lenroll) + lpolice, data = campus)

Testing m against M

anova(m, M)

## Analysis of Variance Table
## 
## Model 1: lcrime ~ offset(1 * lenroll) + lpolice
## Model 2: lcrime ~ lenroll + lpolice
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1     95 67.589                           
## 2     94 67.387  1   0.20249 0.2825 0.5963

The model M

M = lm(lwage ~ jc + univ + exper, data = ty)

The submodel m

m = lm(lwage ~ I(jc + univ) + exper, data = ty)

Test

anova(m, M)

## Analysis of Variance Table
## 
## Model 1: lwage ~ I(jc + univ) + exper
## Model 2: lwage ~ jc + univ + exper
##   Res.Df    RSS Df Sum of Sq     F Pr(>F)
## 1   6760 1250.9                          
## 2   6759 1250.5  1   0.39853 2.154 0.1422

M = lm(lsalary ~ years + gamesyr + bavg + hrunsyr + rbisyr, data = bb)

m = lm(lsalary ~ years + gamesyr, data = bb)

Test

anova(m, M)

## Analysis of Variance Table
## 
## Model 1: lsalary ~ years + gamesyr
## Model 2: lsalary ~ years + gamesyr + bavg + hrunsyr + rbisyr
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1    350 198.31                                  
## 2    347 183.19  3    15.125 9.5503 4.474e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Illustrating the F statistic given by R in summary()

M = lm(lsalary ~ years + gamesyr + bavg + hrunsyr + rbisyr, data = bb)
summary(M)

## 
## Call:
## lm(formula = lsalary ~ years + gamesyr + bavg + hrunsyr + rbisyr, 
##     data = bb)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -3.02508 -0.45034 -0.04013  0.47014  2.68924 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.119e+01  2.888e-01  38.752  < 2e-16 ***
## years       6.886e-02  1.211e-02   5.684 2.79e-08 ***
## gamesyr     1.255e-02  2.647e-03   4.742 3.09e-06 ***
## bavg        9.786e-04  1.104e-03   0.887    0.376    
## hrunsyr     1.443e-02  1.606e-02   0.899    0.369    
## rbisyr      1.077e-02  7.175e-03   1.500    0.134    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7266 on 347 degrees of freedom
## Multiple R-squared:  0.6278, Adjusted R-squared:  0.6224 
## F-statistic: 117.1 on 5 and 347 DF,  p-value: < 2.2e-16

Obtaining the F-statistic given by R summary via the anova function

m = lm(lsalary ~ 1, data = bb)
summary(m)

## 
## Call:
## lm(formula = lsalary ~ 1, data = bb)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.89308 -1.04867 -0.06971  1.13426  2.16850 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.49218    0.06294   214.4   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.182 on 352 degrees of freedom

anova(m, M)

## Analysis of Variance Table
## 
## Model 1: lsalary ~ 1
## Model 2: lsalary ~ years + gamesyr + bavg + hrunsyr + rbisyr
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1    352 492.18                                  
## 2    347 183.19  5    308.99 117.06 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Back to the bodyfat datset

body = read.csv(file.path(dataDir, "bodyfat_short.csv"), header = T)
#Let us fit a linear regression equation to bodyfat based on Age, Weight, Height, Chest, Abdomen, Hip and Thigh:
md = lm(BODYFAT ~ AGE + WEIGHT + HEIGHT + CHEST + ABDOMEN + HIP + THIGH, data = body)
summary(md)

## 
## Call:
## lm(formula = BODYFAT ~ AGE + WEIGHT + HEIGHT + CHEST + ABDOMEN + 
##     HIP + THIGH, data = body)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.0729  -3.2387  -0.0782   3.0623  10.3611 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.748e+01  1.449e+01  -2.585  0.01031 *  
## AGE          1.202e-02  2.934e-02   0.410  0.68246    
## WEIGHT      -1.392e-01  4.509e-02  -3.087  0.00225 ** 
## HEIGHT      -1.028e-01  9.787e-02  -1.051  0.29438    
## CHEST       -8.312e-04  9.989e-02  -0.008  0.99337    
## ABDOMEN      9.685e-01  8.531e-02  11.352  < 2e-16 ***
## HIP         -1.834e-01  1.448e-01  -1.267  0.20648    
## THIGH        2.857e-01  1.362e-01   2.098  0.03693 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.438 on 244 degrees of freedom
## Multiple R-squared:  0.7266, Adjusted R-squared:  0.7187 
## F-statistic: 92.62 on 7 and 244 DF,  p-value: < 2.2e-16

Confidence interval

n = nrow(body)
p = 7
t.cut.off = qt(0.975, (n-p-1))
t.cut.off

## [1] 1.969734

The 95% confidence interval for the true regression coefficient corresponding to ABDOMEN is:

lci = 0.9685 - t.cut.off*(0.08531)
uci = 0.9685 + t.cut.off*(0.08531)
c(lci, uci)

## [1] 0.800462 1.136538

Confidence intervals for all the parameters:

confint(md)

##                    2.5 %      97.5 %
## (Intercept) -66.02663751 -8.92482947
## AGE          -0.04577114  0.06980505
## WEIGHT       -0.22800670 -0.05039445
## HEIGHT       -0.29563565  0.08993870
## CHEST        -0.19757912  0.19591678
## ABDOMEN       0.80042723  1.13649684
## HIP          -0.46849414  0.10177426
## THIGH         0.01747363  0.55397187

Prediction

x0 = c(1, 30, 180, 70, 95, 90, 100, 60)
pred.bodyfat = sum(x0*md$coefficients)
pred.bodyfat

## [1] 16.51927

Intervals using the predict function

x0 = data.frame(AGE = 30, WEIGHT = 180, HEIGHT = 70, CHEST = 95, ABDOMEN = 90, HIP = 100, THIGH = 60)
predict(md, x0, interval = "confidence")

##        fit      lwr      upr
## 1 16.51927 15.20692 17.83162

predict(md, x0, interval = "prediction")

##        fit      lwr      upr
## 1 16.51927 7.678715 25.35983

The car seat position dataset

library(faraway)
data(seatpos)

help(seatpos)

names(seatpos)

## [1] "Age"       "Weight"    "HtShoes"   "Ht"        "Seated"    "Arm"      
## [7] "Thigh"     "Leg"       "hipcenter"

pairs(seatpos)

plot of chunk unnamed-chunk-142

g = lm(hipcenter ~ ., seatpos)
summary(g)

## 
## Call:
## lm(formula = hipcenter ~ ., data = seatpos)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -73.827 -22.833  -3.678  25.017  62.337 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 436.43213  166.57162   2.620   0.0138 *
## Age           0.77572    0.57033   1.360   0.1843  
## Weight        0.02631    0.33097   0.080   0.9372  
## HtShoes      -2.69241    9.75304  -0.276   0.7845  
## Ht            0.60134   10.12987   0.059   0.9531  
## Seated        0.53375    3.76189   0.142   0.8882  
## Arm          -1.32807    3.90020  -0.341   0.7359  
## Thigh        -1.14312    2.66002  -0.430   0.6706  
## Leg          -6.43905    4.71386  -1.366   0.1824  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 37.72 on 29 degrees of freedom
## Multiple R-squared:  0.6866, Adjusted R-squared:  0.6001 
## F-statistic:  7.94 on 8 and 29 DF,  p-value: 1.306e-05

Stepwise Procedures We illustrate the backward method - at each stage we remove the predictor with the largest p-value over a critical value (say, chosen to be 0.05):

g <- update(g, . ~ . - Ht)
summary(g)

## 
## Call:
## lm(formula = hipcenter ~ Age + Weight + HtShoes + Seated + Arm + 
##     Thigh + Leg, data = seatpos)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -74.107 -22.467  -4.207  25.106  62.225 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 436.84207  163.64104   2.670   0.0121 *
## Age           0.76574    0.53590   1.429   0.1634  
## Weight        0.02897    0.32244   0.090   0.9290  
## HtShoes      -2.13409    2.53896  -0.841   0.4073  
## Seated        0.54959    3.68958   0.149   0.8826  
## Arm          -1.30087    3.80833  -0.342   0.7350  
## Thigh        -1.09039    2.46534  -0.442   0.6615  
## Leg          -6.40612    4.60272  -1.392   0.1742  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 37.09 on 30 degrees of freedom
## Multiple R-squared:  0.6865, Adjusted R-squared:  0.6134 
## F-statistic: 9.385 on 7 and 30 DF,  p-value: 4.014e-06

g <- update(g, . ~ . - Weight)
summary(g)

## 
## Call:
## lm(formula = hipcenter ~ Age + HtShoes + Seated + Arm + Thigh + 
##     Leg, data = seatpos)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -74.263 -22.571  -4.842  24.647  61.926 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept) 427.5073   124.3877   3.437   0.0017 **
## Age           0.7757     0.5158   1.504   0.1427   
## HtShoes      -2.0823     2.4329  -0.856   0.3986   
## Seated        0.5858     3.6083   0.162   0.8721   
## Arm          -1.2826     3.7415  -0.343   0.7341   
## Thigh        -1.1153     2.4101  -0.463   0.6468   
## Leg          -6.3572     4.4966  -1.414   0.1674   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 36.49 on 31 degrees of freedom
## Multiple R-squared:  0.6864, Adjusted R-squared:  0.6257 
## F-statistic: 11.31 on 6 and 31 DF,  p-value: 1.122e-06

g <- update(g, . ~ . - Seated)
summary(g)

## 
## Call:
## lm(formula = hipcenter ~ Age + HtShoes + Arm + Thigh + Leg, data = seatpos)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -73.966 -22.403  -4.725  24.989  60.834 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 436.5463   109.5266   3.986 0.000365 ***
## Age           0.7667     0.5049   1.518 0.138717    
## HtShoes      -1.7716     1.4786  -1.198 0.239648    
## Arm          -1.3390     3.6683  -0.365 0.717498    
## Thigh        -1.1983     2.3193  -0.517 0.608955    
## Leg          -6.4910     4.3527  -1.491 0.145686    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 35.93 on 32 degrees of freedom
## Multiple R-squared:  0.6862, Adjusted R-squared:  0.6371 
## F-statistic: 13.99 on 5 and 32 DF,  p-value: 2.823e-07

g <- update(g, . ~ . - Arm)
summary(g)

## 
## Call:
## lm(formula = hipcenter ~ Age + HtShoes + Thigh + Leg, data = seatpos)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -77.069 -24.643  -3.584  26.092  59.182 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 445.7977   105.1452   4.240  0.00017 ***
## Age           0.6525     0.3910   1.669  0.10462    
## HtShoes      -1.9171     1.4050  -1.365  0.18164    
## Thigh        -1.3732     2.2392  -0.613  0.54391    
## Leg          -6.9502     4.1118  -1.690  0.10040    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 35.46 on 33 degrees of freedom
## Multiple R-squared:  0.6849, Adjusted R-squared:  0.6467 
## F-statistic: 17.93 on 4 and 33 DF,  p-value: 6.535e-08

g <- update(g, . ~ . - Thigh)
summary(g)

## 
## Call:
## lm(formula = hipcenter ~ Age + HtShoes + Leg, data = seatpos)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -79.269 -22.770  -4.342  21.853  60.907 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 456.2137   102.8078   4.438 9.09e-05 ***
## Age           0.5998     0.3779   1.587   0.1217    
## HtShoes      -2.3023     1.2452  -1.849   0.0732 .  
## Leg          -6.8297     4.0693  -1.678   0.1024    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 35.13 on 34 degrees of freedom
## Multiple R-squared:  0.6813, Adjusted R-squared:  0.6531 
## F-statistic: 24.22 on 3 and 34 DF,  p-value: 1.437e-08

g <- update(g, . ~ . - Age)
summary(g)

## 
## Call:
## lm(formula = hipcenter ~ HtShoes + Leg, data = seatpos)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -89.713 -25.787   2.549  18.445  71.735 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  493.794    102.192   4.832 2.66e-05 ***
## HtShoes       -2.496      1.266  -1.971   0.0566 .  
## Leg           -6.369      4.146  -1.536   0.1335    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 35.88 on 35 degrees of freedom
## Multiple R-squared:  0.6577, Adjusted R-squared:  0.6381 
## F-statistic: 33.62 on 2 and 35 DF,  p-value: 7.132e-09

g <- update(g, . ~ . - Leg)
summary(g)

## 
## Call:
## lm(formula = hipcenter ~ HtShoes, data = seatpos)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -99.981 -27.150   2.983  22.637  73.731 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 565.5927    92.5794   6.109 4.97e-07 ***
## HtShoes      -4.2621     0.5391  -7.907 2.21e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 36.55 on 36 degrees of freedom
## Multiple R-squared:  0.6346, Adjusted R-squared:  0.6244 
## F-statistic: 62.51 on 1 and 36 DF,  p-value: 2.207e-09

Forward Selection for the car seat position data

for( i in 1:8)
  {
    g1 <- lm(hipcenter ~ ., seatpos[,c(i,9)])
    print((summary(g1))$coef)
}

##                Estimate Std. Error   t value     Pr(>|t|)
## (Intercept) -192.964532 24.3015104 -7.940434 1.997902e-09
## Age            0.796289  0.6330851  1.257791 2.165650e-01
##              Estimate Std. Error     t value     Pr(>|t|)
## (Intercept)  1.242164 34.0562959  0.03647384 9.711060e-01
## Weight      -1.067438  0.2134036 -5.00196673 1.493391e-05
##               Estimate Std. Error   t value     Pr(>|t|)
## (Intercept) 565.592659 92.5794472  6.109268 4.966825e-07
## HtShoes      -4.262091  0.5390607 -7.906513 2.206673e-09
##               Estimate Std. Error   t value     Pr(>|t|)
## (Intercept) 556.255344 90.6704339  6.134914 4.590529e-07
## Ht           -4.264977  0.5351079 -7.970312 1.830624e-09
##               Estimate Std. Error   t value     Pr(>|t|)
## (Intercept) 621.823241 122.488378  5.076590 1.188644e-05
## Seated       -8.844124   1.374951 -6.432321 1.844619e-07
##             Estimate Std. Error   t value     Pr(>|t|)
## (Intercept) 168.5938  77.445423  2.176936 0.0361229045
## Arm         -10.3514   2.391242 -4.328881 0.0001142407
##               Estimate Std. Error   t value     Pr(>|t|)
## (Intercept) 186.890582  80.373022  2.325290 2.580801e-02
## Thigh        -9.100325   2.069128 -4.398145 9.290168e-05
##              Estimate Std. Error   t value     Pr(>|t|)
## (Intercept) 335.35118  65.601153  5.111971 1.066600e-05
## Leg         -13.79461   1.801321 -7.658051 4.587375e-09

The first selected predictor: Ht

for( i in c(1:3,5:8))
  {
     g2 <- lm(hipcenter ~ ., seatpos[,c(i,4,9)])
     print((summary(g2))$coef)
}

##                Estimate Std. Error   t value     Pr(>|t|)
## (Intercept) 526.9588909 92.2478769  5.712423 1.848352e-06
## Age           0.5210614  0.3862472  1.349036 1.859889e-01
## Ht           -4.2003806  0.5312787 -7.906172 2.691691e-09
##                Estimate  Std. Error    t value     Pr(>|t|)
## (Intercept) 589.9005691 127.4824497  4.6273081 4.918677e-05
## Weight        0.1148334   0.3020236  0.3802135 7.060846e-01
## Ht           -4.5696588   0.9671936 -4.7246577 3.675055e-05
##               Estimate Std. Error    t value     Pr(>|t|)
## (Intercept) 552.568840  95.754525  5.7706812 1.548571e-06
## HtShoes       1.230074   8.937649  0.1376284 8.913228e-01
## Ht           -5.490019   8.917605 -0.6156382 5.421163e-01
##               Estimate Std. Error    t value     Pr(>|t|)
## (Intercept) 538.486199 112.509336  4.7861468 3.055654e-05
## Seated        0.903040   3.301534  0.2735214 7.860601e-01
## Ht           -4.634962   1.457264 -3.1805911 3.074214e-03
##                Estimate Std. Error    t value     Pr(>|t|)
## (Intercept) 560.2221531 93.4055743  5.9977379 7.774137e-07
## Arm           0.6441307  2.7270290  0.2362024 8.146525e-01
## Ht           -4.4111641  0.8228605 -5.3607676 5.378845e-06
##                Estimate Std. Error     t value     Pr(>|t|)
## (Intercept) 555.6572893 92.5218032  6.00569023 7.588931e-07
## Thigh        -0.1346205  2.3075402 -0.05833939 9.538101e-01
## Ht           -4.2306633  0.8002714 -5.28653594 6.737776e-06
##               Estimate Std. Error   t value     Pr(>|t|)
## (Intercept) 491.243757  99.543358  4.934973 1.952182e-05
## Leg          -6.135518   4.164051 -1.473449 1.495675e-01
## Ht           -2.564612   1.268480 -2.021799 5.089444e-02

The second selected predictor: Leg Continue this. Regsubsets on the car seat position data:

library(leaps)

## Warning: package 'leaps' was built under R version 3.3.2

b = regsubsets(hipcenter ~ ., seatpos)
rs = summary(b)
rs$which

##   (Intercept)   Age Weight HtShoes    Ht Seated   Arm Thigh   Leg
## 1        TRUE FALSE  FALSE   FALSE  TRUE  FALSE FALSE FALSE FALSE
## 2        TRUE FALSE  FALSE   FALSE  TRUE  FALSE FALSE FALSE  TRUE
## 3        TRUE  TRUE  FALSE   FALSE  TRUE  FALSE FALSE FALSE  TRUE
## 4        TRUE  TRUE  FALSE    TRUE FALSE  FALSE FALSE  TRUE  TRUE
## 5        TRUE  TRUE  FALSE    TRUE FALSE  FALSE  TRUE  TRUE  TRUE
## 6        TRUE  TRUE  FALSE    TRUE FALSE   TRUE  TRUE  TRUE  TRUE
## 7        TRUE  TRUE   TRUE    TRUE FALSE   TRUE  TRUE  TRUE  TRUE
## 8        TRUE  TRUE   TRUE    TRUE  TRUE   TRUE  TRUE  TRUE  TRUE

Leave One Out Cross Validation Score for model using all explanatory variables.

n <- nrow(seatpos)
pred.y <- rep(NA,n) 
for (i in 1:nrow(seatpos))
  {
   g <- lm(hipcenter~., seatpos, subset=(1:n)[-i])
   pred.y[i] <- predict(g, seatpos[i,-9])
  }
cv.err.full <-  sum((seatpos[,9]-pred.y)^2)
cv.err.full

## [1] 75065.76

for (i in 1:nrow(seatpos))
  {
   g <- lm(hipcenter ~ Age+Ht+Leg, seatpos, subset=(1:n)[-i])
   pred.y[i] <- predict(g, seatpos[i,c("Age","Ht","Leg")])
  }
cv.err.Age.Ht.Leg <-  sum((seatpos[,9]-pred.y)^2)
cv.err.Age.Ht.Leg

## [1] 53794.79

Back to the bodyfat datset

body = read.csv(file.path(dataDir, "bodyfat_short.csv"), header = T)
#Let us fit a linear regression equation to bodyfat based on Age, Weight, Height, Chest, Abdomen, Hip and Thigh:
md = lm(BODYFAT ~ AGE + WEIGHT + HEIGHT + CHEST + ABDOMEN + HIP + THIGH, data = body)
summary(md)

## 
## Call:
## lm(formula = BODYFAT ~ AGE + WEIGHT + HEIGHT + CHEST + ABDOMEN + 
##     HIP + THIGH, data = body)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.0729  -3.2387  -0.0782   3.0623  10.3611 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.748e+01  1.449e+01  -2.585  0.01031 *  
## AGE          1.202e-02  2.934e-02   0.410  0.68246    
## WEIGHT      -1.392e-01  4.509e-02  -3.087  0.00225 ** 
## HEIGHT      -1.028e-01  9.787e-02  -1.051  0.29438    
## CHEST       -8.312e-04  9.989e-02  -0.008  0.99337    
## ABDOMEN      9.685e-01  8.531e-02  11.352  < 2e-16 ***
## HIP         -1.834e-01  1.448e-01  -1.267  0.20648    
## THIGH        2.857e-01  1.362e-01   2.098  0.03693 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.438 on 244 degrees of freedom
## Multiple R-squared:  0.7266, Adjusted R-squared:  0.7187 
## F-statistic: 92.62 on 7 and 244 DF,  p-value: < 2.2e-16

Regression diagnostics:

par(mfrow = c(2, 2))
plot(md)

plot of chunk unnamed-chunk-150

par(mfrow = c(1, 1))

body[c(39, 42, 36),]

##    BODYFAT AGE WEIGHT HEIGHT CHEST ABDOMEN   HIP THIGH
## 39    35.2  46 363.15  72.25 136.2   148.1 147.7  87.3
## 42    32.9  44 205.00  29.50 106.0   104.3 115.5  70.6
## 36    40.1  49 191.75  65.00 118.5   113.1 113.8  61.9

apply(body, 2, mean)

##   BODYFAT       AGE    WEIGHT    HEIGHT     CHEST   ABDOMEN       HIP 
##  19.15079  44.88492 178.92440  70.14881 100.82421  92.55595  99.90476 
##     THIGH 
##  59.40595

Non-linearity

n = 200
xx = 3 + 4*abs(rnorm(n))
yy = -2 + 0.5*xx^(1.85) + rnorm(n)
m1 = lm(yy~xx)
par(mfrow = c(2, 2))
plot(m1)

plot of chunk unnamed-chunk-152

par(mfrow = c(1, 1))
plot(yy~xx)

plot of chunk unnamed-chunk-152

Including square of x as another explanatory variable

m2 = lm(yy ~ xx + I(xx^2))
summary(m2)

## 
## Call:
## lm(formula = yy ~ xx + I(xx^2))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.76377 -0.73391 -0.00791  0.74016  2.62428 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.326461   0.458585  -7.254 9.07e-12 ***
## xx           0.800597   0.131967   6.067 6.57e-09 ***
## I(xx^2)      0.285887   0.008715  32.804  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.046 on 197 degrees of freedom
## Multiple R-squared:  0.9927, Adjusted R-squared:  0.9927 
## F-statistic: 1.345e+04 on 2 and 197 DF,  p-value: < 2.2e-16

par(mfrow = c(2, 2))
plot(m2)

plot of chunk unnamed-chunk-153

par(mfrow = c(1, 1))

An example involving heteroscedasticity

n = 200
xx = 3 + 4*abs(rnorm(n))
yy = -2 + 5*xx + (xx^(1.5))*rnorm(n)
m1 = lm(yy ~ xx)
par(mfrow = c(2, 2))
plot(m1)

plot of chunk unnamed-chunk-154

par(mfrow = c(1, 1))
plot(yy~xx)

plot of chunk unnamed-chunk-154