# R has a built-in data set called "trees"
trees
##    Girth Height Volume
## 1    8.3     70   10.3
## 2    8.6     65   10.3
## 3    8.8     63   10.2
## 4   10.5     72   16.4
## 5   10.7     81   18.8
## 6   10.8     83   19.7
## 7   11.0     66   15.6
## 8   11.0     75   18.2
## 9   11.1     80   22.6
## 10  11.2     75   19.9
## 11  11.3     79   24.2
## 12  11.4     76   21.0
## 13  11.4     76   21.4
## 14  11.7     69   21.3
## 15  12.0     75   19.1
## 16  12.9     74   22.2
## 17  12.9     85   33.8
## 18  13.3     86   27.4
## 19  13.7     71   25.7
## 20  13.8     64   24.9
## 21  14.0     78   34.5
## 22  14.2     80   31.7
## 23  14.5     74   36.3
## 24  16.0     72   38.3
## 25  16.3     77   42.6
## 26  17.3     81   55.4
## 27  17.5     82   55.7
## 28  17.9     80   58.3
## 29  18.0     80   51.5
## 30  18.0     80   51.0
## 31  20.6     87   77.0
str(trees)
## 'data.frame': 31 obs. of 3 variables:
## $ Girth : num 8.3 8.6 8.8 10.5 10.7 10.8 11 11 11.1 11.2 ...
## $ Height: num 70 65 63 72 81 83 66 75 80 75 ...
## $ Volume: num 10.3 10.3 10.2 16.4 18.8 19.7 15.6 18.2 22.6 19.9 ...
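# optionally, summary() gives a quick numerical overview (range, quartiles, mean)
# of each column; this is just an extra exploration step, not required for what follows
summary(trees)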
# it is more convenient to refer to the columns of the data set directly (e.g. Height)
# instead of writing trees$Height every time - attach() the data set to enable this
attach(trees)
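# aside: attach() can mask other objects that share the column names;
# an equivalent, attach-free style (shown here only as a sketch) is to evaluate
# expressions inside the data frame with with(), or to use the data argument:
with(trees, cor(Height, Volume))
# e.g. lm(Volume ~ Height, data = trees) would fit the same model as the one below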
# let us explore the relationship between trunk volume and height:
# always start by plotting the data
plot(x = Height,
     y = Volume)
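# the same scatterplot with informative labels (purely cosmetic; the units
# follow the documentation in ?trees, the title text is our own choice)
plot(x = Height,
     y = Volume,
     xlab = "Height (ft)",
     ylab = "Volume (cubic ft)",
     main = "Black cherry trees")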
# Volume and Height look like they might co-vary, so test for correlation
cor.test(Height, Volume)
##
## Pearson's product-moment correlation
##
## data: Height and Volume
## t = 4.0205, df = 29, p-value = 0.0003784
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3095235 0.7859756
## sample estimates:
##       cor
## 0.5982497
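# for reference, the Pearson correlation can be reproduced by hand as the
# covariance scaled by the two standard deviations (a quick sanity check):
cov(Height, Volume) / (sd(Height) * sd(Volume))
# this matches the sample estimate above (about 0.598)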
# the two variables do indeed co-vary;
# moreover, Volume should depend on Height (think of how the volume of a cylinder is calculated),
# so let us build a regression model where Volume depends on Height
# first, calculate the slope and the intercept by hand using the least-squares formulas
slope <- cov(Height, Volume) / var(Height)
intercept <- mean(Volume) - slope * mean(Height)
slope
## [1] 1.54335
intercept
## [1] -87.12361
abline(a = intercept,
       b = slope)
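# as a cross-check, the least-squares slope can equivalently be written in terms
# of the correlation and the two standard deviations (same value as slope above):
cor(Height, Volume) * sd(Volume) / sd(Height)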
# rather than performing "manual" calculations, lm() can do all this for you
our_model <- lm(formula = Volume ~ Height)
our_model
##
## Call:
## lm(formula = Volume ~ Height)
##
## Coefficients:
## (Intercept)       Height
##     -87.124        1.543
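# the estimated coefficients and their confidence intervals can be extracted
# from the fitted object with the usual accessor functions:
coef(our_model)      # intercept and slope, matching the manual calculation above
confint(our_model)   # 95% confidence intervals for both coefficients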
summary(our_model)
##
## Call:
## lm(formula = Volume ~ Height)
##
## Residuals:
##     Min      1Q  Median      3Q     Max
## -21.274  -9.894  -2.894  12.068  29.852
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) -87.1236    29.2731  -2.976 0.005835 **
## Height        1.5433     0.3839   4.021 0.000378 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.4 on 29 degrees of freedom
## Multiple R-squared: 0.3579, Adjusted R-squared: 0.3358
## F-statistic: 16.16 on 1 and 29 DF, p-value: 0.0003784
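# the fitted model can also be used for prediction; as a sketch, the expected
# trunk volume for a height of 76 ft (an arbitrary example value within the
# observed range):
predict(our_model, newdata = data.frame(Height = 76))
# adding interval = "prediction" would also return a 95% prediction interval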
# moreover, R offers a special plotting method for objects of class "lm" (the output of lm()):
# several diagnostic plots are produced
# which help you evaluate whether the assumptions of the linear regression model (think L.I.N.E.) are fulfilled
plot(our_model)
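# by default the diagnostic plots are shown one at a time (hit <Return> to advance
# in an interactive session); to see the four standard panels on one page,
# set up a 2 x 2 plotting grid first
par(mfrow = c(2, 2))
plot(our_model)
par(mfrow = c(1, 1))  # restore the default single-panel layout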
# since the assumption of equal variance is violated (the spread of the residuals clearly increases with increasing Height),
# try transforming one or more of the variables
our_model_t <- lm(formula = log(Volume) ~ Height)
plot(our_model_t)
# the variance has become more homogeneous,
# although the distribution of the residuals still deviates slightly from normality (but it is close, so probably fine)
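# remember that our_model_t predicts log(Volume); as a sketch, a prediction on the
# original scale is obtained by exponentiating (again for an arbitrary height of 76 ft):
exp(predict(our_model_t, newdata = data.frame(Height = 76)))
# note: because the model is fitted on the log scale, this back-transformed value
# estimates the median rather than the mean of Volume at that height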