# R has a built-in data set called "trees":
# per ?trees, it records girth, height and timber volume for 31 felled black cherry trees
trees
##    Girth Height Volume
## 1    8.3     70   10.3
## 2    8.6     65   10.3
## 3    8.8     63   10.2
## 4   10.5     72   16.4
## 5   10.7     81   18.8
## 6   10.8     83   19.7
## 7   11.0     66   15.6
## 8   11.0     75   18.2
## 9   11.1     80   22.6
## 10  11.2     75   19.9
## 11  11.3     79   24.2
## 12  11.4     76   21.0
## 13  11.4     76   21.4
## 14  11.7     69   21.3
## 15  12.0     75   19.1
## 16  12.9     74   22.2
## 17  12.9     85   33.8
## 18  13.3     86   27.4
## 19  13.7     71   25.7
## 20  13.8     64   24.9
## 21  14.0     78   34.5
## 22  14.2     80   31.7
## 23  14.5     74   36.3
## 24  16.0     72   38.3
## 25  16.3     77   42.6
## 26  17.3     81   55.4
## 27  17.5     82   55.7
## 28  17.9     80   58.3
## 29  18.0     80   51.5
## 30  18.0     80   51.0
## 31  20.6     87   77.0
# inspect the structure: a data.frame with three numeric columns and 31 observations
str(trees)
## 'data.frame':    31 obs. of  3 variables:
##  $ Girth : num  8.3 8.6 8.8 10.5 10.7 10.8 11 11 11.1 11.2 ...
##  $ Height: num  70 65 63 72 81 83 66 75 80 75 ...
##  $ Volume: num  10.3 10.3 10.2 16.4 18.8 19.7 15.6 18.2 22.6 19.9 ...
# it is more convenient to refer to the columns of the data directly (e.g. Height)
# instead of having to write something like trees$Height - attach the data set to enable this
# NOTE(review): attach() is generally discouraged in non-interactive code — it puts the
# columns on the search path, where they can mask (or be masked by) other objects and
# cause hard-to-debug name clashes. Prefer trees$Height, with(trees, ...), or the
# data = trees argument of modelling functions. It is kept here only because the rest
# of this teaching script refers to the bare column names Height and Volume.
attach(trees)

# Let us explore the relationship between trunk volume and height.
# Always start by plotting the raw data. The formula interface (y ~ x)
# reads as "Volume as a function of Height" and, like the positional
# plot(x = Height, y = Volume) call, labels the axes with the variable names.
plot(Volume ~ Height)

# Volume and Height look like they might co-vary; test for (Pearson) correlation.
# The null hypothesis is that the true correlation equals 0.
cor.test(Height, Volume)
## 
##  Pearson's product-moment correlation
## 
## data:  Height and Volume
## t = 4.0205, df = 29, p-value = 0.0003784
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3095235 0.7859756
## sample estimates:
##       cor 
## 0.5982497
# the two variables do indeed co-vary: r ~= 0.60 with p ~= 0.0004,
# so we reject the null hypothesis of zero correlation;
# moreover, Volume should depend on Height (think of how the volume of a cylinder is measured),
# so let us build a regression model where Volume is dependent on Height

# Least-squares estimates computed from the closed-form solution:
#   slope     = cov(x, y) / var(x)
#   intercept chosen so the fitted line passes through (mean(x), mean(y))
slope <- cov(Height, Volume) / var(Height)
intercept <- mean(Volume) - slope * mean(Height)

# inspect both estimates (explicit print instead of relying on auto-printing)
print(slope)
## [1] 1.54335
print(intercept)
## [1] -87.12361

# overlay the fitted line on the scatterplot created earlier;
# abline's first two arguments are the intercept (a) and the slope (b)
abline(intercept, slope)

# rather than performing "manual" calculations, lm() can do all this for you.
# Note the coefficients below match the manual slope/intercept computed above.
our_model <- lm(formula = Volume ~ Height)
# printing an "lm" object shows the call and the fitted coefficients
our_model
## 
## Call:
## lm(formula = Volume ~ Height)
## 
## Coefficients:
## (Intercept)       Height  
##     -87.124        1.543
# summary() adds residual quantiles, standard errors, t-tests for each
# coefficient, R-squared and the overall F-test
summary(our_model)
## 
## Call:
## lm(formula = Volume ~ Height)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -21.274  -9.894  -2.894  12.068  29.852 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -87.1236    29.2731  -2.976 0.005835 ** 
## Height        1.5433     0.3839   4.021 0.000378 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.4 on 29 degrees of freedom
## Multiple R-squared:  0.3579, Adjusted R-squared:  0.3358 
## F-statistic: 16.16 on 1 and 29 DF,  p-value: 0.0003784
# moreover, R offers a special plotting method for the objects of the class "lm" (outputs of the lm() function):
# several diagnostic plots are produced (see ?plot.lm: residuals vs fitted, normal Q-Q,
# scale-location, residuals vs leverage — press <Return> to page through them interactively)
# which help you evaluate whether the assumptions of linear regression model (think L.I.N.E.) are fulfilled
plot(our_model)

# since the equality of variance is violated (it clearly increases with increasing Height),
# try transforming one or all of the variables;
# a log transform of the response is a common remedy when the spread grows with the fitted values
our_model_t <- lm(formula = log(Volume) ~ Height)

# re-run the diagnostic plots on the transformed model
plot(our_model_t)

# variance has become more equal,
# although the distribution of residuals still slightly deviates from normality (but it is close, so probably ok)