This dataset contains protein coding genes from the mouse genome along with their chromosomal coordinates and GC content.

Note: Non-standard chromosomes and ~1000 genes 200kb and longer have been removed to make the data easier to work with

0. Load required libraries (ggplot and dplyr)

library(ggplot2)
# install.packages("dplyr") # only do this once
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

1. Load the data

Import the “mousegenes.csv” file into an object called ‘mgenes’ and take a look at it.

mgenes = read.csv("mouse_genes.csv", stringsAsFactors = FALSE)
str(mgenes)
## 'data.frame':    19699 obs. of  5 variables:
##  $ Id        : chr  "ENSMUSG00000024566" "ENSMUSG00000022652" "ENSMUSG00000068663" "ENSMUSG00000051586" ...
##  $ Chr       : chr  "chr18" "chr16" "chr16" "chr6" ...
##  $ Start     : int  80734141 48431237 10545339 120931707 83925145 30174482 97689824 4920540 83866712 194618218 ...
##  $ End       : int  80934060 48630900 10744878 121130999 84124240 30373375 97888707 5119299 84065379 194816869 ...
##  $ GC_Percent: num  41 41.4 47.4 49.8 41.5 ...

2. Compute gene lengths

Add a column called ‘Gene_Length_kb’ containing gene lengths.

lengths = round((abs(mgenes$Start-mgenes$End) + 1)/1000, digits = 1)
mgenes$Gene_Length_kb = lengths
tail(mgenes)
##                       Id   Chr    Start      End GC_Percent Gene_Length_kb
## 19694 ENSMUSG00000095655  chrX 11315158 11315475      51.57            0.3
## 19695 ENSMUSG00000099443  chrX 11321492 11321809      51.57            0.3
## 19696 ENSMUSG00000095445  chrX 11324659 11324976      51.57            0.3
## 19697 ENSMUSG00000064288 chr13 21750194 21750505      65.38            0.3
## 19698 ENSMUSG00000043982 chr16 88884786 88885089      55.92            0.3
## 19699 ENSMUSG00000045566  chr3 92500263 92500493      51.52            0.2

3. Histogram of gene lengths

Use ggplot to make a histogram of gene lengths.

After you are happy with the graph, store it in an object called “mgenes_hist” and take a look at it.

mgenes_hist = ggplot(mgenes, aes(x=Gene_Length_kb)) + 
  geom_histogram(col="black",fill="lightblue", alpha=0.5, 
                 boundary = 0, closed = "left", binwidth = 5) + 
  xlab("Mouse genes (kb)") +
  ylab("Frequency")
mgenes_hist

4. Mean and median

Compute the mean and median of the distribution. Then, add some vertical lines to the “genes_hist” graph using geom_vline():

Now take a look at the graph again. Do these data look skewed?

mean(mgenes$Gene_Length_kb)
## [1] 31.62101
median(mgenes$Gene_Length_kb)
## [1] 16.8
mgenes_hist = mgenes_hist +
  geom_vline(aes(xintercept = mean(Gene_Length_kb)),
             color="blue", linetype="dashed") +
    geom_vline(aes(xintercept = median(Gene_Length_kb)),
             color="blue")
mgenes_hist

5. Label mean and median lines

Add text labels to the lines calling them “mean” and “median” using geom_text(). Below we’ve added all the fancy bits for you!

mgenes_hist = mgenes_hist +  # original plot
  
  geom_text(aes(x= median(Gene_Length_kb) , # add this
                y=3000, label="Median"),
            color="blue", angle=90, vjust = 1.5) +
  
  geom_text(aes(x= mean(Gene_Length_kb),    # add this
                y=3000, label="Mean"),
            color="blue", angle=90, vjust = 1.5)

mgenes_hist

6. IQR and boxplot of gene lengths

Use the summary() and IQR() function to find the IQR of gene lengths, and make a boxplot of gene lengths (use base R or ggplot, as you like).

summary(mgenes$Gene_Length_kb)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.20    6.70   16.80   31.62   40.60  199.90
IQR(mgenes$Gene_Length_kb)
## [1] 33.9
# ========================================================================= #
# base R
boxplot(mgenes$Gene_Length_kb)

# ggplot
ggplot(data = mgenes, mapping = aes(y=Gene_Length_kb)) +
  geom_boxplot()

7. Remove outliers

  1. Trim the dataset to remove outliers that fall beyond the whiskers in the boxplot.
# x is now a vector of stats for the distribution with named elements
x = summary(mgenes$Gene_Length_kb)

# calculate the highest and lowest values that are not outliers
# hint: where do the whiskers typically end on a boxplot?
min_len = x["1st Qu."] - 1.5*IQR(mgenes$Gene_Length_kb)
max_len = x[5]         + 1.5*IQR(mgenes$Gene_Length_kb)
c(min_len,max_len)
## 1st Qu. 3rd Qu. 
##  -44.15   91.45
# ========================================================================= #
# get trimmed data using base R
trimmed = mgenes[ which(mgenes$Gene_Length_kb >= min_len &  # filter rows
                  mgenes$Gene_Length_kb <= max_len), 
           "Gene_Length_kb" ]                               # select column
str(trimmed)
##  num [1:17967] 91.4 91.4 91.4 91.4 91.3 91.3 91.3 91.3 91.2 91.2 ...
is(trimmed)
## [1] "numeric" "vector"
# OR, get trimmed data use dplyr
#library(dplyr)
trimmed = mgenes %>%  # data frame
  
  filter(mgenes$Gene_Length_kb >= min_len  &     # filter by rows
         mgenes$Gene_Length_kb <= max_len) %>% 
  
  select(Gene_Length_kb)                         # select specific columns

# take a look at the result
str(trimmed)
## 'data.frame':    17967 obs. of  1 variable:
##  $ Gene_Length_kb: num  91.4 91.4 91.4 91.4 91.3 91.3 91.3 91.3 91.2 91.2 ...
is(trimmed)
## [1] "data.frame" "list"       "oldClass"   "vector"

8. Mean and median of trimmed data

  1. Recompute the mean and median for the trimmed data and compare to the original values. How did the mean and median change? Which one changed more, and why?
# compare trimmed estimates to previous values
mean(mgenes$Gene_Length_kb)
## [1] 31.62101
mean(trimmed$Gene_Length_kb)
## [1] 22.04246
median(mgenes$Gene_Length_kb)
## [1] 16.8
median(trimmed$Gene_Length_kb)
## [1] 14.4
  1. To visualize the changes, add the same kind of mean and median vlines for the “trimmed” data to “mgenes_hist”, using a reddish color.
# add trimmed estimates to our histogram

mgenes_hist +  # uses the "mgenes" data frame
  
  # uses the "trimmed" data frame
  geom_histogram(data = trimmed, aes(x=Gene_Length_kb),
                 col="black",fill="salmon", alpha=0.2, 
                  binwidth = 5) +
  
  # also using "trimmed" data frame
  geom_vline(data = trimmed, aes(xintercept = mean(Gene_Length_kb)),
             color="red", linetype="dashed") +
  geom_vline(data = trimmed, aes(xintercept = median(Gene_Length_kb)),
             color="red")

9. Variation in gene lengths by chromosome

Look at the variation in gene lengths among chromosomes by generating a ggplot containing separate boxplots for each chromosome.

Add some horizontal lines showing the limits of the IQR for all genes together.

ggplot( data=mgenes, (aes(x=Chr, y=Gene_Length_kb, color=Chr)) ) + 
  geom_boxplot() +

  # 1st and 3rd quartiles of combined data
  geom_hline(data = mgenes, aes(yintercept = summary(mgenes$Gene_Length_kb)[5]),
             color="black", linetype="dashed") +
  geom_hline(data = mgenes, aes(yintercept = summary(mgenes$Gene_Length_kb)[2]),
             color="black", linetype="dashed")
## Warning: Use of `mgenes$Gene_Length_kb` is discouraged. Use `Gene_Length_kb`
## instead.

## Warning: Use of `mgenes$Gene_Length_kb` is discouraged. Use `Gene_Length_kb`
## instead.

10. Genes per chromosome

Make a bar plot of the total number of genes on each chromosome.

# dplyr
genes_per_chr = mgenes %>% count(Chr)
head(genes_per_chr)
##     Chr    n
## 1  chr1 1075
## 2 chr10  913
## 3 chr11 1523
## 4 chr12  567
## 5 chr13  739
## 6 chr14  692
ggplot( data=genes_per_chr, (aes(x=Chr, y=n, fill=Chr)) ) + 
  geom_col() +
  labs(x="Chromosome", y="Total genes per chromosome") +
  
  # rotate axis tick labels to make them legible 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

Which chromosome has the fewest genes? Is this consistent with your background knowledge?

11. Chromosome size

We don’t actually have this information in our table, but let’s use the largest gene End coordinate for each chromosome as a proxy for chromosome length.

Since chromosomes are really big, report the size in megabases (Mb).

# approximate chr lengths
# dplyr
chr_length = mgenes %>%            # original data frame
  group_by(Chr) %>%                # group by chromosome
  summarise(chr_size = max(End))   # get upper coordinate limit for each chr

str(chr_length)                    # a new data frame with 2 columns
## tibble [21 × 2] (S3: tbl_df/tbl/data.frame)
##  $ Chr     : chr [1:21] "chr1" "chr10" "chr11" "chr12" ...
##  $ chr_size: int [1:21] 195176716 130497396 121843423 120021246 120318178 122984035 103899312 98031801 94750019 90592528 ...
# ========================================================================= #
# dot plot, default (alphabetical) order of chromosomes
ggplot( data=chr_length, (aes(x=Chr, 
                              y=chr_size/1000000, col=Chr)) ) + 
  geom_point() +
  xlab("Chromosome") + 
  ylab("Chromosome size (est., in Mb)") +

  # rotate axis tick labels to make them legible 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

# ========================================================================= #
# dot plot, sorted by chromosome size
ggplot( data=chr_length, (aes(x=reorder(Chr,chr_size), 
                              y=chr_size/1000000, col=Chr)) ) + 
  geom_point() +
  xlab("Chromosome") +                     # could also use lab(x= , y= )
  ylab("Chromosome size (est., in Mb)") +

  # rotate axis tick labels to make them legible 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

Which is the smallest chromosome? Is the chromosome with the fewest genes also the smallest? What do you think is going on?

12. SD and CV

Calculate the standard deviation for gene lengths and gene GC-content, then calculate the coefficient of variation to compare the variation in gene lengths with the variation in gene GC-content. Which one is more variable?

# sd
sd(mgenes$Gene_Length_kb)
## [1] 37.84557
sd(mgenes$GC_Percent)
## [1] 6.010397
# cv = ( sd / mean ) * 100
cv_gene_length = sd(mgenes$Gene_Length_kb) / mean(mgenes$Gene_Length_kb) * 100
cv_gc = sd(mgenes$GC_Percent) / mean(mgenes$GC_Percent) * 100
cv_gene_length
## [1] 119.6849
cv_gc
## [1] 13.07665
summary(mgenes$GC_Percent)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   29.92   41.47   45.60   45.96   50.29   73.13