This dataset contains protein coding genes from the mouse genome along with their chromosomal coordinates and GC content.
Note: Non-standard chromosomes and ~1000 genes 200kb and longer have been removed to make the data easier to work with
library(ggplot2)
# install.packages("dplyr") # only do this once
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Import the “mousegenes.csv” file into an object called ‘mgenes’ and take a look at it.
mgenes = read.csv("mouse_genes.csv", stringsAsFactors = FALSE)
str(mgenes)
## 'data.frame': 19699 obs. of 5 variables:
## $ Id : chr "ENSMUSG00000024566" "ENSMUSG00000022652" "ENSMUSG00000068663" "ENSMUSG00000051586" ...
## $ Chr : chr "chr18" "chr16" "chr16" "chr6" ...
## $ Start : int 80734141 48431237 10545339 120931707 83925145 30174482 97689824 4920540 83866712 194618218 ...
## $ End : int 80934060 48630900 10744878 121130999 84124240 30373375 97888707 5119299 84065379 194816869 ...
## $ GC_Percent: num 41 41.4 47.4 49.8 41.5 ...
Add a column called ‘Gene_Length_kb’ containing gene lengths.
lengths = round((abs(mgenes$Start-mgenes$End) + 1)/1000, digits = 1)
mgenes$Gene_Length_kb = lengths
tail(mgenes)
## Id Chr Start End GC_Percent Gene_Length_kb
## 19694 ENSMUSG00000095655 chrX 11315158 11315475 51.57 0.3
## 19695 ENSMUSG00000099443 chrX 11321492 11321809 51.57 0.3
## 19696 ENSMUSG00000095445 chrX 11324659 11324976 51.57 0.3
## 19697 ENSMUSG00000064288 chr13 21750194 21750505 65.38 0.3
## 19698 ENSMUSG00000043982 chr16 88884786 88885089 55.92 0.3
## 19699 ENSMUSG00000045566 chr3 92500263 92500493 51.52 0.2
Use ggplot to make a histogram of gene lengths.
boundary = 0, closed = "left"
After you are happy with the graph, store it in an object called “mgenes_hist” and take a look at it.
mgenes_hist = ggplot(mgenes, aes(x=Gene_Length_kb)) +
geom_histogram(col="black",fill="lightblue", alpha=0.5,
boundary = 0, closed = "left", binwidth = 5) +
xlab("Mouse genes (kb)") +
ylab("Frequency")
mgenes_hist
Compute the mean and median of the distribution. Then, add some vertical lines to the “genes_hist” graph using geom_vline()
:
linetype="dashed"
)Now take a look at the graph again. Do these data look skewed?
mean(mgenes$Gene_Length_kb)
## [1] 31.62101
median(mgenes$Gene_Length_kb)
## [1] 16.8
mgenes_hist = mgenes_hist +
geom_vline(aes(xintercept = mean(Gene_Length_kb)),
color="blue", linetype="dashed") +
geom_vline(aes(xintercept = median(Gene_Length_kb)),
color="blue")
mgenes_hist
Add text labels to the lines calling them “mean” and “median” using geom_text()
. Below we’ve added all the fancy bits for you!
mgenes_hist = mgenes_hist + # original plot
geom_text(aes(x= median(Gene_Length_kb) , # add this
y=3000, label="Median"),
color="blue", angle=90, vjust = 1.5) +
geom_text(aes(x= mean(Gene_Length_kb), # add this
y=3000, label="Mean"),
color="blue", angle=90, vjust = 1.5)
mgenes_hist
Use the summary()
and IQR()
function to find the IQR of gene lengths, and make a boxplot of gene lengths (use base R or ggplot, as you like).
summary(mgenes$Gene_Length_kb)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.20 6.70 16.80 31.62 40.60 199.90
IQR(mgenes$Gene_Length_kb)
## [1] 33.9
# ========================================================================= #
# base R
boxplot(mgenes$Gene_Length_kb)
# ggplot
ggplot(data = mgenes, mapping = aes(y=Gene_Length_kb)) +
geom_boxplot()
summary()
to get a vector of stats for the distribution and call it something short for convenience.# x is now a vector of stats for the distribution with named elements
x = summary(mgenes$Gene_Length_kb)
# calculate the highest and lowest values that are not outliers
# hint: where do the whiskers typically end on a boxplot?
min_len = x["1st Qu."] - 1.5*IQR(mgenes$Gene_Length_kb)
max_len = x[5] + 1.5*IQR(mgenes$Gene_Length_kb)
c(min_len,max_len)
## 1st Qu. 3rd Qu.
## -44.15 91.45
# ========================================================================= #
# get trimmed data using base R
trimmed = mgenes[ which(mgenes$Gene_Length_kb >= min_len & # filter rows
mgenes$Gene_Length_kb <= max_len),
"Gene_Length_kb" ] # select column
str(trimmed)
## num [1:17967] 91.4 91.4 91.4 91.4 91.3 91.3 91.3 91.3 91.2 91.2 ...
is(trimmed)
## [1] "numeric" "vector"
# OR, get trimmed data use dplyr
#library(dplyr)
trimmed = mgenes %>% # data frame
filter(mgenes$Gene_Length_kb >= min_len & # filter by rows
mgenes$Gene_Length_kb <= max_len) %>%
select(Gene_Length_kb) # select specific columns
# take a look at the result
str(trimmed)
## 'data.frame': 17967 obs. of 1 variable:
## $ Gene_Length_kb: num 91.4 91.4 91.4 91.4 91.3 91.3 91.3 91.3 91.2 91.2 ...
is(trimmed)
## [1] "data.frame" "list" "oldClass" "vector"
# compare trimmed estimates to previous values
mean(mgenes$Gene_Length_kb)
## [1] 31.62101
mean(trimmed$Gene_Length_kb)
## [1] 22.04246
median(mgenes$Gene_Length_kb)
## [1] 16.8
median(trimmed$Gene_Length_kb)
## [1] 14.4
geom
layer, and adding the aes
mapping to that new dataset.# add trimmed estimates to our histogram
mgenes_hist + # uses the "mgenes" data frame
# uses the "trimmed" data frame
geom_histogram(data = trimmed, aes(x=Gene_Length_kb),
col="black",fill="salmon", alpha=0.2,
binwidth = 5) +
# also using "trimmed" data frame
geom_vline(data = trimmed, aes(xintercept = mean(Gene_Length_kb)),
color="red", linetype="dashed") +
geom_vline(data = trimmed, aes(xintercept = median(Gene_Length_kb)),
color="red")
Look at the variation in gene lengths among chromosomes by generating a ggplot containing separate boxplots for each chromosome.
Add some horizontal lines showing the limits of the IQR for all genes together.
ggplot( data=mgenes, (aes(x=Chr, y=Gene_Length_kb, color=Chr)) ) +
geom_boxplot() +
# 1st and 3rd quartiles of combined data
geom_hline(data = mgenes, aes(yintercept = summary(mgenes$Gene_Length_kb)[5]),
color="black", linetype="dashed") +
geom_hline(data = mgenes, aes(yintercept = summary(mgenes$Gene_Length_kb)[2]),
color="black", linetype="dashed")
## Warning: Use of `mgenes$Gene_Length_kb` is discouraged. Use `Gene_Length_kb`
## instead.
## Warning: Use of `mgenes$Gene_Length_kb` is discouraged. Use `Gene_Length_kb`
## instead.
Make a bar plot of the total number of genes on each chromosome.
# dplyr
genes_per_chr = mgenes %>% count(Chr)
head(genes_per_chr)
## Chr n
## 1 chr1 1075
## 2 chr10 913
## 3 chr11 1523
## 4 chr12 567
## 5 chr13 739
## 6 chr14 692
ggplot( data=genes_per_chr, (aes(x=Chr, y=n, fill=Chr)) ) +
geom_col() +
labs(x="Chromosome", y="Total genes per chromosome") +
# rotate axis tick labels to make them legible
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
Which chromosome has the fewest genes? Is this consistent with your background knowledge?
We don’t actually have this information in our table, but let’s use the largest gene End
coordinate for each chromosome as a proxy for chromosome length.
Since chromosomes are really big, report the size in megabases (Mb).
# approximate chr lengths
# dplyr
chr_length = mgenes %>% # original data frame
group_by(Chr) %>% # group by chromosome
summarise(chr_size = max(End)) # get upper coordinate limit for each chr
str(chr_length) # a new data frame with 2 columns
## tibble [21 × 2] (S3: tbl_df/tbl/data.frame)
## $ Chr : chr [1:21] "chr1" "chr10" "chr11" "chr12" ...
## $ chr_size: int [1:21] 195176716 130497396 121843423 120021246 120318178 122984035 103899312 98031801 94750019 90592528 ...
# ========================================================================= #
# dot plot, default (alphabetical) order of chromosomes
ggplot( data=chr_length, (aes(x=Chr,
y=chr_size/1000000, col=Chr)) ) +
geom_point() +
xlab("Chromosome") +
ylab("Chromosome size (est., in Mb)") +
# rotate axis tick labels to make them legible
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
# ========================================================================= #
# dot plot, sorted by chromosome size
ggplot( data=chr_length, (aes(x=reorder(Chr,chr_size),
y=chr_size/1000000, col=Chr)) ) +
geom_point() +
xlab("Chromosome") + # could also use lab(x= , y= )
ylab("Chromosome size (est., in Mb)") +
# rotate axis tick labels to make them legible
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
Which is the smallest chromosome? Is the chromosome with the fewest genes also the smallest? What do you think is going on?
Calculate the standard deviation for gene lengths and gene GC-content, then calculate the coefficient of variation to compare the variation in gene lengths with the variation in gene GC-content. Which one is more variable?
# sd
sd(mgenes$Gene_Length_kb)
## [1] 37.84557
sd(mgenes$GC_Percent)
## [1] 6.010397
# cv = ( sd / mean ) * 100
cv_gene_length = sd(mgenes$Gene_Length_kb) / mean(mgenes$Gene_Length_kb) * 100
cv_gc = sd(mgenes$GC_Percent) / mean(mgenes$GC_Percent) * 100
cv_gene_length
## [1] 119.6849
cv_gc
## [1] 13.07665
summary(mgenes$GC_Percent)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 29.92 41.47 45.60 45.96 50.29 73.13