###
# the data set provided is a modified and trimmed version of the data taken from
# https://www.kaggle.com/rec3141/biol342-genome-data
# undergraduate students at the University of Alaska Fairbanks cultured, sequenced, and assembled
# the complete genome of microorganisms as part of the General Microbiology course.
# the data set contains tetranucleotide frequencies in each contig of each genome,
# along with some additional assembly statistics, such as contig length, %GC, and read coverage.
# in this exercise, let us explore the nucleotide composition of contigs in the first genome ("genome0")
# with the goal of identifying potential outliers / wrongly assemblied contigs / contamination from other species
###
# import data
my_df <- read.csv("/Users/bsierieb/Documents/GitHub/xdasi-bio-2021/docs/syllabus/week_15_PCA_Clustering/Class2/W15.C2_Exercise_Clustering_DATA.csv")
# explore data
str(my_df)
## 'data.frame': 67095 obs. of 261 variables:
## $ contig: chr "genome0_1" "genome0_10" "genome0_100" "genome0_1000" ...
## $ genome: chr "genome0" "genome0" "genome0" "genome0" ...
## $ cov : num 29.01 34.18 6.22 3.61 3.55 ...
## $ len : int 255873 96681 11623 1086 1085 1084 1084 1084 1083 1083 ...
## $ gc : num 0.499 0.524 0.478 0.38 0.374 ...
## $ AAAA : int 2122 737 129 8 13 3 7 24 3 27 ...
## $ AAAC : int 1416 546 76 7 5 5 4 17 4 8 ...
## $ AAAG : int 1386 528 107 1 7 4 4 16 2 15 ...
## $ AAAT : int 1575 495 57 11 7 3 6 13 10 17 ...
## $ AACA : int 1233 410 82 3 8 3 4 16 5 6 ...
## $ AACC : int 1109 411 62 4 7 5 5 5 1 3 ...
## $ AACG : int 1357 553 89 3 4 2 0 8 9 9 ...
## $ AACT : int 901 394 40 4 11 3 3 7 4 5 ...
## $ AAGA : int 966 384 49 3 7 5 5 7 3 9 ...
## $ AAGC : int 1074 499 87 4 4 4 3 12 4 7 ...
## $ AAGG : int 841 297 68 0 1 6 3 3 0 9 ...
## $ AAGT : int 735 300 54 6 5 2 6 8 3 12 ...
## $ AATA : int 1326 351 89 12 11 1 7 7 8 5 ...
## $ AATC : int 1148 395 55 13 10 4 4 5 5 5 ...
## $ AATG : int 1335 409 88 8 4 3 5 12 12 8 ...
## $ AATT : int 1179 361 59 13 8 2 3 9 9 13 ...
## $ ACAA : int 972 345 67 1 7 3 0 13 5 8 ...
## $ ACAC : int 580 189 33 3 3 3 3 4 4 4 ...
## $ ACAG : int 987 334 54 0 1 1 5 10 3 1 ...
## $ ACAT : int 743 246 24 7 6 1 5 4 7 7 ...
## $ ACCA : int 1293 354 67 4 4 1 5 5 3 1 ...
## $ ACCC : int 631 242 14 1 1 1 0 3 1 0 ...
## $ ACCG : int 1322 552 69 3 4 4 6 1 5 2 ...
## $ ACCT : int 691 250 10 5 5 5 1 2 2 3 ...
## $ ACGA : int 814 311 34 5 9 2 8 6 3 2 ...
## $ ACGC : int 1304 563 25 7 3 6 2 2 8 3 ...
## $ ACGG : int 1022 379 74 3 1 3 6 3 1 1 ...
## $ ACGT : int 764 365 25 9 1 1 2 6 4 11 ...
## $ ACTA : int 353 144 27 2 5 0 2 5 1 3 ...
## $ ACTC : int 458 205 14 3 0 2 2 1 1 0 ...
## $ ACTG : int 1170 454 40 2 6 3 2 8 3 6 ...
## $ ACTT : int 712 264 31 11 8 5 1 2 7 5 ...
## $ AGAA : int 1080 357 33 3 5 3 4 12 4 11 ...
## $ AGAC : int 591 203 25 4 2 2 3 2 0 2 ...
## $ AGAG : int 685 187 15 1 3 0 5 1 3 4 ...
## $ AGAT : int 881 315 45 1 2 2 1 3 3 6 ...
## $ AGCA : int 1293 459 84 4 3 1 6 10 6 7 ...
## $ AGCC : int 967 366 50 0 4 4 3 1 1 1 ...
## $ AGCG : int 1473 615 54 3 3 5 4 7 3 4 ...
## $ AGCT : int 659 267 57 1 6 2 3 10 6 4 ...
## $ AGGA : int 661 204 32 1 0 2 2 3 1 4 ...
## $ AGGC : int 917 361 61 1 0 2 5 2 0 0 ...
## $ AGGG : int 654 219 26 0 0 1 4 0 0 5 ...
## $ AGGT : int 747 274 72 1 1 6 2 5 0 6 ...
## $ AGTA : int 619 190 60 5 3 1 7 4 5 10 ...
## $ AGTC : int 565 186 12 1 1 0 2 0 1 2 ...
## $ AGTG : int 883 321 45 4 3 3 3 2 3 6 ...
## $ AGTT : int 917 298 44 3 3 2 6 5 2 3 ...
## $ ATAA : int 1350 380 61 9 11 2 2 7 10 8 ...
## $ ATAC : int 766 219 43 7 3 2 2 5 4 2 ...
## $ ATAG : int 579 143 22 2 2 2 3 3 3 1 ...
## $ ATAT : int 1052 303 47 7 9 5 7 8 5 6 ...
## $ ATCA : int 1499 513 56 9 8 1 5 3 5 5 ...
## $ ATCC : int 963 340 31 6 2 2 6 2 3 2 ...
## $ ATCG : int 1290 490 61 5 2 9 4 4 4 3 ...
## $ ATCT : int 823 340 28 10 15 2 2 4 8 2 ...
## $ ATGA : int 1235 398 39 2 4 5 9 7 7 8 ...
## $ ATGC : int 1214 460 85 4 0 3 8 3 10 3 ...
## $ ATGG : int 1182 432 56 3 3 6 3 10 1 4 ...
## $ ATGT : int 818 273 50 6 2 5 2 5 6 4 ...
## $ ATTA : int 1116 388 51 5 5 9 7 8 2 15 ...
## $ ATTC : int 881 324 39 8 5 9 5 0 7 6 ...
## $ ATTG : int 1261 438 87 5 4 7 7 4 5 6 ...
## $ ATTT : int 1486 441 48 12 13 4 14 8 10 9 ...
## $ CAAA : int 1416 485 81 4 4 5 4 14 2 11 ...
## $ CAAC : int 1157 451 94 2 7 2 3 7 6 5 ...
## $ CAAG : int 522 186 70 5 3 4 2 4 5 3 ...
## $ CAAT : int 1201 356 106 10 8 2 6 7 5 6 ...
## $ CACA : int 720 214 37 2 1 3 4 5 3 5 ...
## $ CACC : int 1154 399 33 5 4 1 2 5 5 0 ...
## $ CACG : int 857 324 16 12 3 6 6 0 2 4 ...
## $ CACT : int 724 274 21 6 4 3 2 1 4 1 ...
## $ CAGA : int 1280 392 40 0 3 0 3 5 3 1 ...
## $ CAGC : int 1950 679 80 1 2 5 5 10 5 1 ...
## $ CAGG : int 1390 554 63 1 0 3 7 2 0 1 ...
## $ CAGT : int 1259 386 60 3 2 2 4 1 2 3 ...
## $ CATA : int 758 182 17 8 3 1 4 5 6 5 ...
## $ CATC : int 1363 446 44 8 12 2 5 2 9 2 ...
## $ CATG : int 877 285 23 2 2 3 4 3 4 3 ...
## $ CATT : int 1222 428 59 8 9 5 6 2 9 7 ...
## $ CCAA : int 752 223 92 5 2 0 2 5 5 3 ...
## $ CCAC : int 899 288 22 4 2 4 3 0 3 0 ...
## $ CCAG : int 1906 579 28 1 0 2 3 2 4 2 ...
## $ CCAT : int 1141 326 45 5 6 4 5 2 5 0 ...
## $ CCCA : int 742 208 23 2 0 2 0 2 5 0 ...
## $ CCCC : int 372 117 6 2 4 3 0 1 0 0 ...
## $ CCCG : int 745 341 19 1 1 3 3 3 0 0 ...
## $ CCCT : int 358 140 6 2 2 1 0 1 1 0 ...
## $ CCGA : int 854 377 50 2 2 6 8 2 5 2 ...
## $ CCGC : int 1364 628 45 1 3 13 3 0 5 0 ...
## $ CCGG : int 1230 559 51 0 2 7 2 0 1 1 ...
## $ CCGT : int 957 427 43 5 4 9 6 3 3 1 ...
## $ CCTA : int 196 70 12 1 3 1 0 1 0 0 ...
## $ CCTC : int 334 155 7 1 4 3 0 0 2 0 ...
## [list output truncated]
head(my_df)
## contig genome cov len gc AAAA AAAC AAAG AAAT AACA AACC AACG
## 1 genome0_1 genome0 29.0114 255873 0.4995 2122 1416 1386 1575 1233 1109 1357
## 2 genome0_10 genome0 34.1807 96681 0.5237 737 546 528 495 410 411 553
## 3 genome0_100 genome0 6.2198 11623 0.4785 129 76 107 57 82 62 89
## 4 genome0_1000 genome0 3.6059 1086 0.3803 8 7 1 11 3 4 3
## 5 genome0_1001 genome0 3.5475 1085 0.3742 13 5 7 7 8 7 4
## 6 genome0_1002 genome0 5.0692 1084 0.5258 3 5 4 3 3 5 2
## AACT AAGA AAGC AAGG AAGT AATA AATC AATG AATT ACAA ACAC ACAG ACAT ACCA ACCC
## 1 901 966 1074 841 735 1326 1148 1335 1179 972 580 987 743 1293 631
## 2 394 384 499 297 300 351 395 409 361 345 189 334 246 354 242
## 3 40 49 87 68 54 89 55 88 59 67 33 54 24 67 14
## 4 4 3 4 0 6 12 13 8 13 1 3 0 7 4 1
## 5 11 7 4 1 5 11 10 4 8 7 3 1 6 4 1
## 6 3 5 4 6 2 1 4 3 2 3 3 1 1 1 1
## ACCG ACCT ACGA ACGC ACGG ACGT ACTA ACTC ACTG ACTT AGAA AGAC AGAG AGAT AGCA
## 1 1322 691 814 1304 1022 764 353 458 1170 712 1080 591 685 881 1293
## 2 552 250 311 563 379 365 144 205 454 264 357 203 187 315 459
## 3 69 10 34 25 74 25 27 14 40 31 33 25 15 45 84
## 4 3 5 5 7 3 9 2 3 2 11 3 4 1 1 4
## 5 4 5 9 3 1 1 5 0 6 8 5 2 3 2 3
## 6 4 5 2 6 3 1 0 2 3 5 3 2 0 2 1
## AGCC AGCG AGCT AGGA AGGC AGGG AGGT AGTA AGTC AGTG AGTT ATAA ATAC ATAG ATAT
## 1 967 1473 659 661 917 654 747 619 565 883 917 1350 766 579 1052
## 2 366 615 267 204 361 219 274 190 186 321 298 380 219 143 303
## 3 50 54 57 32 61 26 72 60 12 45 44 61 43 22 47
## 4 0 3 1 1 1 0 1 5 1 4 3 9 7 2 7
## 5 4 3 6 0 0 0 1 3 1 3 3 11 3 2 9
## 6 4 5 2 2 2 1 6 1 0 3 2 2 2 2 5
## ATCA ATCC ATCG ATCT ATGA ATGC ATGG ATGT ATTA ATTC ATTG ATTT CAAA CAAC CAAG
## 1 1499 963 1290 823 1235 1214 1182 818 1116 881 1261 1486 1416 1157 522
## 2 513 340 490 340 398 460 432 273 388 324 438 441 485 451 186
## 3 56 31 61 28 39 85 56 50 51 39 87 48 81 94 70
## 4 9 6 5 10 2 4 3 6 5 8 5 12 4 2 5
## 5 8 2 2 15 4 0 3 2 5 5 4 13 4 7 3
## 6 1 2 9 2 5 3 6 5 9 9 7 4 5 2 4
## CAAT CACA CACC CACG CACT CAGA CAGC CAGG CAGT CATA CATC CATG CATT CCAA CCAC
## 1 1201 720 1154 857 724 1280 1950 1390 1259 758 1363 877 1222 752 899
## 2 356 214 399 324 274 392 679 554 386 182 446 285 428 223 288
## 3 106 37 33 16 21 40 80 63 60 17 44 23 59 92 22
## 4 10 2 5 12 6 0 1 1 3 8 8 2 8 5 4
## 5 8 1 4 3 4 3 2 0 2 3 12 2 9 2 2
## 6 2 3 1 6 3 0 5 3 2 1 2 3 5 0 4
## CCAG CCAT CCCA CCCC CCCG CCCT CCGA CCGC CCGG CCGT CCTA CCTC CCTG CCTT CGAA
## 1 1906 1141 742 372 745 358 854 1364 1230 957 196 334 1212 688 1081
## 2 579 326 208 117 341 140 377 628 559 427 70 155 493 229 487
## 3 28 45 23 6 19 6 50 45 51 43 12 7 23 13 43
## 4 1 5 2 2 1 2 2 1 0 5 1 1 2 4 4
## 5 0 6 0 4 1 2 2 3 2 4 3 4 1 9 7
## 6 2 4 2 3 3 1 6 13 7 9 1 3 10 4 7
## CGAC CGAG CGAT CGCA CGCC CGCG CGCT CGGA CGGC CGGG CGGT CGTA CGTC CGTG CGTT
## 1 918 585 1377 1347 1669 1480 1302 940 1460 957 1361 722 881 891 1363
## 2 381 243 544 543 661 732 670 356 664 354 591 331 413 448 589
## 3 31 16 60 39 31 23 35 38 96 39 98 38 13 26 44
## 4 2 4 0 3 3 2 9 4 2 0 2 5 2 2 9
## 5 0 1 7 2 2 1 7 3 0 0 2 4 1 0 5
## 6 1 0 5 11 7 8 11 2 9 4 10 4 1 5 6
## CTAA CTAC CTAG CTAT CTCA CTCC CTCG CTCT CTGA CTGC CTGG CTGT CTTA CTTC CTTG
## 1 441 452 37 500 583 402 484 504 1221 1398 1793 915 512 1036 560
## 2 148 207 14 202 233 188 248 226 549 628 894 348 188 369 203
## 3 31 25 10 42 25 5 11 11 48 49 34 46 19 38 23
## 4 9 5 3 0 3 3 1 3 1 4 3 5 4 10 8
## 5 13 7 0 3 5 2 0 8 1 7 1 8 3 20 6
## 6 0 1 0 1 0 4 5 4 1 13 4 6 5 4 6
## CTTT GAAA GAAC GAAG GAAT GACA GACC GACG GACT GAGA GAGC GAGG GAGT GATA GATC
## 1 1287 1616 992 1166 1040 749 767 958 494 683 744 496 595 1146 990
## 2 414 648 426 569 335 261 270 434 164 202 319 126 191 373 424
## 3 48 84 32 47 53 24 23 29 18 16 34 28 18 52 17
## 4 16 3 0 5 5 2 1 5 1 2 3 1 2 2 1
## 5 19 2 6 3 8 2 0 4 1 1 4 0 2 2 2
## 6 6 5 4 7 1 0 3 0 1 1 2 0 0 4 2
## GATG GATT GCAA GCAC GCAG GCAT GCCA GCCC GCCG GCCT GCGA GCGC GCGG GCGT GCTA
## 1 1455 1173 1552 958 1617 1197 1782 698 1519 822 1421 1858 1647 1401 590
## 2 603 437 538 362 667 455 557 259 707 344 633 914 699 680 226
## 3 84 46 117 36 115 44 75 24 68 32 47 36 86 33 52
## 4 2 1 3 7 2 6 2 1 0 0 3 1 1 2 5
## 5 1 6 6 3 3 2 1 1 4 4 0 2 0 2 3
## 6 6 14 7 5 5 4 5 2 20 10 5 5 5 6 1
## GCTC GCTG GCTT GGAA GGAC GGAG GGAT GGCA GGCC GGCG GGCT GGGA GGGC GGGG GGGT
## 1 623 1873 1073 1206 495 556 1007 1549 651 1916 995 712 957 585 715
## 2 267 963 397 455 170 186 386 646 263 953 396 236 428 186 236
## 3 28 76 57 54 9 31 38 118 50 65 56 19 47 22 49
## 4 4 4 8 2 1 0 4 5 0 1 0 0 1 0 0
## 5 3 2 9 3 3 0 1 0 0 0 0 1 0 1 1
## 6 8 4 9 4 0 2 6 4 9 3 2 4 4 3 6
## GGTA GGTC GGTG GGTT GTAA GTAC GTAG GTAT GTCA GTCC GTCG GTCT GTGA GTGC GTGG
## 1 997 746 1316 1158 1074 613 523 719 1079 449 880 537 1088 1042 1057
## 2 370 304 580 439 384 256 188 252 350 156 383 270 483 416 502
## 3 112 15 91 63 119 33 55 39 23 7 20 16 61 54 57
## 4 1 2 1 1 8 2 2 11 2 3 0 4 5 4 0
## 5 3 1 0 1 8 2 3 1 2 3 4 2 2 2 1
## 6 12 1 9 8 5 4 3 9 1 0 3 0 5 9 3
## GTGT GTTA GTTC GTTG GTTT TAAA TAAC TAAG TAAT TACA TACC TACG TACT TAGA TAGC
## 1 705 1004 939 1278 1510 1345 1035 542 1172 580 907 732 574 308 624
## 2 243 366 342 511 524 436 345 197 330 229 318 307 235 84 210
## 3 35 49 34 75 50 75 72 34 75 35 42 24 33 13 44
## 4 1 3 9 3 5 12 5 2 20 4 3 4 7 4 0
## 5 1 3 4 3 2 13 12 4 10 6 3 3 3 1 6
## 6 5 2 4 9 10 2 2 2 4 2 2 4 3 1 1
## TAGG TAGT TATA TATC TATG TATT TCAA TCAC TCAG TCAT TCCA TCCC TCCG TCCT TCGA
## 1 252 395 517 1074 782 1170 1020 1018 1369 1139 881 516 819 559 873
## 2 81 118 139 418 266 365 372 372 431 314 297 188 391 213 334
## 3 32 28 15 60 35 61 75 16 46 30 22 10 33 7 19
## 4 1 2 3 8 4 8 12 11 2 8 7 3 4 1 0
## 5 0 1 9 3 2 4 7 4 3 12 5 1 2 6 4
## 6 2 2 5 6 7 8 3 1 2 2 2 3 8 2 0
## TCGC TCGG TCGT TCTA TCTC TCTG TCTT TGAA TGAC TGAG TGAT TGCA TGCC TGCG TGCT
## 1 1272 818 735 291 558 1072 922 1447 964 692 1499 1135 1534 1458 1203
## 2 501 328 309 131 268 509 284 679 375 222 592 374 577 626 520
## 3 22 60 20 17 3 38 27 86 29 34 56 71 68 60 65
## 4 8 4 2 9 2 5 15 4 2 3 1 6 0 1 11
## 5 4 2 3 12 8 8 22 4 2 3 1 9 4 0 5
## 6 13 10 0 0 0 7 3 3 1 1 13 5 17 5 7
## TGGA TGGC TGGG TGGT TGTA TGTC TGTG TGTT TTAA TTAC TTAG TTAT TTCA TTCC TTCG
## 1 951 1777 773 1394 591 753 802 1293 1229 962 440 1272 1385 961 1044
## 2 401 805 327 592 189 256 295 417 395 407 148 432 393 405 351
## 3 43 85 50 62 36 26 45 57 45 33 30 43 63 29 29
## 4 1 2 1 2 12 4 3 7 13 4 0 5 19 3 8
## 5 3 0 2 1 4 8 3 3 7 3 3 4 11 7 7
## 6 4 3 9 8 4 2 5 9 3 4 1 11 6 9 6
## TTCT TTGA TTGC TTGG TTGT TTTA TTTC TTTG TTTT
## 1 979 1058 1676 863 1001 1271 1513 1499 2142
## 2 356 438 593 297 293 440 470 469 576
## 3 30 57 76 93 33 32 40 74 38
## 4 14 2 6 0 14 10 17 6 11
## 5 25 3 9 1 7 6 21 7 19
## 6 4 7 9 11 4 4 7 9 6
# normalize tetranucleotide frequencies (contained in all columns BUT columns 1 to 5) by contig length
my_df_normalized <- my_df # make a copy of original data
my_df_normalized[,-(1:5)] <- my_df_normalized[,-(1:5)] / my_df_normalized$len
# subset "genome0"
my_df_normalized_0 <- my_df_normalized[my_df_normalized$genome == "genome0", ]
# perform PCA on the normalized tetranucleotide frequencies
pca_results <- prcomp(my_df_normalized_0[,-(1:5)])
# extract and plot proportion of variance explained by the first 20 PCs
variances <- summary(pca_results)$importance["Proportion of Variance", 1:20]
barplot(variances)
# most variance in the data is explained by the first 3 PCs
# plot the distribution of data points along PCs 1 and 2, and then 2 and 3
plot(pca_results$x[,1:2])
plot(pca_results$x[,2:3])
# PC 1 separates two outliers from the rest,
# while PC 2 and 3 separate data into several (3?) clouds.
# this indicates that contigs of the assembly do not have uniform nucleotide composition.
# instead, there are 3 (?) prevalent nucleotide composition patterns.
# let us partition all of our contigs according to their nucleotide composition
# and examine various assembly metrics in each group
# approach I: k-means clustering
# perform k-means clustering with different k in the range from 2 to 20
# and record total within- and between-cluster variance (measured as sum of squares)
within_vector <- c() # empty vector to store within-cluster variances
between_vector <- c() # empty vector to store between-cluster variances
for (n_centers in 2:20) {
# perform k-means clustering with k = n_centers
clustered_km <- kmeans(x = pca_results$x[,2:3],
centers = n_centers)
# extract and store within-cluster variance
within_i <- clustered_km$tot.withinss
within_vector <- c(within_vector, within_i)
# extract and store between-cluster variance
between_i <- clustered_km$betweenss
between_vector <- c(between_vector, between_i)
}
## Warning: did not converge in 10 iterations
# elbow plot, option 1
plot(x = 2:20,
y = within_vector)
# elbow plot, option 2
plot(x = 2:20,
y = within_vector / between_vector)
# elbow plot indicates that performing k-means clustering with k=3 is optimal
# does silhouette plot agree?
# silhouette plot
library(cluster)
silhouette_vector <- c() # empty vector to store average silhouette widths
for (n_centers in 2:20) {
# perform k-means clustering with k = n_centers
clustered_km <- kmeans(x = pca_results$x[,2:3],
centers = n_centers)
# extract assigned cluster identities
clustering_identities <- clustered_km$cluster
# calculate silhouettes for all data points
silhouettes <- silhouette(x = clustering_identities,
dist = dist(pca_results$x[,2:3]))
# average silhouette width across all data points
silhouette_i <- mean(silhouettes[,"sil_width"])
silhouette_vector <- c(silhouette_vector, silhouette_i)
}
# silhouette plot
plot(x = 2:20,
y = silhouette_vector)
# both elbow and silhouette plot suggest that performing k-means clustering with k=3 is optimal
# perform k-means clustering with k=3
clustered_km <- kmeans(x = pca_results$x[,2:3],
centers = 3)
# extract assigned cluster identities
cluster_identities <- clustered_km$cluster
# color the data by cluster
plot(pca_results$x[,2:3],
col = cluster_identities)
# compare %GC, contig length, and read coverage between the three clusters
boxplot(my_df_normalized_0$gc ~ cluster_identities)
boxplot(my_df_normalized_0$len ~ cluster_identities)
boxplot(my_df_normalized_0$cov ~ cluster_identities)
# 2 of the 3 groups of contigs have relatively low %GC and contain none of the longer contigs -
# these would have to be investigated further as they might be contaminants or poorly assembled contigs
# approach II: hierarchical clustering
# calculate Euclidean distances between data points
distances <- dist(pca_results$x[,2:3])
# perform hierarchical clustering with default parameters
clustered_hc <- hclust(distances)
# plot the dendrogram
plot(clustered_hc,
labels = FALSE) # labels do not fit on the screen anyways
# elbow plot
# as a proxy for within-cluster variance, we can use heights at which the data first split into 2, 3, 4 etc clusters
plot(x = 2:20,
y = rev(clustered_hc$height)[1:19]) # last height value corresponds to 2 (not 1) clusters, second-last to 3, third-last to 4, which is why we have to reverse the height vector
# elbow plot indicates that cutting the tree such that there are 5 or 6 clusters is optimal
# does silhouette plot agree?
# silhouette plot
silhouette_vector <- c() # empty vector to store average silhouette widths
for (n_centers in 2:20) {
# cut the tree such that the data are split into k = n_centers clusters
clustering_identities <- cutree(tree = clustered_hc,
k = n_centers)
# calculate silhouettes for all data points
silhouettes <- silhouette(x = clustering_identities,
dist = distances)
# average silhouette width across all data points
silhouette_i <- mean(silhouettes[,"sil_width"])
silhouette_vector <- c(silhouette_vector, silhouette_i)
}
# silhouette plot
plot(x = 2:20,
y = silhouette_vector)
# there are two maxima - the first one (k=2) likely separates highly divergent points from the rest,
# the second one (k=6), could be what we want; also, it agrees with our previous interpretation of the elbow plot
# cut the tree such that the data are split into k=6 clusters
cluster_identities <- cutree(tree = clustered_hc,
k = 6)
# color the data by cluster
plot(pca_results$x[,2:3],
col = cluster_identities)
# compare %GC, contig length, and read coverage between the three clusters
boxplot(my_df_normalized_0$gc ~ cluster_identities)
boxplot(my_df_normalized_0$len ~ cluster_identities)
boxplot(my_df_normalized_0$cov ~ cluster_identities)
# 5 of the 6 groups of contigs have relatively low %GC and contain few of the longer contigs -
# these would have to be investigated further as they might be contaminants or poorly assembled contigs.
# conveniently, many outliers are grouped separately from the main bulk of the data - these can be easily discarded