###
# the data set provided is a modified and trimmed version of the data taken from
# https://www.kaggle.com/rec3141/biol342-genome-data
# undergraduate students at the University of Alaska Fairbanks cultured, sequenced, and assembled
# the complete genome of microorganisms as part of the General Microbiology course.
# the data set contains tetranucleotide frequencies in each contig of each genome,
# along with some additional assembly statistics, such as contig length, %GC, and read coverage.
# in this exercise, let us explore the nucleotide composition of contigs in the first genome ("genome0")
# with the goal of identifying potential outliers / wrongly assemblied contigs / contamination from other species
###

# import data
my_df <- read.csv("/Users/bsierieb/Documents/GitHub/xdasi-bio-2021/docs/syllabus/week_15_PCA_Clustering/Class2/W15.C2_Exercise_Clustering_DATA.csv")

# explore data
str(my_df)
## 'data.frame':    67095 obs. of  261 variables:
##  $ contig: chr  "genome0_1" "genome0_10" "genome0_100" "genome0_1000" ...
##  $ genome: chr  "genome0" "genome0" "genome0" "genome0" ...
##  $ cov   : num  29.01 34.18 6.22 3.61 3.55 ...
##  $ len   : int  255873 96681 11623 1086 1085 1084 1084 1084 1083 1083 ...
##  $ gc    : num  0.499 0.524 0.478 0.38 0.374 ...
##  $ AAAA  : int  2122 737 129 8 13 3 7 24 3 27 ...
##  $ AAAC  : int  1416 546 76 7 5 5 4 17 4 8 ...
##  $ AAAG  : int  1386 528 107 1 7 4 4 16 2 15 ...
##  $ AAAT  : int  1575 495 57 11 7 3 6 13 10 17 ...
##  $ AACA  : int  1233 410 82 3 8 3 4 16 5 6 ...
##  $ AACC  : int  1109 411 62 4 7 5 5 5 1 3 ...
##  $ AACG  : int  1357 553 89 3 4 2 0 8 9 9 ...
##  $ AACT  : int  901 394 40 4 11 3 3 7 4 5 ...
##  $ AAGA  : int  966 384 49 3 7 5 5 7 3 9 ...
##  $ AAGC  : int  1074 499 87 4 4 4 3 12 4 7 ...
##  $ AAGG  : int  841 297 68 0 1 6 3 3 0 9 ...
##  $ AAGT  : int  735 300 54 6 5 2 6 8 3 12 ...
##  $ AATA  : int  1326 351 89 12 11 1 7 7 8 5 ...
##  $ AATC  : int  1148 395 55 13 10 4 4 5 5 5 ...
##  $ AATG  : int  1335 409 88 8 4 3 5 12 12 8 ...
##  $ AATT  : int  1179 361 59 13 8 2 3 9 9 13 ...
##  $ ACAA  : int  972 345 67 1 7 3 0 13 5 8 ...
##  $ ACAC  : int  580 189 33 3 3 3 3 4 4 4 ...
##  $ ACAG  : int  987 334 54 0 1 1 5 10 3 1 ...
##  $ ACAT  : int  743 246 24 7 6 1 5 4 7 7 ...
##  $ ACCA  : int  1293 354 67 4 4 1 5 5 3 1 ...
##  $ ACCC  : int  631 242 14 1 1 1 0 3 1 0 ...
##  $ ACCG  : int  1322 552 69 3 4 4 6 1 5 2 ...
##  $ ACCT  : int  691 250 10 5 5 5 1 2 2 3 ...
##  $ ACGA  : int  814 311 34 5 9 2 8 6 3 2 ...
##  $ ACGC  : int  1304 563 25 7 3 6 2 2 8 3 ...
##  $ ACGG  : int  1022 379 74 3 1 3 6 3 1 1 ...
##  $ ACGT  : int  764 365 25 9 1 1 2 6 4 11 ...
##  $ ACTA  : int  353 144 27 2 5 0 2 5 1 3 ...
##  $ ACTC  : int  458 205 14 3 0 2 2 1 1 0 ...
##  $ ACTG  : int  1170 454 40 2 6 3 2 8 3 6 ...
##  $ ACTT  : int  712 264 31 11 8 5 1 2 7 5 ...
##  $ AGAA  : int  1080 357 33 3 5 3 4 12 4 11 ...
##  $ AGAC  : int  591 203 25 4 2 2 3 2 0 2 ...
##  $ AGAG  : int  685 187 15 1 3 0 5 1 3 4 ...
##  $ AGAT  : int  881 315 45 1 2 2 1 3 3 6 ...
##  $ AGCA  : int  1293 459 84 4 3 1 6 10 6 7 ...
##  $ AGCC  : int  967 366 50 0 4 4 3 1 1 1 ...
##  $ AGCG  : int  1473 615 54 3 3 5 4 7 3 4 ...
##  $ AGCT  : int  659 267 57 1 6 2 3 10 6 4 ...
##  $ AGGA  : int  661 204 32 1 0 2 2 3 1 4 ...
##  $ AGGC  : int  917 361 61 1 0 2 5 2 0 0 ...
##  $ AGGG  : int  654 219 26 0 0 1 4 0 0 5 ...
##  $ AGGT  : int  747 274 72 1 1 6 2 5 0 6 ...
##  $ AGTA  : int  619 190 60 5 3 1 7 4 5 10 ...
##  $ AGTC  : int  565 186 12 1 1 0 2 0 1 2 ...
##  $ AGTG  : int  883 321 45 4 3 3 3 2 3 6 ...
##  $ AGTT  : int  917 298 44 3 3 2 6 5 2 3 ...
##  $ ATAA  : int  1350 380 61 9 11 2 2 7 10 8 ...
##  $ ATAC  : int  766 219 43 7 3 2 2 5 4 2 ...
##  $ ATAG  : int  579 143 22 2 2 2 3 3 3 1 ...
##  $ ATAT  : int  1052 303 47 7 9 5 7 8 5 6 ...
##  $ ATCA  : int  1499 513 56 9 8 1 5 3 5 5 ...
##  $ ATCC  : int  963 340 31 6 2 2 6 2 3 2 ...
##  $ ATCG  : int  1290 490 61 5 2 9 4 4 4 3 ...
##  $ ATCT  : int  823 340 28 10 15 2 2 4 8 2 ...
##  $ ATGA  : int  1235 398 39 2 4 5 9 7 7 8 ...
##  $ ATGC  : int  1214 460 85 4 0 3 8 3 10 3 ...
##  $ ATGG  : int  1182 432 56 3 3 6 3 10 1 4 ...
##  $ ATGT  : int  818 273 50 6 2 5 2 5 6 4 ...
##  $ ATTA  : int  1116 388 51 5 5 9 7 8 2 15 ...
##  $ ATTC  : int  881 324 39 8 5 9 5 0 7 6 ...
##  $ ATTG  : int  1261 438 87 5 4 7 7 4 5 6 ...
##  $ ATTT  : int  1486 441 48 12 13 4 14 8 10 9 ...
##  $ CAAA  : int  1416 485 81 4 4 5 4 14 2 11 ...
##  $ CAAC  : int  1157 451 94 2 7 2 3 7 6 5 ...
##  $ CAAG  : int  522 186 70 5 3 4 2 4 5 3 ...
##  $ CAAT  : int  1201 356 106 10 8 2 6 7 5 6 ...
##  $ CACA  : int  720 214 37 2 1 3 4 5 3 5 ...
##  $ CACC  : int  1154 399 33 5 4 1 2 5 5 0 ...
##  $ CACG  : int  857 324 16 12 3 6 6 0 2 4 ...
##  $ CACT  : int  724 274 21 6 4 3 2 1 4 1 ...
##  $ CAGA  : int  1280 392 40 0 3 0 3 5 3 1 ...
##  $ CAGC  : int  1950 679 80 1 2 5 5 10 5 1 ...
##  $ CAGG  : int  1390 554 63 1 0 3 7 2 0 1 ...
##  $ CAGT  : int  1259 386 60 3 2 2 4 1 2 3 ...
##  $ CATA  : int  758 182 17 8 3 1 4 5 6 5 ...
##  $ CATC  : int  1363 446 44 8 12 2 5 2 9 2 ...
##  $ CATG  : int  877 285 23 2 2 3 4 3 4 3 ...
##  $ CATT  : int  1222 428 59 8 9 5 6 2 9 7 ...
##  $ CCAA  : int  752 223 92 5 2 0 2 5 5 3 ...
##  $ CCAC  : int  899 288 22 4 2 4 3 0 3 0 ...
##  $ CCAG  : int  1906 579 28 1 0 2 3 2 4 2 ...
##  $ CCAT  : int  1141 326 45 5 6 4 5 2 5 0 ...
##  $ CCCA  : int  742 208 23 2 0 2 0 2 5 0 ...
##  $ CCCC  : int  372 117 6 2 4 3 0 1 0 0 ...
##  $ CCCG  : int  745 341 19 1 1 3 3 3 0 0 ...
##  $ CCCT  : int  358 140 6 2 2 1 0 1 1 0 ...
##  $ CCGA  : int  854 377 50 2 2 6 8 2 5 2 ...
##  $ CCGC  : int  1364 628 45 1 3 13 3 0 5 0 ...
##  $ CCGG  : int  1230 559 51 0 2 7 2 0 1 1 ...
##  $ CCGT  : int  957 427 43 5 4 9 6 3 3 1 ...
##  $ CCTA  : int  196 70 12 1 3 1 0 1 0 0 ...
##  $ CCTC  : int  334 155 7 1 4 3 0 0 2 0 ...
##   [list output truncated]
head(my_df)
##         contig  genome     cov    len     gc AAAA AAAC AAAG AAAT AACA AACC AACG
## 1    genome0_1 genome0 29.0114 255873 0.4995 2122 1416 1386 1575 1233 1109 1357
## 2   genome0_10 genome0 34.1807  96681 0.5237  737  546  528  495  410  411  553
## 3  genome0_100 genome0  6.2198  11623 0.4785  129   76  107   57   82   62   89
## 4 genome0_1000 genome0  3.6059   1086 0.3803    8    7    1   11    3    4    3
## 5 genome0_1001 genome0  3.5475   1085 0.3742   13    5    7    7    8    7    4
## 6 genome0_1002 genome0  5.0692   1084 0.5258    3    5    4    3    3    5    2
##   AACT AAGA AAGC AAGG AAGT AATA AATC AATG AATT ACAA ACAC ACAG ACAT ACCA ACCC
## 1  901  966 1074  841  735 1326 1148 1335 1179  972  580  987  743 1293  631
## 2  394  384  499  297  300  351  395  409  361  345  189  334  246  354  242
## 3   40   49   87   68   54   89   55   88   59   67   33   54   24   67   14
## 4    4    3    4    0    6   12   13    8   13    1    3    0    7    4    1
## 5   11    7    4    1    5   11   10    4    8    7    3    1    6    4    1
## 6    3    5    4    6    2    1    4    3    2    3    3    1    1    1    1
##   ACCG ACCT ACGA ACGC ACGG ACGT ACTA ACTC ACTG ACTT AGAA AGAC AGAG AGAT AGCA
## 1 1322  691  814 1304 1022  764  353  458 1170  712 1080  591  685  881 1293
## 2  552  250  311  563  379  365  144  205  454  264  357  203  187  315  459
## 3   69   10   34   25   74   25   27   14   40   31   33   25   15   45   84
## 4    3    5    5    7    3    9    2    3    2   11    3    4    1    1    4
## 5    4    5    9    3    1    1    5    0    6    8    5    2    3    2    3
## 6    4    5    2    6    3    1    0    2    3    5    3    2    0    2    1
##   AGCC AGCG AGCT AGGA AGGC AGGG AGGT AGTA AGTC AGTG AGTT ATAA ATAC ATAG ATAT
## 1  967 1473  659  661  917  654  747  619  565  883  917 1350  766  579 1052
## 2  366  615  267  204  361  219  274  190  186  321  298  380  219  143  303
## 3   50   54   57   32   61   26   72   60   12   45   44   61   43   22   47
## 4    0    3    1    1    1    0    1    5    1    4    3    9    7    2    7
## 5    4    3    6    0    0    0    1    3    1    3    3   11    3    2    9
## 6    4    5    2    2    2    1    6    1    0    3    2    2    2    2    5
##   ATCA ATCC ATCG ATCT ATGA ATGC ATGG ATGT ATTA ATTC ATTG ATTT CAAA CAAC CAAG
## 1 1499  963 1290  823 1235 1214 1182  818 1116  881 1261 1486 1416 1157  522
## 2  513  340  490  340  398  460  432  273  388  324  438  441  485  451  186
## 3   56   31   61   28   39   85   56   50   51   39   87   48   81   94   70
## 4    9    6    5   10    2    4    3    6    5    8    5   12    4    2    5
## 5    8    2    2   15    4    0    3    2    5    5    4   13    4    7    3
## 6    1    2    9    2    5    3    6    5    9    9    7    4    5    2    4
##   CAAT CACA CACC CACG CACT CAGA CAGC CAGG CAGT CATA CATC CATG CATT CCAA CCAC
## 1 1201  720 1154  857  724 1280 1950 1390 1259  758 1363  877 1222  752  899
## 2  356  214  399  324  274  392  679  554  386  182  446  285  428  223  288
## 3  106   37   33   16   21   40   80   63   60   17   44   23   59   92   22
## 4   10    2    5   12    6    0    1    1    3    8    8    2    8    5    4
## 5    8    1    4    3    4    3    2    0    2    3   12    2    9    2    2
## 6    2    3    1    6    3    0    5    3    2    1    2    3    5    0    4
##   CCAG CCAT CCCA CCCC CCCG CCCT CCGA CCGC CCGG CCGT CCTA CCTC CCTG CCTT CGAA
## 1 1906 1141  742  372  745  358  854 1364 1230  957  196  334 1212  688 1081
## 2  579  326  208  117  341  140  377  628  559  427   70  155  493  229  487
## 3   28   45   23    6   19    6   50   45   51   43   12    7   23   13   43
## 4    1    5    2    2    1    2    2    1    0    5    1    1    2    4    4
## 5    0    6    0    4    1    2    2    3    2    4    3    4    1    9    7
## 6    2    4    2    3    3    1    6   13    7    9    1    3   10    4    7
##   CGAC CGAG CGAT CGCA CGCC CGCG CGCT CGGA CGGC CGGG CGGT CGTA CGTC CGTG CGTT
## 1  918  585 1377 1347 1669 1480 1302  940 1460  957 1361  722  881  891 1363
## 2  381  243  544  543  661  732  670  356  664  354  591  331  413  448  589
## 3   31   16   60   39   31   23   35   38   96   39   98   38   13   26   44
## 4    2    4    0    3    3    2    9    4    2    0    2    5    2    2    9
## 5    0    1    7    2    2    1    7    3    0    0    2    4    1    0    5
## 6    1    0    5   11    7    8   11    2    9    4   10    4    1    5    6
##   CTAA CTAC CTAG CTAT CTCA CTCC CTCG CTCT CTGA CTGC CTGG CTGT CTTA CTTC CTTG
## 1  441  452   37  500  583  402  484  504 1221 1398 1793  915  512 1036  560
## 2  148  207   14  202  233  188  248  226  549  628  894  348  188  369  203
## 3   31   25   10   42   25    5   11   11   48   49   34   46   19   38   23
## 4    9    5    3    0    3    3    1    3    1    4    3    5    4   10    8
## 5   13    7    0    3    5    2    0    8    1    7    1    8    3   20    6
## 6    0    1    0    1    0    4    5    4    1   13    4    6    5    4    6
##   CTTT GAAA GAAC GAAG GAAT GACA GACC GACG GACT GAGA GAGC GAGG GAGT GATA GATC
## 1 1287 1616  992 1166 1040  749  767  958  494  683  744  496  595 1146  990
## 2  414  648  426  569  335  261  270  434  164  202  319  126  191  373  424
## 3   48   84   32   47   53   24   23   29   18   16   34   28   18   52   17
## 4   16    3    0    5    5    2    1    5    1    2    3    1    2    2    1
## 5   19    2    6    3    8    2    0    4    1    1    4    0    2    2    2
## 6    6    5    4    7    1    0    3    0    1    1    2    0    0    4    2
##   GATG GATT GCAA GCAC GCAG GCAT GCCA GCCC GCCG GCCT GCGA GCGC GCGG GCGT GCTA
## 1 1455 1173 1552  958 1617 1197 1782  698 1519  822 1421 1858 1647 1401  590
## 2  603  437  538  362  667  455  557  259  707  344  633  914  699  680  226
## 3   84   46  117   36  115   44   75   24   68   32   47   36   86   33   52
## 4    2    1    3    7    2    6    2    1    0    0    3    1    1    2    5
## 5    1    6    6    3    3    2    1    1    4    4    0    2    0    2    3
## 6    6   14    7    5    5    4    5    2   20   10    5    5    5    6    1
##   GCTC GCTG GCTT GGAA GGAC GGAG GGAT GGCA GGCC GGCG GGCT GGGA GGGC GGGG GGGT
## 1  623 1873 1073 1206  495  556 1007 1549  651 1916  995  712  957  585  715
## 2  267  963  397  455  170  186  386  646  263  953  396  236  428  186  236
## 3   28   76   57   54    9   31   38  118   50   65   56   19   47   22   49
## 4    4    4    8    2    1    0    4    5    0    1    0    0    1    0    0
## 5    3    2    9    3    3    0    1    0    0    0    0    1    0    1    1
## 6    8    4    9    4    0    2    6    4    9    3    2    4    4    3    6
##   GGTA GGTC GGTG GGTT GTAA GTAC GTAG GTAT GTCA GTCC GTCG GTCT GTGA GTGC GTGG
## 1  997  746 1316 1158 1074  613  523  719 1079  449  880  537 1088 1042 1057
## 2  370  304  580  439  384  256  188  252  350  156  383  270  483  416  502
## 3  112   15   91   63  119   33   55   39   23    7   20   16   61   54   57
## 4    1    2    1    1    8    2    2   11    2    3    0    4    5    4    0
## 5    3    1    0    1    8    2    3    1    2    3    4    2    2    2    1
## 6   12    1    9    8    5    4    3    9    1    0    3    0    5    9    3
##   GTGT GTTA GTTC GTTG GTTT TAAA TAAC TAAG TAAT TACA TACC TACG TACT TAGA TAGC
## 1  705 1004  939 1278 1510 1345 1035  542 1172  580  907  732  574  308  624
## 2  243  366  342  511  524  436  345  197  330  229  318  307  235   84  210
## 3   35   49   34   75   50   75   72   34   75   35   42   24   33   13   44
## 4    1    3    9    3    5   12    5    2   20    4    3    4    7    4    0
## 5    1    3    4    3    2   13   12    4   10    6    3    3    3    1    6
## 6    5    2    4    9   10    2    2    2    4    2    2    4    3    1    1
##   TAGG TAGT TATA TATC TATG TATT TCAA TCAC TCAG TCAT TCCA TCCC TCCG TCCT TCGA
## 1  252  395  517 1074  782 1170 1020 1018 1369 1139  881  516  819  559  873
## 2   81  118  139  418  266  365  372  372  431  314  297  188  391  213  334
## 3   32   28   15   60   35   61   75   16   46   30   22   10   33    7   19
## 4    1    2    3    8    4    8   12   11    2    8    7    3    4    1    0
## 5    0    1    9    3    2    4    7    4    3   12    5    1    2    6    4
## 6    2    2    5    6    7    8    3    1    2    2    2    3    8    2    0
##   TCGC TCGG TCGT TCTA TCTC TCTG TCTT TGAA TGAC TGAG TGAT TGCA TGCC TGCG TGCT
## 1 1272  818  735  291  558 1072  922 1447  964  692 1499 1135 1534 1458 1203
## 2  501  328  309  131  268  509  284  679  375  222  592  374  577  626  520
## 3   22   60   20   17    3   38   27   86   29   34   56   71   68   60   65
## 4    8    4    2    9    2    5   15    4    2    3    1    6    0    1   11
## 5    4    2    3   12    8    8   22    4    2    3    1    9    4    0    5
## 6   13   10    0    0    0    7    3    3    1    1   13    5   17    5    7
##   TGGA TGGC TGGG TGGT TGTA TGTC TGTG TGTT TTAA TTAC TTAG TTAT TTCA TTCC TTCG
## 1  951 1777  773 1394  591  753  802 1293 1229  962  440 1272 1385  961 1044
## 2  401  805  327  592  189  256  295  417  395  407  148  432  393  405  351
## 3   43   85   50   62   36   26   45   57   45   33   30   43   63   29   29
## 4    1    2    1    2   12    4    3    7   13    4    0    5   19    3    8
## 5    3    0    2    1    4    8    3    3    7    3    3    4   11    7    7
## 6    4    3    9    8    4    2    5    9    3    4    1   11    6    9    6
##   TTCT TTGA TTGC TTGG TTGT TTTA TTTC TTTG TTTT
## 1  979 1058 1676  863 1001 1271 1513 1499 2142
## 2  356  438  593  297  293  440  470  469  576
## 3   30   57   76   93   33   32   40   74   38
## 4   14    2    6    0   14   10   17    6   11
## 5   25    3    9    1    7    6   21    7   19
## 6    4    7    9   11    4    4    7    9    6
# normalize tetranucleotide frequencies (contained in all columns BUT columns 1 to 5) by contig length
my_df_normalized <- my_df # make a copy of original data
my_df_normalized[,-(1:5)] <- my_df_normalized[,-(1:5)] / my_df_normalized$len

# subset "genome0"
my_df_normalized_0 <- my_df_normalized[my_df_normalized$genome == "genome0", ]

# perform PCA on the normalized tetranucleotide frequencies
pca_results <- prcomp(my_df_normalized_0[,-(1:5)])

# extract and plot proportion of variance explained by the first 20 PCs
variances <- summary(pca_results)$importance["Proportion of Variance", 1:20]
barplot(variances)

# most variance in the data is explained by the first 3 PCs
# plot the distribution of data points along PCs 1 and 2, and then 2 and 3
plot(pca_results$x[,1:2])

plot(pca_results$x[,2:3])

# PC 1 separates two outliers from the rest,
# while PC 2 and 3 separate data into several (3?) clouds.
# this indicates that contigs of the assembly do not have uniform nucleotide composition.
# instead, there are 3 (?) prevalent nucleotide composition patterns.
# let us partition all of our contigs according to their nucleotide composition
# and examine various assembly metrics in each group

# approach I: k-means clustering
# perform k-means clustering with different k in the range from 2 to 20
# and record total within- and between-cluster variance (measured as sum of squares)
within_vector <- c() # empty vector to store within-cluster variances
between_vector <- c() # empty vector to store between-cluster variances
for (n_centers in 2:20) {
  # perform k-means clustering with k = n_centers
  clustered_km <- kmeans(x = pca_results$x[,2:3],
                         centers = n_centers)
  # extract and store within-cluster variance
  within_i <- clustered_km$tot.withinss
  within_vector <- c(within_vector, within_i)
  # extract and store between-cluster variance
  between_i <- clustered_km$betweenss
  between_vector <- c(between_vector, between_i)
}
## Warning: did not converge in 10 iterations
# elbow plot, option 1
plot(x = 2:20,
     y = within_vector)

# elbow plot, option 2
plot(x = 2:20,
     y = within_vector / between_vector)

# elbow plot indicates that performing k-means clustering with k=3 is optimal
# does silhouette plot agree?

# silhouette plot
library(cluster)
silhouette_vector <- c() # empty vector to store average silhouette widths
for (n_centers in 2:20) {
  # perform k-means clustering with k = n_centers
  clustered_km <- kmeans(x = pca_results$x[,2:3],
                         centers = n_centers)
  # extract assigned cluster identities
  clustering_identities <- clustered_km$cluster
  # calculate silhouettes for all data points
  silhouettes <- silhouette(x = clustering_identities,
                            dist = dist(pca_results$x[,2:3]))
  # average silhouette width across all data points
  silhouette_i <- mean(silhouettes[,"sil_width"])
  silhouette_vector <- c(silhouette_vector, silhouette_i)
}

# silhouette plot
plot(x = 2:20,
     y = silhouette_vector)

# both elbow and silhouette plot suggest that performing k-means clustering with k=3 is optimal

# perform k-means clustering with k=3
clustered_km <- kmeans(x = pca_results$x[,2:3],
                       centers = 3)

# extract assigned cluster identities
cluster_identities <- clustered_km$cluster

# color the data by cluster
plot(pca_results$x[,2:3],
     col = cluster_identities)

# compare %GC, contig length, and read coverage between the three clusters
boxplot(my_df_normalized_0$gc ~ cluster_identities)

boxplot(my_df_normalized_0$len ~ cluster_identities)

boxplot(my_df_normalized_0$cov ~ cluster_identities)

# 2 of the 3 groups of contigs have relatively low %GC and contain none of the longer contigs -
# these would have to be investigated further as they might be contaminants or poorly assembled contigs

# approach II: hierarchical clustering

# calculate Euclidean distances between data points
distances <- dist(pca_results$x[,2:3])

# perform hierarchical clustering with default parameters
clustered_hc <- hclust(distances)

# plot the dendrogram
plot(clustered_hc,
     labels = FALSE) # labels do not fit on the screen anyways

# elbow plot
# as a proxy for within-cluster variance, we can use heights at which the data first split into 2, 3, 4 etc clusters
plot(x = 2:20,
     y = rev(clustered_hc$height)[1:19]) # last height value corresponds to 2 (not 1) clusters, second-last to 3, third-last to 4, which is why we have to reverse the height vector

# elbow plot indicates that cutting the tree such that there are 5 or 6 clusters is optimal
# does silhouette plot agree?

# silhouette plot
silhouette_vector <- c() # empty vector to store average silhouette widths
for (n_centers in 2:20) {
  # cut the tree such that the data are split into k = n_centers clusters
  clustering_identities <- cutree(tree = clustered_hc,
                                  k = n_centers)
  # calculate silhouettes for all data points
  silhouettes <- silhouette(x = clustering_identities,
                            dist = distances)
  # average silhouette width across all data points
  silhouette_i <- mean(silhouettes[,"sil_width"])
  silhouette_vector <- c(silhouette_vector, silhouette_i)
}

# silhouette plot
plot(x = 2:20,
     y = silhouette_vector)

# there are two maxima - the first one (k=2) likely separates highly divergent points from the rest,
# the second one (k=6), could be what we want; also, it agrees with our previous interpretation of the elbow plot

# cut the tree such that the data are split into k=6 clusters
cluster_identities <- cutree(tree = clustered_hc,
                             k = 6)

# color the data by cluster
plot(pca_results$x[,2:3],
     col = cluster_identities)

# compare %GC, contig length, and read coverage between the three clusters
boxplot(my_df_normalized_0$gc ~ cluster_identities)

boxplot(my_df_normalized_0$len ~ cluster_identities)

boxplot(my_df_normalized_0$cov ~ cluster_identities)

# 5 of the 6 groups of contigs have relatively low %GC and contain few of the longer contigs -
# these would have to be investigated further as they might be contaminants or poorly assembled contigs.
# conveniently, many outliers are grouped separately from the main bulk of the data - these can be easily discarded