diff --git a/_freeze/index/execute-results/html.json b/_freeze/index/execute-results/html.json index 79d8066..d160318 100644 --- a/_freeze/index/execute-results/html.json +++ b/_freeze/index/execute-results/html.json @@ -1,8 +1,8 @@ { - "hash": "74659d655325373525014d24d14d2e8a", + "hash": "fc0f1af7330bc7d8fd538f7fc83e47a6", "result": { "engine": "knitr", - "markdown": "---\ntitle: Integrative Analysis of Multi-omic Data\nauthor:\n - name: Piero Palacios Bernuy\n orcid: 0000-0001-6729-4080\n corresponding: true\n email: p.palacios.bernuy@gmail.com\n roles:\n - Investigation\n - Bioinformatics\n - Deep learning\n - Visualization\nkeywords:\n - Genomic Ranges\n - Omics\n - Bioconductor\nabstract: |\n This document is part of a series of the analysis of Omics data. Especifically, here is showed how to analyze bulk RNA-Seq data with Bioconductor packages. Also, it's showcased how to make plots of the RNA data in the context of differentially gene expression and gene-sets. \nplain-language-summary: |\n This document have a example of the analysis of bulk RNA-Seq data.\nkey-points:\n - A guide to analyze GWAS public data.\n - A guide to analyze TCGA database.\ndate: last-modified\nbibliography: references.bib\ncitation:\n container-title: An open source portfolio\nnumber-sections: true\n---\n\n\n## Introduction\n\n\n\n## GWAS Catalog with a ChIP-Seq Experiment\n\nHere, we are gonna analyze the relation between transcription factor binding (ESRRA binding data) from a ChIP-Seq experiment and the genome-wide associations between DNA variants and phenotypes like diseases. For this task, we are gonna use a the `gwascat` package distributed by the **EMBL** (European Molecular Biology Laboratories).\n\n\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\nlibrary(tidyverse)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'ggplot2' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'tidyr' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'readr' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'purrr' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'dplyr' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'stringr' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'lubridate' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──\n✔ dplyr 1.1.4 ✔ readr 2.1.5\n✔ forcats 1.0.0 ✔ stringr 1.5.1\n✔ ggplot2 3.5.0 ✔ tibble 3.2.1\n✔ lubridate 1.9.3 ✔ tidyr 1.3.1\n✔ purrr 1.0.2 \n── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──\n✖ dplyr::filter() masks stats::filter()\n✖ dplyr::lag() masks stats::lag()\nℹ Use the conflicted package () to force all conflicts to become errors\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nlibrary(gwascat)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'gwascat' was built under R version 4.3.1\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\ngwascat loaded. Use makeCurrentGwascat() to extract current image.\n from EBI. The data folder of this package has some legacy extracts.\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nlibrary(GenomeInfoDb)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'GenomeInfoDb' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: BiocGenerics\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'BiocGenerics' was built under R version 4.3.1\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n\nAttaching package: 'BiocGenerics'\n\nThe following objects are masked from 'package:lubridate':\n\n intersect, setdiff, union\n\nThe following objects are masked from 'package:dplyr':\n\n combine, intersect, setdiff, union\n\nThe following objects are masked from 'package:stats':\n\n IQR, mad, sd, var, xtabs\n\nThe following objects are masked from 'package:base':\n\n anyDuplicated, aperm, append, as.data.frame, basename, cbind,\n colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,\n get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,\n match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,\n Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,\n table, tapply, union, unique, unsplit, which.max, which.min\n\nLoading required package: S4Vectors\nLoading required package: stats4\n\nAttaching package: 'S4Vectors'\n\nThe following objects are masked from 'package:lubridate':\n\n second, second<-\n\nThe following objects are masked from 'package:dplyr':\n\n first, rename\n\nThe following object is masked from 'package:tidyr':\n\n expand\n\nThe following object is masked from 'package:utils':\n\n findMatches\n\nThe following objects are masked from 'package:base':\n\n expand.grid, I, unname\n\nLoading required package: IRanges\n\nAttaching package: 'IRanges'\n\nThe following object is masked from 'package:lubridate':\n\n %within%\n\nThe following objects are masked from 'package:dplyr':\n\n collapse, desc, slice\n\nThe following object is masked from 'package:purrr':\n\n reduce\n\nThe following object is masked from 'package:grDevices':\n\n windows\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nlibrary(ERBS)\nlibrary(liftOver)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: GenomicRanges\nLoading required package: rtracklayer\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'rtracklayer' was built under R version 4.3.1\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: Homo.sapiens\nLoading required package: AnnotationDbi\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'AnnotationDbi' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: Biobase\nWelcome to Bioconductor\n\n Vignettes contain introductory material; view with\n 'browseVignettes()'. To cite Bioconductor, see\n 'citation(\"Biobase\")', and for packages 'citation(\"pkgname\")'.\n\n\nAttaching package: 'AnnotationDbi'\n\nThe following object is masked from 'package:dplyr':\n\n select\n\nLoading required package: OrganismDbi\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'OrganismDbi' was built under R version 4.3.1\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: GenomicFeatures\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'GenomicFeatures' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: GO.db\n\nLoading required package: org.Hs.eg.db\n\nLoading required package: TxDb.Hsapiens.UCSC.hg19.knownGene\n```\n\n\n:::\n:::\n\n\nFirst, we need to download the data, keep the 24 chromosomes (from 1 to Y) and, specify the sequence information from the GRCh38 human genome annotation.\n\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\ngwcat = get_cached_gwascat()\n\ngg = gwcat |> as_GRanges()\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\ndropping 45505 records that have NA for CHR_POS\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n1950 records have semicolon in CHR_POS; splitting and using first entry.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n3265 records have ' x ' in CHR_POS indicating multiple SNP effects, using first.\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\ngg = keepStandardChromosomes(gg, pruning.mode = \"coarse\")\n\n# seqlevelsStyle(gg) <- \"UCSC\"\n\nseqlevels(gg) <- seqlevels(gg) |> \n sortSeqlevels()\n\ndata(\"si.hs.38\")\n\nseqinfo(gg) <- si.hs.38\n```\n:::\n\n\nNow, let's plot a karyogram that will show the SNP's identified with significant associations with a phenotype. The SNP's in the GWAS catalog have a stringent criterion of significance and there has been a replication of the finding from a independent population.\n\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\nggbio::autoplot(gg, layout=\"karyogram\")\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nRegistered S3 method overwritten by 'GGally':\n method from \n +.gg ggplot2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nScale for x is already present.\nAdding another scale for x, which will replace the existing scale.\nScale for x is already present.\nAdding another scale for x, which will replace the existing scale.\n```\n\n\n:::\n\n::: {.cell-output-display}\n![](index_files/figure-html/unnamed-chunk-3-1.png){width=672}\n:::\n:::\n\n\nWe can see the peak data as a `GRanges` object: \n\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\ndata(\"GM12878\")\n\nGM12878\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\nGRanges object with 1873 ranges and 7 metadata columns:\n seqnames ranges strand | name score col\n | \n [1] chrX 1509355-1512462 * | 5 0 \n [2] chrX 26801422-26802448 * | 6 0 \n [3] chr19 11694102-11695359 * | 1 0 \n [4] chr19 4076893-4079276 * | 4 0 \n [5] chr3 53288568-53290767 * | 9 0 \n ... ... ... ... . ... ... ...\n [1869] chr19 11201120-11203985 * | 8701 0 \n [1870] chr19 2234920-2237370 * | 990 0 \n [1871] chr1 94311336-94313543 * | 4035 0 \n [1872] chr19 45690614-45691210 * | 10688 0 \n [1873] chr19 6110100-6111252 * | 2274 0 \n signalValue pValue qValue peak\n \n [1] 157.92 310.000 32 1991\n [2] 147.38 310.000 32 387\n [3] 99.71 311.660 32 861\n [4] 84.74 310.000 32 1508\n [5] 78.20 299.505 32 1772\n ... ... ... ... ...\n [1869] 8.65 7.281 0.26576 2496\n [1870] 8.65 26.258 1.99568 1478\n [1871] 8.65 12.511 1.47237 1848\n [1872] 8.65 6.205 0.00000 298\n [1873] 8.65 17.356 2.01323 496\n -------\n seqinfo: 93 sequences (1 circular) from hg19 genome\n```\n\n\n:::\n:::\n\n\nIf we see the bottom of the `GRanges` table, this experiment have the hg19 annotation from the human genome. To work on the GRCh38 annotation we need to lift-over with a `.chain` file. For this we can use the `AnnotationHub` package.\n\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\nlibrary(AnnotationHub)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'AnnotationHub' was built under R version 4.3.1\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: BiocFileCache\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'BiocFileCache' was built under R version 4.3.1\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: dbplyr\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'dbplyr' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n\nAttaching package: 'dbplyr'\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following objects are masked from 'package:dplyr':\n\n ident, sql\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n\nAttaching package: 'AnnotationHub'\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following object is masked from 'package:Biobase':\n\n cache\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following object is masked from 'package:rtracklayer':\n\n hubUrl\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nah <- AnnotationHub::AnnotationHub()\n\nquery(ah, c(\"hg19ToHg38.over.chain\"))\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\nAnnotationHub with 1 record\n# snapshotDate(): 2023-10-23\n# names(): AH14150\n# $dataprovider: UCSC\n# $species: Homo sapiens\n# $rdataclass: ChainFile\n# $rdatadateadded: 2014-12-15\n# $title: hg19ToHg38.over.chain.gz\n# $description: UCSC liftOver chain file from hg19 to hg38\n# $taxonomyid: 9606\n# $genome: hg19\n# $sourcetype: Chain\n# $sourceurl: http://hgdownload.cse.ucsc.edu/goldenpath/hg19/liftOver/hg19To...\n# $sourcesize: NA\n# $tags: c(\"liftOver\", \"chain\", \"UCSC\", \"genome\", \"homology\") \n# retrieve record with 'object[[\"AH14150\"]]' \n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nchain <- ah[[\"AH14150\"]]\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nloading from cache\n```\n\n\n:::\n:::\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\nGM12878 <- liftOver(GM12878, chain) |> \n unlist()\n\nseqlevelsStyle(GM12878) <- \"NCBI\"\n\nseqinfo(GM12878) <- si.hs.38\n\n\nseqlevelsStyle(GM12878) <- \"UCSC\"\nseqlevelsStyle(gg) <- \"UCSC\"\n```\n:::\n\n\nWe can find overlaps between the GWAS catalog and the ESRRA ChIP-Seq experiment but, there is a problem; the GWAS catalog is a collection of intervals that reports all significant SNPs and there can be duplications of SNPs associated to multiple phenotypes or the same SNP might be found for the same phenotype in different studies.\n\nWe can see the duplications with the `reduce` function from `IRanges` package:\n\n\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\n# duplicated loci\nlength(gg) - length(reduce(gg))\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n[1] 261160\n```\n\n\n:::\n:::\n\nWe can see that there are `261160` duplicated loci. Let's find the overlap between the *reduced* catalog and the ChIP-Seq experiment:\n\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\n#\nfo = findOverlaps(GM12878, reduce(gg))\nfo\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\nHits object with 613 hits and 0 metadata columns:\n queryHits subjectHits\n \n [1] 6 237757\n [2] 9 221130\n [3] 12 25060\n [4] 15 104699\n [5] 15 104700\n ... ... ...\n [609] 1928 246305\n [610] 1929 244698\n [611] 1929 244699\n [612] 1931 250380\n [613] 1931 250381\n -------\n queryLength: 1932 / subjectLength: 268560\n```\n\n\n:::\n:::\n\nWe can see 613 hits. Then, we are gonna eobtain the ranges from those hits, retrieve the phenotypes (DISEASE/TRAIT) and show the top 20 most common phenotypes with association to SNPs that lies on the ESRRA binding peaks.\n\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\nover_ranges <- reduce(gg)[subjectHits(fo)]\n\n\nii <- over_ranges |> \n as.data.frame() |> \n GenomicRanges::makeGRangesListFromDataFrame()\n\n\nphset = lapply(ii, function(x){\n \n # print(glue::glue(\"On range: {x}\"))\n unique(gg[which(gg %over% x)]$\"DISEASE/TRAIT\")\n \n})\n\ngwas_on_peaks <- phset |> \n enframe() |> \n unnest(value) |> \n dplyr::count(value) |> \n slice_max(n, n=20)\n\n\np <- gwas_on_peaks |> \n mutate(value = fct_reorder(value,n)) |> \n ggplot(aes(n,value, fill=value))+\n geom_col() +\n theme(legend.position = \"none\") +\n paletteer::scale_fill_paletteer_d(\"khroma::smoothrainbow\") +\n theme_minimal()\n\nbslib::card(plotly::ggplotly(p), full_screen = T)\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
\n
\n
\n\n
\n\n\n\n\n\n
\n```\n\n:::\n:::\n\n\nDistinct phenotypes identified on the peaks:\n\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\nlength(phset)\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n[1] 613\n```\n\n\n:::\n:::\n\nNow, how to do the inference of these phenotype on peaks of these b cells? We can use permutation on the genomic positions to test if the number of phenotypes found is due to chance or not.\n\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\nlibrary(ph525x)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: png\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: grid\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: VariantAnnotation\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'VariantAnnotation' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: MatrixGenerics\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'MatrixGenerics' was built under R version 4.3.1\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: matrixStats\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'matrixStats' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n\nAttaching package: 'matrixStats'\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following objects are masked from 'package:Biobase':\n\n anyMissing, rowMedians\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following object is masked from 'package:dplyr':\n\n count\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n\nAttaching package: 'MatrixGenerics'\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following objects are masked from 'package:matrixStats':\n\n colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,\n colCounts, colCummaxs, colCummins, colCumprods, colCumsums,\n colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,\n colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,\n colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,\n colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,\n colWeightedMeans, colWeightedMedians, colWeightedSds,\n colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,\n rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,\n rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,\n rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,\n rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,\n rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,\n rowWeightedMads, rowWeightedMeans, rowWeightedMedians,\n rowWeightedSds, rowWeightedVars\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following object is masked from 'package:Biobase':\n\n rowMedians\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: SummarizedExperiment\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'SummarizedExperiment' was built under R version 4.3.1\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: Rsamtools\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'Rsamtools' was built under R version 4.3.1\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: Biostrings\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'Biostrings' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: XVector\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'XVector' was built under R version 4.3.1\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n\nAttaching package: 'XVector'\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following object is masked from 'package:purrr':\n\n compact\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n\nAttaching package: 'Biostrings'\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following object is masked from 'package:grid':\n\n pattern\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following object is masked from 'package:base':\n\n strsplit\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n\nAttaching package: 'VariantAnnotation'\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following object is masked from 'package:stringr':\n\n fixed\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following object is masked from 'package:base':\n\n tabulate\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nlibrary(progress)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'progress' was built under R version 4.3.3\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nset.seed(123)\n\nn_iter = 100\n\npb <- progress_bar$new(format = \"(:spin) [:bar] :percent [Elapsed time: :elapsedfull || Estimated time remaining: :eta]\",\n total = n_iter,\n complete = \"=\", # Completion bar character\n incomplete = \" \", # Incomplete bar character\n current = \">\", # Current bar character\n clear = FALSE, # If TRUE, clears the bar when finish\n width = 100) \n\n\nrsc = sapply(1:n_iter, function(x) {\n pb$tick()\n length(findOverlaps(reposition(GM12878), reduce(gg))) |> \n suppressWarnings()\n \n})\n\n# compute prop with more overlaps than in observed data\nmean(rsc > length(fo))\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n[1] 0\n```\n\n\n:::\n:::\n\n\n\n\n## Explore the TCGA\n\nPlease check the script below the image to see how to explore and get insights from the TCGA (The Cancer Genome Atlas) database.\n\n\n{{< embed notebooks/TCGA.qmd#fig-genesmut >}}\n\n\n\n## Conclusion\n\n## References {.unnumbered}\n\n:::{#refs}\n\n:::", + "markdown": "---\ntitle: Integrative Analysis of Multi-omic Data\nauthor:\n - name: Piero Palacios Bernuy\n orcid: 0000-0001-6729-4080\n corresponding: true\n email: p.palacios.bernuy@gmail.com\n roles:\n - Investigation\n - Bioinformatics\n - Deep learning\n - Visualization\nkeywords:\n - Genomic Ranges\n - Omics\n - Bioconductor\nabstract: |\n This document is part of a series of the analysis of Omics data. Especifically, here is showed how to analyze bulk RNA-Seq data with Bioconductor packages. Also, it's showcased how to make plots of the RNA data in the context of differentially gene expression and gene-sets. \nplain-language-summary: |\n This document have a example of the analysis of bulk RNA-Seq data.\nkey-points:\n - A guide to analyze GWAS public data.\n - A guide to analyze TCGA database.\ndate: last-modified\nbibliography: references.bib\ncitation:\n container-title: An open source portfolio\nnumber-sections: true\n---\n\n\n## Introduction\n\n\n\n## GWAS Catalog with a ChIP-Seq Experiment\n\nHere, we are gonna analyze the relation between transcription factor binding (ESRRA binding data) from a ChIP-Seq experiment and the genome-wide associations between DNA variants and phenotypes like diseases. For this task, we are gonna use a the `gwascat` package distributed by the **EMBL** (European Molecular Biology Laboratories).\n\n\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\nlibrary(tidyverse)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'ggplot2' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'tidyr' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'readr' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'purrr' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'dplyr' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'stringr' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'lubridate' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──\n✔ dplyr 1.1.4 ✔ readr 2.1.5\n✔ forcats 1.0.0 ✔ stringr 1.5.1\n✔ ggplot2 3.5.0 ✔ tibble 3.2.1\n✔ lubridate 1.9.3 ✔ tidyr 1.3.1\n✔ purrr 1.0.2 \n── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──\n✖ dplyr::filter() masks stats::filter()\n✖ dplyr::lag() masks stats::lag()\nℹ Use the conflicted package () to force all conflicts to become errors\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nlibrary(gwascat)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'gwascat' was built under R version 4.3.1\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\ngwascat loaded. Use makeCurrentGwascat() to extract current image.\n from EBI. The data folder of this package has some legacy extracts.\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nlibrary(GenomeInfoDb)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'GenomeInfoDb' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: BiocGenerics\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'BiocGenerics' was built under R version 4.3.1\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n\nAttaching package: 'BiocGenerics'\n\nThe following objects are masked from 'package:lubridate':\n\n intersect, setdiff, union\n\nThe following objects are masked from 'package:dplyr':\n\n combine, intersect, setdiff, union\n\nThe following objects are masked from 'package:stats':\n\n IQR, mad, sd, var, xtabs\n\nThe following objects are masked from 'package:base':\n\n anyDuplicated, aperm, append, as.data.frame, basename, cbind,\n colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,\n get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,\n match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,\n Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,\n table, tapply, union, unique, unsplit, which.max, which.min\n\nLoading required package: S4Vectors\nLoading required package: stats4\n\nAttaching package: 'S4Vectors'\n\nThe following objects are masked from 'package:lubridate':\n\n second, second<-\n\nThe following objects are masked from 'package:dplyr':\n\n first, rename\n\nThe following object is masked from 'package:tidyr':\n\n expand\n\nThe following object is masked from 'package:utils':\n\n findMatches\n\nThe following objects are masked from 'package:base':\n\n expand.grid, I, unname\n\nLoading required package: IRanges\n\nAttaching package: 'IRanges'\n\nThe following object is masked from 'package:lubridate':\n\n %within%\n\nThe following objects are masked from 'package:dplyr':\n\n collapse, desc, slice\n\nThe following object is masked from 'package:purrr':\n\n reduce\n\nThe following object is masked from 'package:grDevices':\n\n windows\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nlibrary(ERBS)\nlibrary(liftOver)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: GenomicRanges\nLoading required package: rtracklayer\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'rtracklayer' was built under R version 4.3.1\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: Homo.sapiens\nLoading required package: AnnotationDbi\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'AnnotationDbi' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: Biobase\nWelcome to Bioconductor\n\n Vignettes contain introductory material; view with\n 'browseVignettes()'. To cite Bioconductor, see\n 'citation(\"Biobase\")', and for packages 'citation(\"pkgname\")'.\n\n\nAttaching package: 'AnnotationDbi'\n\nThe following object is masked from 'package:dplyr':\n\n select\n\nLoading required package: OrganismDbi\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'OrganismDbi' was built under R version 4.3.1\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: GenomicFeatures\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'GenomicFeatures' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: GO.db\n\nLoading required package: org.Hs.eg.db\n\nLoading required package: TxDb.Hsapiens.UCSC.hg19.knownGene\n```\n\n\n:::\n:::\n\n\nFirst, we need to download the data, keep the 24 chromosomes (from 1 to Y) and, specify the sequence information from the GRCh38 human genome annotation.\n\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\ngwcat = get_cached_gwascat()\n\ngg = gwcat |> as_GRanges()\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\ndropping 45505 records that have NA for CHR_POS\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n1950 records have semicolon in CHR_POS; splitting and using first entry.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n3265 records have ' x ' in CHR_POS indicating multiple SNP effects, using first.\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\ngg = keepStandardChromosomes(gg, pruning.mode = \"coarse\")\n\n# seqlevelsStyle(gg) <- \"UCSC\"\n\nseqlevels(gg) <- seqlevels(gg) |> \n sortSeqlevels()\n\ndata(\"si.hs.38\")\n\nseqinfo(gg) <- si.hs.38\n```\n:::\n\n\nNow, let's plot a karyogram that will show the SNP's identified with significant associations with a phenotype. The SNP's in the GWAS catalog have a stringent criterion of significance and there has been a replication of the finding from a independent population.\n\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\nggbio::autoplot(gg, layout=\"karyogram\")\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nRegistered S3 method overwritten by 'GGally':\n method from \n +.gg ggplot2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nScale for x is already present.\nAdding another scale for x, which will replace the existing scale.\nScale for x is already present.\nAdding another scale for x, which will replace the existing scale.\n```\n\n\n:::\n\n::: {.cell-output-display}\n![](index_files/figure-html/unnamed-chunk-3-1.png){width=672}\n:::\n:::\n\n\nWe can see the peak data as a `GRanges` object: \n\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\ndata(\"GM12878\")\n\nGM12878\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\nGRanges object with 1873 ranges and 7 metadata columns:\n seqnames ranges strand | name score col\n | \n [1] chrX 1509355-1512462 * | 5 0 \n [2] chrX 26801422-26802448 * | 6 0 \n [3] chr19 11694102-11695359 * | 1 0 \n [4] chr19 4076893-4079276 * | 4 0 \n [5] chr3 53288568-53290767 * | 9 0 \n ... ... ... ... . ... ... ...\n [1869] chr19 11201120-11203985 * | 8701 0 \n [1870] chr19 2234920-2237370 * | 990 0 \n [1871] chr1 94311336-94313543 * | 4035 0 \n [1872] chr19 45690614-45691210 * | 10688 0 \n [1873] chr19 6110100-6111252 * | 2274 0 \n signalValue pValue qValue peak\n \n [1] 157.92 310.000 32 1991\n [2] 147.38 310.000 32 387\n [3] 99.71 311.660 32 861\n [4] 84.74 310.000 32 1508\n [5] 78.20 299.505 32 1772\n ... ... ... ... ...\n [1869] 8.65 7.281 0.26576 2496\n [1870] 8.65 26.258 1.99568 1478\n [1871] 8.65 12.511 1.47237 1848\n [1872] 8.65 6.205 0.00000 298\n [1873] 8.65 17.356 2.01323 496\n -------\n seqinfo: 93 sequences (1 circular) from hg19 genome\n```\n\n\n:::\n:::\n\n\nIf we see the bottom of the `GRanges` table, this experiment have the hg19 annotation from the human genome. To work on the GRCh38 annotation we need to lift-over with a `.chain` file. For this we can use the `AnnotationHub` package.\n\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\nlibrary(AnnotationHub)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'AnnotationHub' was built under R version 4.3.1\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: BiocFileCache\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'BiocFileCache' was built under R version 4.3.1\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: dbplyr\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'dbplyr' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n\nAttaching package: 'dbplyr'\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following objects are masked from 'package:dplyr':\n\n ident, sql\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n\nAttaching package: 'AnnotationHub'\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following object is masked from 'package:Biobase':\n\n cache\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following object is masked from 'package:rtracklayer':\n\n hubUrl\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nah <- AnnotationHub::AnnotationHub()\n\nquery(ah, c(\"hg19ToHg38.over.chain\"))\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\nAnnotationHub with 1 record\n# snapshotDate(): 2023-10-23\n# names(): AH14150\n# $dataprovider: UCSC\n# $species: Homo sapiens\n# $rdataclass: ChainFile\n# $rdatadateadded: 2014-12-15\n# $title: hg19ToHg38.over.chain.gz\n# $description: UCSC liftOver chain file from hg19 to hg38\n# $taxonomyid: 9606\n# $genome: hg19\n# $sourcetype: Chain\n# $sourceurl: http://hgdownload.cse.ucsc.edu/goldenpath/hg19/liftOver/hg19To...\n# $sourcesize: NA\n# $tags: c(\"liftOver\", \"chain\", \"UCSC\", \"genome\", \"homology\") \n# retrieve record with 'object[[\"AH14150\"]]' \n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nchain <- ah[[\"AH14150\"]]\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nloading from cache\n```\n\n\n:::\n:::\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\nGM12878 <- liftOver(GM12878, chain) |> \n unlist()\n\nseqlevelsStyle(GM12878) <- \"NCBI\"\n\nseqinfo(GM12878) <- si.hs.38\n\n\nseqlevelsStyle(GM12878) <- \"UCSC\"\nseqlevelsStyle(gg) <- \"UCSC\"\n```\n:::\n\n\nWe can find overlaps between the GWAS catalog and the ESRRA ChIP-Seq experiment but, there is a problem; the GWAS catalog is a collection of intervals that reports all significant SNPs and there can be duplications of SNPs associated to multiple phenotypes or the same SNP might be found for the same phenotype in different studies.\n\nWe can see the duplications with the `reduce` function from `IRanges` package:\n\n\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\n# duplicated loci\nlength(gg) - length(reduce(gg))\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n[1] 261160\n```\n\n\n:::\n:::\n\nWe can see that there are `261160` duplicated loci. Let's find the overlap between the *reduced* catalog and the ChIP-Seq experiment:\n\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\n#\nfo = findOverlaps(GM12878, reduce(gg))\nfo\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\nHits object with 613 hits and 0 metadata columns:\n queryHits subjectHits\n \n [1] 6 237757\n [2] 9 221130\n [3] 12 25060\n [4] 15 104699\n [5] 15 104700\n ... ... ...\n [609] 1928 246305\n [610] 1929 244698\n [611] 1929 244699\n [612] 1931 250380\n [613] 1931 250381\n -------\n queryLength: 1932 / subjectLength: 268560\n```\n\n\n:::\n:::\n\nWe can see 613 hits. Then, we are gonna eobtain the ranges from those hits, retrieve the phenotypes (DISEASE/TRAIT) and show the top 20 most common phenotypes with association to SNPs that lies on the ESRRA binding peaks.\n\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\nover_ranges <- reduce(gg)[subjectHits(fo)]\n\n\nii <- over_ranges |> \n as.data.frame() |> \n GenomicRanges::makeGRangesListFromDataFrame()\n\n\nphset = lapply(ii, function(x){\n \n # print(glue::glue(\"On range: {x}\"))\n unique(gg[which(gg %over% x)]$\"DISEASE/TRAIT\")\n \n})\n\ngwas_on_peaks <- phset |> \n enframe() |> \n unnest(value) |> \n dplyr::count(value) |> \n slice_max(n, n=20)\n\n\np <- gwas_on_peaks |> \n mutate(value = fct_reorder(value,n)) |> \n ggplot(aes(n,value, fill=value))+\n geom_col() +\n theme(legend.position = \"none\") +\n paletteer::scale_fill_paletteer_d(\"khroma::smoothrainbow\") +\n theme_minimal()\n\nbslib::card(plotly::ggplotly(p), full_screen = T)\n```\n\n::: {.cell-output-display}\n\n```{=html}\n
\n
\n
\n\n
\n\n\n\n\n\n
\n```\n\n:::\n:::\n\n\nDistinct phenotypes identified on the peaks:\n\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\nlength(phset)\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n[1] 613\n```\n\n\n:::\n:::\n\nNow, how to do the inference of these phenotype on peaks of these b cells? We can use permutation on the genomic positions to test if the number of phenotypes found is due to chance or not.\n\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\nlibrary(ph525x)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: png\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: grid\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: VariantAnnotation\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'VariantAnnotation' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: MatrixGenerics\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'MatrixGenerics' was built under R version 4.3.1\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: matrixStats\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'matrixStats' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n\nAttaching package: 'matrixStats'\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following objects are masked from 'package:Biobase':\n\n anyMissing, rowMedians\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following object is masked from 'package:dplyr':\n\n count\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n\nAttaching package: 'MatrixGenerics'\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following objects are masked from 'package:matrixStats':\n\n colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,\n colCounts, colCummaxs, colCummins, colCumprods, colCumsums,\n colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,\n colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,\n colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,\n colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,\n colWeightedMeans, colWeightedMedians, colWeightedSds,\n colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,\n rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,\n rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,\n rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,\n rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,\n rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,\n rowWeightedMads, rowWeightedMeans, rowWeightedMedians,\n rowWeightedSds, rowWeightedVars\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following object is masked from 'package:Biobase':\n\n rowMedians\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: SummarizedExperiment\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'SummarizedExperiment' was built under R version 4.3.1\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: Rsamtools\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'Rsamtools' was built under R version 4.3.1\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: Biostrings\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'Biostrings' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: XVector\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'XVector' was built under R version 4.3.1\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n\nAttaching package: 'XVector'\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following object is masked from 'package:purrr':\n\n compact\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n\nAttaching package: 'Biostrings'\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following object is masked from 'package:grid':\n\n pattern\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following object is masked from 'package:base':\n\n strsplit\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n\nAttaching package: 'VariantAnnotation'\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following object is masked from 'package:stringr':\n\n fixed\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following object is masked from 'package:base':\n\n tabulate\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nlibrary(progress)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'progress' was built under R version 4.3.3\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nset.seed(123)\n\nn_iter = 100\n\npb <- progress_bar$new(format = \"(:spin) [:bar] :percent [Elapsed time: :elapsedfull || Estimated time remaining: :eta]\",\n total = n_iter,\n complete = \"=\", # Completion bar character\n incomplete = \" \", # Incomplete bar character\n current = \">\", # Current bar character\n clear = FALSE, # If TRUE, clears the bar when finish\n width = 100) \n\n\nrsc = sapply(1:n_iter, function(x) {\n pb$tick()\n length(findOverlaps(reposition(GM12878), reduce(gg))) |> \n suppressWarnings()\n \n})\n\n# compute prop with more overlaps than in observed data\nmean(rsc > length(fo))\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n[1] 0\n```\n\n\n:::\n:::\n\n\n\n\n## Explore the TCGA\n\nPlease check the script below the image to see how to explore and get insights from the TCGA (The Cancer Genome Atlas) database.\n\n\n{{< embed notebooks/TCGA.qmd#samplemap echo=true >}}\n\n\n\n## Conclusion\n\n## References {.unnumbered}\n\n:::{#refs}\n\n:::", "supporting": [ "index_files\\figure-html" ], diff --git a/_freeze/notebooks/TCGA/execute-results/html.json b/_freeze/notebooks/TCGA/execute-results/html.json index e38a873..4b549f2 100644 --- a/_freeze/notebooks/TCGA/execute-results/html.json +++ b/_freeze/notebooks/TCGA/execute-results/html.json @@ -1,8 +1,8 @@ { - "hash": "ff3a6c52e08d7ae5ee63eba846994abd", + "hash": "a8532235cd9faf5a024ab5c7a2a04228", "result": { "engine": "knitr", - "markdown": "---\ntitle: \"Integrative Analysis with TCGA Data\"\nsubtitle: \"Analysis of Mutation Data from The Cancer Genome Atlas (TCGA)\"\nformat: html\neditor: visual\n---\n\n::: {.cell .hidden}\n\n```{.r .cell-code .hidden}\n#| include: false\nlibrary(knitr)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'knitr' was built under R version 4.3.2\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\n#| include: false\nknitr::opts_chunk$set(\n warning = FALSE,\n message = FALSE,\n dpi = 180,\n fig.width = 15,\n fig.height = 8,\n echo = TRUE\n)\n\nlibrary(tidyverse)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'ggplot2' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'tidyr' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'readr' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'purrr' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'dplyr' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'stringr' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'lubridate' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──\n✔ dplyr 1.1.4 ✔ readr 2.1.5\n✔ forcats 1.0.0 ✔ stringr 1.5.1\n✔ ggplot2 3.5.0 ✔ tibble 3.2.1\n✔ lubridate 1.9.3 ✔ tidyr 1.3.1\n✔ purrr 1.0.2 \n── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──\n✖ dplyr::filter() masks stats::filter()\n✖ dplyr::lag() masks stats::lag()\nℹ Use the conflicted package () to force all conflicts to become errors\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\n#| include: false\ntheme_set(theme_minimal())\n```\n:::\n\n\n## Introduction\n\nThe Cancer Genome Atlas (TCGA) is a massive cancer genomics project compiling high-throughput multi-omics data on dozens of cancer types for [public access](https://www.cancer.gov/ccg/research/genome-sequencing/tcga).\n\nWe are gonna use the `curatedTCGAData` package to manipulate locally to multiple high-throughput datasets from the project. The package provides access to TCGA data that has been curated and stored as a *MultiAssayExperiment* object on the Bioconductor [ExperimentHub](https://bioconductor.org/packages/release/bioc/html/ExperimentHub.html).\n\nFirst, let's load the packages needed.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(curatedTCGAData)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: MultiAssayExperiment\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'MultiAssayExperiment' was built under R version 4.3.1\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: SummarizedExperiment\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'SummarizedExperiment' was built under R version 4.3.1\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: MatrixGenerics\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'MatrixGenerics' was built under R version 4.3.1\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: matrixStats\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'matrixStats' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n\nAttaching package: 'matrixStats'\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following object is masked from 'package:dplyr':\n\n count\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n\nAttaching package: 'MatrixGenerics'\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following objects are masked from 'package:matrixStats':\n\n colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,\n colCounts, colCummaxs, colCummins, colCumprods, colCumsums,\n colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,\n colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,\n colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,\n colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,\n colWeightedMeans, colWeightedMedians, colWeightedSds,\n colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,\n rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,\n rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,\n rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,\n rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,\n rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,\n rowWeightedMads, rowWeightedMeans, rowWeightedMedians,\n rowWeightedSds, rowWeightedVars\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: GenomicRanges\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: stats4\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: BiocGenerics\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'BiocGenerics' was built under R version 4.3.1\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n\nAttaching package: 'BiocGenerics'\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following objects are masked from 'package:lubridate':\n\n intersect, setdiff, union\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following objects are masked from 'package:dplyr':\n\n combine, intersect, setdiff, union\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following objects are masked from 'package:stats':\n\n IQR, mad, sd, var, xtabs\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following objects are masked from 'package:base':\n\n anyDuplicated, aperm, append, as.data.frame, basename, cbind,\n colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,\n get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,\n match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,\n Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,\n table, tapply, union, unique, unsplit, which.max, which.min\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: S4Vectors\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n\nAttaching package: 'S4Vectors'\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following objects are masked from 'package:lubridate':\n\n second, second<-\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following objects are masked from 'package:dplyr':\n\n first, rename\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following object is masked from 'package:tidyr':\n\n expand\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following object is masked from 'package:utils':\n\n findMatches\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following objects are masked from 'package:base':\n\n expand.grid, I, unname\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: IRanges\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n\nAttaching package: 'IRanges'\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following object is masked from 'package:lubridate':\n\n %within%\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following objects are masked from 'package:dplyr':\n\n collapse, desc, slice\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following object is masked from 'package:purrr':\n\n reduce\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following object is masked from 'package:grDevices':\n\n windows\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: GenomeInfoDb\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'GenomeInfoDb' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: Biobase\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWelcome to Bioconductor\n\n Vignettes contain introductory material; view with\n 'browseVignettes()'. To cite Bioconductor, see\n 'citation(\"Biobase\")', and for packages 'citation(\"pkgname\")'.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n\nAttaching package: 'Biobase'\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following object is masked from 'package:MatrixGenerics':\n\n rowMedians\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following objects are masked from 'package:matrixStats':\n\n anyMissing, rowMedians\n```\n\n\n:::\n\n```{.r .cell-code}\nlibrary(TCGAutils)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'TCGAutils' was built under R version 4.3.2\n```\n\n\n:::\n\n```{.r .cell-code}\nlibrary(MultiAssayExperiment)\n```\n:::\n\n\n## Download the Data\n\nTo download the data we need to use `curatedTCGAData`function. The first argument is a four letter disease (cancer) code (A complete list of disease codes used by the TCGA project are available on the [NCI Genomic Data Commons website](https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/tcga-study-abbreviations)), the second argument is a vector of data types we want to download. We need to specify `dry.run = FALSE` to download the data.\n\nIn this specific case, we are gonna work with RNA-Seq data, mutation data and methylation data from Rectum Adenocarcinoma (READ). The clinical data is included by default.\n\n\n::: {.cell}\n\n```{.r .cell-code}\n#| message: false\n#| warning: false\n\nreadData = curatedTCGAData(\"READ\", \n c(\"RNASeq2GeneNorm\", \"Mutation\", \"Methylation_methyl450\"), \n dry.run = FALSE, version = \"2.1.1\")\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWorking on: READ_Mutation-20160128\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nsee ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nloading from cache\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nrequire(\"RaggedExperiment\")\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'RaggedExperiment' was built under R version 4.3.1\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWorking on: READ_RNASeq2GeneNorm-20160128\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nsee ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nloading from cache\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWorking on: READ_Methylation_methyl450-20160128\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nsee ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nloading from cache\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nrequire(\"rhdf5\")\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'rhdf5' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nsee ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nloading from cache\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: HDF5Array\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'HDF5Array' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: DelayedArray\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'DelayedArray' was built under R version 4.3.1\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: Matrix\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'Matrix' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n\nAttaching package: 'Matrix'\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following object is masked from 'package:S4Vectors':\n\n expand\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following objects are masked from 'package:tidyr':\n\n expand, pack, unpack\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: S4Arrays\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'S4Arrays' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: abind\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n\nAttaching package: 'S4Arrays'\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following object is masked from 'package:abind':\n\n abind\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following object is masked from 'package:base':\n\n rowsum\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: SparseArray\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'SparseArray' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n\nAttaching package: 'DelayedArray'\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following object is masked from 'package:purrr':\n\n simplify\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following objects are masked from 'package:base':\n\n apply, scale, sweep\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n\nAttaching package: 'HDF5Array'\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following object is masked from 'package:rhdf5':\n\n h5ls\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWorking on: READ_colData-20160128\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nsee ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nloading from cache\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWorking on: READ_sampleMap-20160128\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nsee ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nloading from cache\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWorking on: READ_metadata-20160128\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nsee ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nloading from cache\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nharmonizing input:\n removing 1903 sampleMap rows not in names(experiments)\n removing 2 colData rownames not in sampleMap 'primary'\n```\n\n\n:::\n\n```{.r .cell-code}\n#| message: false\n#| warning: false\n\nreadData # this is a MultiAssayExperiment object\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\nA MultiAssayExperiment object of 3 listed\n experiments with user-defined names and respective classes.\n Containing an ExperimentList class object of length 3:\n [1] READ_Mutation-20160128: RaggedExperiment with 22075 rows and 69 columns\n [2] READ_RNASeq2GeneNorm-20160128: SummarizedExperiment with 18115 rows and 177 columns\n [3] READ_Methylation_methyl450-20160128: SummarizedExperiment with 485577 rows and 106 columns\nFunctionality:\n experiments() - obtain the ExperimentList instance\n colData() - the primary/phenotype DataFrame\n sampleMap() - the sample coordination DataFrame\n `$`, `[`, `[[` - extract colData columns, subset, or experiment\n *Format() - convert into a long or wide DataFrame\n assays() - convert ExperimentList to a SimpleList of matrices\n exportClass() - save data to flat files\n```\n\n\n:::\n:::\n\n\nWe can see which patients have data for each assay. The assay column gives the experiment type, the primary column gives the unique patient ID and the colname gives the sample ID used as a identifier within a given experiment.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nsampleMap(readData)\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\nDataFrame with 352 rows and 3 columns\n assay primary colname\n \n1 READ_Methylation_methyl450-20160128 TCGA-AF-2687 TCGA-AF-2687-01A-02D..\n2 READ_Methylation_methyl450-20160128 TCGA-AF-2690 TCGA-AF-2690-01A-02D..\n3 READ_Methylation_methyl450-20160128 TCGA-AF-2693 TCGA-AF-2693-01A-02D..\n4 READ_Methylation_methyl450-20160128 TCGA-AF-3911 TCGA-AF-3911-01A-01D..\n5 READ_Methylation_methyl450-20160128 TCGA-AF-4110 TCGA-AF-4110-01A-02D..\n... ... ... ...\n348 READ_RNASeq2GeneNorm-20160128 TCGA-AG-A02G TCGA-AG-A02G-01\n349 READ_RNASeq2GeneNorm-20160128 TCGA-AG-A02N TCGA-AG-A02N-01\n350 READ_RNASeq2GeneNorm-20160128 TCGA-AG-A02X TCGA-AG-A02X-01\n351 READ_RNASeq2GeneNorm-20160128 TCGA-AG-A032 TCGA-AG-A032-01\n352 READ_RNASeq2GeneNorm-20160128 TCGA-AG-A036 TCGA-AG-A036-01\n```\n\n\n:::\n:::\n\n\nNot all patients have data for all assays, and some of them can have multiple data entries for one or more experiment type. This may correspond to multiple biopsies or matched tumor and normal samples from an individual patient.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nsampleMap(readData) |> \n as_tibble() |> \n pull(primary) |> \n table() |> \n table()\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n\n 1 2 3 4 \n 5 147 7 8 \n```\n\n\n:::\n:::\n\n\nWe can see the metadata of the patients with `colData`. Note that there are more than 2000 columns of data per patient (not necessarily complete).\n\n\n::: {.cell}\n\n```{.r .cell-code}\nclin = colData(readData) |> \n as_tibble()\ndim(clin)\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n[1] 167 2260\n```\n\n\n:::\n\n```{.r .cell-code}\nhead(colnames(clin), 10) \n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n [1] \"patientID\" \"years_to_birth\" \"vital_status\" \n [4] \"days_to_death\" \"days_to_last_followup\" \"tumor_tissue_site\" \n [7] \"pathologic_stage\" \"pathology_T_stage\" \"pathology_N_stage\" \n[10] \"pathology_M_stage\" \n```\n\n\n:::\n:::\n\n\nAs an example, for rectum adenocarcinoma, we can see the tumor stage.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nclin |> \n pull(pathology_T_stage) |> \n table()\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n\n t1 t2 t3 t4 t4a t4b \n 9 28 114 5 8 1 \n```\n\n\n:::\n:::\n\n\nStage T4 have subgroups. To simplify the analysis, let's combine all T4 tumors.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nclin <- clin |> \n mutate(t_stage = case_when(\n pathology_T_stage %in% c(\"t4\",\"t4a\",\"t4b\") ~ \"t4\",\n .default = pathology_T_stage\n ))\n\nclin$t_stage |> \n table()\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n\n t1 t2 t3 t4 \n 9 28 114 14 \n```\n\n\n:::\n:::\n\n\nAlso, we can see the vital status (alive=0, deceased=1)\n\n\n::: {.cell}\n\n```{.r .cell-code}\nclin$vital_status |> \n table()\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n\n 0 1 \n139 28 \n```\n\n\n:::\n:::\n\n\nOr combine tumor status and vital status.\n\n\n::: {.cell}\n\n```{.r .cell-code}\ntable(clin$t_stage, clin$vital_status)\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n \n 0 1\n t1 9 0\n t2 24 4\n t3 96 18\n t4 9 5\n```\n\n\n:::\n:::\n\n\n# Analyzing Mutation Data\n\nLet's begin analyzing the mutation data. Below is the code to retrieve the mutation data.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nmut_data = readData[[1]]\n\nmut_data\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\nclass: RaggedExperiment \ndim: 22075 69 \nassays(34): Hugo_Symbol Entrez_Gene_Id ... COSMIC_Gene Drug_Target\nrownames: NULL\ncolnames(69): TCGA-AF-2689-01A-01W-0831-10 TCGA-AF-2691-01A-01W-0831-10\n ... TCGA-AG-A032-01 TCGA-AG-A036-01\ncolData names(0):\n```\n\n\n:::\n:::\n\n\nFrom the inspection of the sample IDs we can see that the mutation colnames match to the **primary** column from he clinical data.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nmut_sample_ids = colnames(mut_data)\nhead(mut_sample_ids)\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n[1] \"TCGA-AF-2689-01A-01W-0831-10\" \"TCGA-AF-2691-01A-01W-0831-10\"\n[3] \"TCGA-AF-2692-01A-01W-0831-10\" \"TCGA-AF-3400-01A-01W-0831-10\"\n[5] \"TCGA-AF-3913-01\" \"TCGA-AG-3574-01A-01W-0831-10\"\n```\n\n\n:::\n\n```{.r .cell-code}\nhead(clin$patientID)\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n[1] \"TCGA-AF-2687\" \"TCGA-AF-2689\" \"TCGA-AF-2690\" \"TCGA-AF-2691\" \"TCGA-AF-2692\"\n[6] \"TCGA-AF-2693\"\n```\n\n\n:::\n:::\n\n\nWe need to manipulate these by substracting 12 characters.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nmut_sample_ids <- mut_sample_ids |> \n stringr::str_sub(1,12)\n\nall(mut_sample_ids %in% clin$patientID)\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n[1] TRUE\n```\n\n\n:::\n:::\n\n\nIs important to note that the data stored in `assay(mut_data)` is difficult to work with because is a sparse matrix that has a row for each `GRanges` with a mutation in at least one sample.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nassay(mut_data)[1:3,1:3]\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n TCGA-AF-2689-01A-01W-0831-10 TCGA-AF-2691-01A-01W-0831-10\nX:54241715-54241716:+ \"WNK3\" NA \n7:111899511:+ \"IFRD1\" NA \n7:113309556:+ \"PPP1R3A\" NA \n TCGA-AF-2692-01A-01W-0831-10\nX:54241715-54241716:+ NA \n7:111899511:+ NA \n7:113309556:+ NA \n```\n\n\n:::\n\n```{.r .cell-code}\n# Is a sparse matrix\n\nassay(mut_data)[1,] |> \n table(useNA=\"ifany\")\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n\nWNK3 \n 1 68 \n```\n\n\n:::\n:::\n\n\nWe can get more information if we look at the mutation information for each patient.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nmut_assay = mut_data@assays\n\nmut_assay # GRangesList\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\nGRangesList object of length 69:\n$`TCGA-AF-2689-01A-01W-0831-10`\nGRanges object with 64 ranges and 34 metadata columns:\n seqnames ranges strand | Hugo_Symbol Entrez_Gene_Id\n | \n [1] X 54241715-54241716 + | WNK3 65267\n [2] 7 111899511 + | IFRD1 3475\n [3] 7 113309556 + | PPP1R3A 5506\n [4] 7 128146325 + | FAM71F1 84691\n [5] 7 156447624 + | NOM1 64434\n ... ... ... ... . ... ...\n [60] 10 50616059 + | OGDHL 55753\n [61] 10 96343335 + | HELLS 3070\n [62] 5 139173166 + | PSD2 84249\n [63] 5 147800932 + | FBXO38 81545\n [64] 5 54365356 + | GZMK 3003\n Center NCBI_Build Variant_Classification Variant_Type\n \n [1] hgsc.bcm.edu 36 Frame_Shift_Ins INS\n [2] hgsc.bcm.edu 36 Missense_Mutation SNP\n [3] hgsc.bcm.edu 36 Missense_Mutation SNP\n [4] hgsc.bcm.edu 36 Silent SNP\n [5] hgsc.bcm.edu 36 Missense_Mutation SNP\n ... ... ... ... ...\n [60] hgsc.bcm.edu 36 Silent SNP\n [61] hgsc.bcm.edu 36 Silent SNP\n [62] hgsc.bcm.edu 36 Nonsense_Mutation SNP\n [63] hgsc.bcm.edu 36 Silent SNP\n [64] hgsc.bcm.edu 36 Missense_Mutation SNP\n Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 dbSNP_RS\n \n [1] - - T novel\n [2] A A G novel\n [3] T T C novel\n [4] G G A novel\n [5] A A G novel\n ... ... ... ... ...\n [60] G G A novel\n [61] T T C novel\n [62] C C T novel\n [63] G G A novel\n [64] T T A novel\n dbSNP_Val_Status Matched_Norm_Sample_Barcode Match_Norm_Seq_Allele1\n \n [1] unknown TCGA-AF-2689-10A-01W.. -\n [2] unknown TCGA-AF-2689-10A-01W.. A\n [3] unknown TCGA-AF-2689-10A-01W.. T\n [4] unknown TCGA-AF-2689-10A-01W.. G\n [5] unknown TCGA-AF-2689-10A-01W.. A\n ... ... ... ...\n [60] unknown TCGA-AF-2689-10A-01W.. G\n [61] unknown TCGA-AF-2689-10A-01W.. T\n [62] unknown TCGA-AF-2689-10A-01W.. C\n [63] unknown TCGA-AF-2689-10A-01W.. G\n [64] unknown TCGA-AF-2689-10A-01W.. T\n Match_Norm_Seq_Allele2 Tumor_Validation_Allele1 Tumor_Validation_Allele2\n \n [1] - - T\n [2] A A G\n [3] T T C\n [4] G . .\n [5] A A G\n ... ... ... ...\n [60] G . .\n [61] T . .\n [62] C C T\n [63] G G A\n [64] T T A\n Match_Norm_Validation_Allele1 Match_Norm_Validation_Allele2\n \n [1] - -\n [2] A A\n [3] T T\n [4] . .\n [5] A A\n ... ... ...\n [60] . .\n [61] . .\n [62] C C\n [63] G G\n [64] T T\n Verification_Status Validation_Status Mutation_Status Sequencing_Phase\n \n [1] Unknown Valid Somatic Phase_I\n [2] Unknown Valid Somatic Phase_I\n [3] Unknown Valid Somatic Phase_I\n [4] Unknown Unknown Somatic Phase_I\n [5] Unknown Valid Somatic Phase_I\n ... ... ... ... ...\n [60] Unknown Unknown Somatic Phase_I\n [61] Unknown Unknown Somatic Phase_I\n [62] Unknown Valid Somatic Phase_I\n [63] Unknown Valid Somatic Phase_I\n [64] Unknown Valid Somatic Phase_I\n Sequence_Source Validation_Method Score BAM_file Sequencer\n \n [1] Capture 454 SOLID\n [2] Capture 454 SOLID\n [3] Capture 454 SOLID\n [4] Capture . SOLID\n [5] Capture 454 SOLID\n ... ... ... ... ... ...\n [60] Capture . SOLID\n [61] Capture . SOLID\n [62] Capture 454 SOLID\n [63] Capture 454 SOLID\n [64] Capture 454 SOLID\n TranscriptID Exon ChromChange AAChange COSMIC_Codon\n \n [1] NM_001002838 exon23 c.4999_5000insA p.L1667fs .\n [2] NM_001550 exon10 c.A1043G p.E348G .\n [3] NM_002711 exon2 c.A838G p.T280A .\n [4] NM_032599 exon3 c.G639A p.T213T .\n [5] NM_138400 exon5 c.A1651G p.T551A .\n ... ... ... ... ... ...\n [60] NM_001143996 exon18 c.C2286T p.S762S .\n [61] NM_018063 exon18 c.T2061C p.D687D .\n [62] NM_032289 exon3 c.C460T p.Q154X .\n [63] NM_205836 exon21 c.G3327A p.R1109R .\n [64] NM_002104 exon5 c.T640A p.S214T .\n COSMIC_Gene Drug_Target\n \n [1] . .\n [2] . .\n [3] . .\n [4] . .\n [5] . .\n ... ... ...\n [60] . .\n [61] . .\n [62] . .\n [63] . .\n [64] . .\n -------\n seqinfo: 24 sequences from NCBI36 genome; no seqlengths\n\n...\n<68 more elements>\n```\n\n\n:::\n\n```{.r .cell-code}\nmut_assay |> class()\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n[1] \"CompressedGRangesList\"\nattr(,\"package\")\n[1] \"GenomicRanges\"\n```\n\n\n:::\n\n```{.r .cell-code}\nlength(mut_assay)\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n[1] 69\n```\n\n\n:::\n:::\n\n\nLet's inspect the data from the first patient. We can see from the metadata information the Hugo Symbol, mutation status and predicted effect of each mutation at variant classification.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nmut_assay[[1]]\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\nGRanges object with 64 ranges and 34 metadata columns:\n seqnames ranges strand | Hugo_Symbol Entrez_Gene_Id\n | \n [1] X 54241715-54241716 + | WNK3 65267\n [2] 7 111899511 + | IFRD1 3475\n [3] 7 113309556 + | PPP1R3A 5506\n [4] 7 128146325 + | FAM71F1 84691\n [5] 7 156447624 + | NOM1 64434\n ... ... ... ... . ... ...\n [60] 10 50616059 + | OGDHL 55753\n [61] 10 96343335 + | HELLS 3070\n [62] 5 139173166 + | PSD2 84249\n [63] 5 147800932 + | FBXO38 81545\n [64] 5 54365356 + | GZMK 3003\n Center NCBI_Build Variant_Classification Variant_Type\n \n [1] hgsc.bcm.edu 36 Frame_Shift_Ins INS\n [2] hgsc.bcm.edu 36 Missense_Mutation SNP\n [3] hgsc.bcm.edu 36 Missense_Mutation SNP\n [4] hgsc.bcm.edu 36 Silent SNP\n [5] hgsc.bcm.edu 36 Missense_Mutation SNP\n ... ... ... ... ...\n [60] hgsc.bcm.edu 36 Silent SNP\n [61] hgsc.bcm.edu 36 Silent SNP\n [62] hgsc.bcm.edu 36 Nonsense_Mutation SNP\n [63] hgsc.bcm.edu 36 Silent SNP\n [64] hgsc.bcm.edu 36 Missense_Mutation SNP\n Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 dbSNP_RS\n \n [1] - - T novel\n [2] A A G novel\n [3] T T C novel\n [4] G G A novel\n [5] A A G novel\n ... ... ... ... ...\n [60] G G A novel\n [61] T T C novel\n [62] C C T novel\n [63] G G A novel\n [64] T T A novel\n dbSNP_Val_Status Matched_Norm_Sample_Barcode Match_Norm_Seq_Allele1\n \n [1] unknown TCGA-AF-2689-10A-01W.. -\n [2] unknown TCGA-AF-2689-10A-01W.. A\n [3] unknown TCGA-AF-2689-10A-01W.. T\n [4] unknown TCGA-AF-2689-10A-01W.. G\n [5] unknown TCGA-AF-2689-10A-01W.. A\n ... ... ... ...\n [60] unknown TCGA-AF-2689-10A-01W.. G\n [61] unknown TCGA-AF-2689-10A-01W.. T\n [62] unknown TCGA-AF-2689-10A-01W.. C\n [63] unknown TCGA-AF-2689-10A-01W.. G\n [64] unknown TCGA-AF-2689-10A-01W.. T\n Match_Norm_Seq_Allele2 Tumor_Validation_Allele1 Tumor_Validation_Allele2\n \n [1] - - T\n [2] A A G\n [3] T T C\n [4] G . .\n [5] A A G\n ... ... ... ...\n [60] G . .\n [61] T . .\n [62] C C T\n [63] G G A\n [64] T T A\n Match_Norm_Validation_Allele1 Match_Norm_Validation_Allele2\n \n [1] - -\n [2] A A\n [3] T T\n [4] . .\n [5] A A\n ... ... ...\n [60] . .\n [61] . .\n [62] C C\n [63] G G\n [64] T T\n Verification_Status Validation_Status Mutation_Status Sequencing_Phase\n \n [1] Unknown Valid Somatic Phase_I\n [2] Unknown Valid Somatic Phase_I\n [3] Unknown Valid Somatic Phase_I\n [4] Unknown Unknown Somatic Phase_I\n [5] Unknown Valid Somatic Phase_I\n ... ... ... ... ...\n [60] Unknown Unknown Somatic Phase_I\n [61] Unknown Unknown Somatic Phase_I\n [62] Unknown Valid Somatic Phase_I\n [63] Unknown Valid Somatic Phase_I\n [64] Unknown Valid Somatic Phase_I\n Sequence_Source Validation_Method Score BAM_file Sequencer\n \n [1] Capture 454 SOLID\n [2] Capture 454 SOLID\n [3] Capture 454 SOLID\n [4] Capture . SOLID\n [5] Capture 454 SOLID\n ... ... ... ... ... ...\n [60] Capture . SOLID\n [61] Capture . SOLID\n [62] Capture 454 SOLID\n [63] Capture 454 SOLID\n [64] Capture 454 SOLID\n TranscriptID Exon ChromChange AAChange COSMIC_Codon\n \n [1] NM_001002838 exon23 c.4999_5000insA p.L1667fs .\n [2] NM_001550 exon10 c.A1043G p.E348G .\n [3] NM_002711 exon2 c.A838G p.T280A .\n [4] NM_032599 exon3 c.G639A p.T213T .\n [5] NM_138400 exon5 c.A1651G p.T551A .\n ... ... ... ... ... ...\n [60] NM_001143996 exon18 c.C2286T p.S762S .\n [61] NM_018063 exon18 c.T2061C p.D687D .\n [62] NM_032289 exon3 c.C460T p.Q154X .\n [63] NM_205836 exon21 c.G3327A p.R1109R .\n [64] NM_002104 exon5 c.T640A p.S214T .\n COSMIC_Gene Drug_Target\n \n [1] . .\n [2] . .\n [3] . .\n [4] . .\n [5] . .\n ... ... ...\n [60] . .\n [61] . .\n [62] . .\n [63] . .\n [64] . .\n -------\n seqinfo: 24 sequences from NCBI36 genome; no seqlengths\n```\n\n\n:::\n\n```{.r .cell-code}\nmut_assay[[1]]$Hugo_Symbol\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n [1] \"WNK3\" \"IFRD1\" \"PPP1R3A\" \"FAM71F1\" \"NOM1\" \"TRIM73\" \n [7] \"C7orf51\" \"DIDO1\" \"DNMT1\" \"CALR\" \"SIN3B\" \"ZNF569\" \n[13] \"SIGLEC12\" \"ZNF160\" \"NLRP4\" \"ENPP2\" \"TRPA1\" \"MMP16\" \n[19] \"GPR89A\" \"SLC9A11\" \"PAX7\" \"PLA2G5\" \"SLC26A9\" \"PIK3R3\" \n[25] \"LPPR4\" \"BCL9L\" \"PRDM11\" \"PEX3\" \"DCDC2\" \"KRT20\" \n[31] \"TP53\" \"CDH8\" \"SF3B3\" \"SLC9A10\" \"SLC7A14\" \"NLGN1\" \n[37] \"MYRIP\" \"CYP8B1\" \"TGM4\" \"COL7A1\" \"P2RX7\" \"KRAS\" \n[43] \"TPH2\" \"ANO4\" \"UBR1\" \"LBXCOR1\" \"PDLIM5\" \"ODZ1\" \n[49] \"SMARCA1\" \"CNKSR2\" \"RBM10\" \"RBM3\" \"HUWE1\" \"CYLC1\" \n[55] \"ATP6V1E2\" \"ASTN2\" \"TAF1L\" \"SGCG\" \"GBF1\" \"OGDHL\" \n[61] \"HELLS\" \"PSD2\" \"FBXO38\" \"GZMK\" \n```\n\n\n:::\n\n```{.r .cell-code}\nmut_assay[[1]]$Mutation_Status |> \n table()\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n\nSomatic \n 64 \n```\n\n\n:::\n\n```{.r .cell-code}\nmut_assay[[1]]$Variant_Classification |> \n table()\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n\n Frame_Shift_Ins Missense_Mutation Nonsense_Mutation Silent \n 1 39 5 19 \n```\n\n\n:::\n:::\n\n\nNow, is kind of a trouble to inspect manually each patient. So, lets get all mutation information from Hugo symbol and Variant classification for all the patients.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nvar_class_df = mapply(function(sample_id, mutation_assay){\n \n d = mcols(mutation_assay)[,c(\"Hugo_Symbol\",\"Variant_Classification\")] |> \n as.data.frame()\n \n colnames(d) = c(\"symbol\",\"variant_class\")\n \n d$patientID = sample_id\n \n return(d)\n \n}, sample_id=mut_sample_ids, mutation_assay = mut_assay,SIMPLIFY = F, USE.NAMES = F)\n\n\nvar_class_df = do.call(rbind, var_class_df)\n\n\nhead(var_class_df)\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n symbol variant_class patientID\n1 WNK3 Frame_Shift_Ins TCGA-AF-2689\n2 IFRD1 Missense_Mutation TCGA-AF-2689\n3 PPP1R3A Missense_Mutation TCGA-AF-2689\n4 FAM71F1 Silent TCGA-AF-2689\n5 NOM1 Missense_Mutation TCGA-AF-2689\n6 TRIM73 Nonsense_Mutation TCGA-AF-2689\n```\n\n\n:::\n:::\n\n\nWe can visualize the most common mutated genes genes in rectum adenocarcinoma\n\n\n::: {#cell-fig-genesmut .cell}\n\n```{.r .cell-code}\n#| label: fig-genesmut\n\np <- var_class_df |> \n as_tibble() |> \n group_by(symbol,variant_class) |> \n summarise(n = n()) |> \n arrange(desc(n)) |> \n ungroup() |> \n slice_max(order_by = n, n = 20) |> \n ungroup() |> \n mutate(symbol = fct_reorder(symbol,n)) |> \n ggplot(aes(n, symbol,fill=variant_class)) +\n geom_col() +\n facet_wrap(~variant_class) +\n paletteer::scale_fill_paletteer_d(\"awtools::a_palette\") +\n labs(x = \"Number of samples with specific mutation\", y=\"Gene Symbol\", fill=\"Variant Class\",\n title=\"Top 20 Samples with Rectum Adenocarcinoma Mutation\")\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n`summarise()` has grouped output by 'symbol'. You can override using the\n`.groups` argument.\n```\n\n\n:::\n\n```{.r .cell-code}\n#| label: fig-genesmut\n\np\n```\n\n::: {.cell-output-display}\n![](TCGA_files/figure-html/fig-genesmut-1.png){#fig-genesmut width=2700}\n:::\n:::\n", + "markdown": "---\ntitle: \"Integrative Analysis with TCGA Data\"\nsubtitle: \"Analysis of Mutation Data from The Cancer Genome Atlas (TCGA)\"\nformat: html\neditor: visual\n---\n\n::: {.cell .hidden}\n\n```{.r .cell-code .hidden}\n#| include: false\nlibrary(knitr)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'knitr' was built under R version 4.3.2\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\n#| include: false\nknitr::opts_chunk$set(\n warning = FALSE,\n message = FALSE,\n dpi = 180,\n fig.width = 15,\n fig.height = 8,\n echo = TRUE\n)\n\nlibrary(tidyverse)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'ggplot2' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'tidyr' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'readr' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'purrr' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'dplyr' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'stringr' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'lubridate' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──\n✔ dplyr 1.1.4 ✔ readr 2.1.5\n✔ forcats 1.0.0 ✔ stringr 1.5.1\n✔ ggplot2 3.5.0 ✔ tibble 3.2.1\n✔ lubridate 1.9.3 ✔ tidyr 1.3.1\n✔ purrr 1.0.2 \n── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──\n✖ dplyr::filter() masks stats::filter()\n✖ dplyr::lag() masks stats::lag()\nℹ Use the conflicted package () to force all conflicts to become errors\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\n#| include: false\ntheme_set(theme_minimal())\n```\n:::\n\n\n## Introduction\n\nThe Cancer Genome Atlas (TCGA) is a massive cancer genomics project compiling high-throughput multi-omics data on dozens of cancer types for [public access](https://www.cancer.gov/ccg/research/genome-sequencing/tcga).\n\nWe are gonna use the `curatedTCGAData` package to manipulate locally to multiple high-throughput datasets from the project. The package provides access to TCGA data that has been curated and stored as a *MultiAssayExperiment* object on the Bioconductor [ExperimentHub](https://bioconductor.org/packages/release/bioc/html/ExperimentHub.html).\n\nFirst, let's load the packages needed.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(curatedTCGAData)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: MultiAssayExperiment\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'MultiAssayExperiment' was built under R version 4.3.1\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: SummarizedExperiment\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'SummarizedExperiment' was built under R version 4.3.1\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: MatrixGenerics\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'MatrixGenerics' was built under R version 4.3.1\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: matrixStats\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'matrixStats' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n\nAttaching package: 'matrixStats'\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following object is masked from 'package:dplyr':\n\n count\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n\nAttaching package: 'MatrixGenerics'\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following objects are masked from 'package:matrixStats':\n\n colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,\n colCounts, colCummaxs, colCummins, colCumprods, colCumsums,\n colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,\n colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,\n colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,\n colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,\n colWeightedMeans, colWeightedMedians, colWeightedSds,\n colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,\n rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,\n rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,\n rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,\n rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,\n rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,\n rowWeightedMads, rowWeightedMeans, rowWeightedMedians,\n rowWeightedSds, rowWeightedVars\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: GenomicRanges\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: stats4\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: BiocGenerics\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'BiocGenerics' was built under R version 4.3.1\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n\nAttaching package: 'BiocGenerics'\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following objects are masked from 'package:lubridate':\n\n intersect, setdiff, union\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following objects are masked from 'package:dplyr':\n\n combine, intersect, setdiff, union\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following objects are masked from 'package:stats':\n\n IQR, mad, sd, var, xtabs\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following objects are masked from 'package:base':\n\n anyDuplicated, aperm, append, as.data.frame, basename, cbind,\n colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,\n get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,\n match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,\n Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,\n table, tapply, union, unique, unsplit, which.max, which.min\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: S4Vectors\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n\nAttaching package: 'S4Vectors'\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following objects are masked from 'package:lubridate':\n\n second, second<-\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following objects are masked from 'package:dplyr':\n\n first, rename\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following object is masked from 'package:tidyr':\n\n expand\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following object is masked from 'package:utils':\n\n findMatches\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following objects are masked from 'package:base':\n\n expand.grid, I, unname\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: IRanges\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n\nAttaching package: 'IRanges'\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following object is masked from 'package:lubridate':\n\n %within%\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following objects are masked from 'package:dplyr':\n\n collapse, desc, slice\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following object is masked from 'package:purrr':\n\n reduce\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following object is masked from 'package:grDevices':\n\n windows\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: GenomeInfoDb\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'GenomeInfoDb' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: Biobase\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWelcome to Bioconductor\n\n Vignettes contain introductory material; view with\n 'browseVignettes()'. To cite Bioconductor, see\n 'citation(\"Biobase\")', and for packages 'citation(\"pkgname\")'.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n\nAttaching package: 'Biobase'\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following object is masked from 'package:MatrixGenerics':\n\n rowMedians\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following objects are masked from 'package:matrixStats':\n\n anyMissing, rowMedians\n```\n\n\n:::\n\n```{.r .cell-code}\nlibrary(TCGAutils)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'TCGAutils' was built under R version 4.3.2\n```\n\n\n:::\n\n```{.r .cell-code}\nlibrary(MultiAssayExperiment)\n```\n:::\n\n\n## Download the Data\n\nTo download the data we need to use `curatedTCGAData`function. The first argument is a four letter disease (cancer) code (A complete list of disease codes used by the TCGA project are available on the [NCI Genomic Data Commons website](https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/tcga-study-abbreviations)), the second argument is a vector of data types we want to download. We need to specify `dry.run = FALSE` to download the data.\n\nIn this specific case, we are gonna work with RNA-Seq data, mutation data and methylation data from Rectum Adenocarcinoma (READ). The clinical data is included by default.\n\n\n::: {.cell}\n\n```{.r .cell-code}\n#| message: false\n#| warning: false\n\nreadData = curatedTCGAData(\"READ\", \n c(\"RNASeq2GeneNorm\", \"Mutation\", \"Methylation_methyl450\"), \n dry.run = FALSE, version = \"2.1.1\")\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWorking on: READ_Mutation-20160128\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nsee ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nloading from cache\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nrequire(\"RaggedExperiment\")\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'RaggedExperiment' was built under R version 4.3.1\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWorking on: READ_RNASeq2GeneNorm-20160128\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nsee ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nloading from cache\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWorking on: READ_Methylation_methyl450-20160128\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nsee ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nloading from cache\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nrequire(\"rhdf5\")\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'rhdf5' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nsee ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nloading from cache\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: HDF5Array\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'HDF5Array' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: DelayedArray\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'DelayedArray' was built under R version 4.3.1\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: Matrix\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'Matrix' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n\nAttaching package: 'Matrix'\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following object is masked from 'package:S4Vectors':\n\n expand\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following objects are masked from 'package:tidyr':\n\n expand, pack, unpack\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: S4Arrays\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'S4Arrays' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: abind\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n\nAttaching package: 'S4Arrays'\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following object is masked from 'package:abind':\n\n abind\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following object is masked from 'package:base':\n\n rowsum\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required package: SparseArray\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: package 'SparseArray' was built under R version 4.3.2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n\nAttaching package: 'DelayedArray'\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following object is masked from 'package:purrr':\n\n simplify\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following objects are masked from 'package:base':\n\n apply, scale, sweep\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n\nAttaching package: 'HDF5Array'\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nThe following object is masked from 'package:rhdf5':\n\n h5ls\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWorking on: READ_colData-20160128\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nsee ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nloading from cache\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWorking on: READ_sampleMap-20160128\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nsee ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nloading from cache\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWorking on: READ_metadata-20160128\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nsee ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nloading from cache\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nharmonizing input:\n removing 1903 sampleMap rows not in names(experiments)\n removing 2 colData rownames not in sampleMap 'primary'\n```\n\n\n:::\n\n```{.r .cell-code}\n#| message: false\n#| warning: false\n\nreadData # this is a MultiAssayExperiment object\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\nA MultiAssayExperiment object of 3 listed\n experiments with user-defined names and respective classes.\n Containing an ExperimentList class object of length 3:\n [1] READ_Mutation-20160128: RaggedExperiment with 22075 rows and 69 columns\n [2] READ_RNASeq2GeneNorm-20160128: SummarizedExperiment with 18115 rows and 177 columns\n [3] READ_Methylation_methyl450-20160128: SummarizedExperiment with 485577 rows and 106 columns\nFunctionality:\n experiments() - obtain the ExperimentList instance\n colData() - the primary/phenotype DataFrame\n sampleMap() - the sample coordination DataFrame\n `$`, `[`, `[[` - extract colData columns, subset, or experiment\n *Format() - convert into a long or wide DataFrame\n assays() - convert ExperimentList to a SimpleList of matrices\n exportClass() - save data to flat files\n```\n\n\n:::\n:::\n\n\nWe can see which patients have data for each assay. The assay column gives the experiment type, the primary column gives the unique patient ID and the colname gives the sample ID used as a identifier within a given experiment.\n\n\n::: {.cell}\n\n```{.r .cell-code}\n#| label: samplemap\n\nsampleMap(readData)\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\nDataFrame with 352 rows and 3 columns\n assay primary colname\n \n1 READ_Methylation_methyl450-20160128 TCGA-AF-2687 TCGA-AF-2687-01A-02D..\n2 READ_Methylation_methyl450-20160128 TCGA-AF-2690 TCGA-AF-2690-01A-02D..\n3 READ_Methylation_methyl450-20160128 TCGA-AF-2693 TCGA-AF-2693-01A-02D..\n4 READ_Methylation_methyl450-20160128 TCGA-AF-3911 TCGA-AF-3911-01A-01D..\n5 READ_Methylation_methyl450-20160128 TCGA-AF-4110 TCGA-AF-4110-01A-02D..\n... ... ... ...\n348 READ_RNASeq2GeneNorm-20160128 TCGA-AG-A02G TCGA-AG-A02G-01\n349 READ_RNASeq2GeneNorm-20160128 TCGA-AG-A02N TCGA-AG-A02N-01\n350 READ_RNASeq2GeneNorm-20160128 TCGA-AG-A02X TCGA-AG-A02X-01\n351 READ_RNASeq2GeneNorm-20160128 TCGA-AG-A032 TCGA-AG-A032-01\n352 READ_RNASeq2GeneNorm-20160128 TCGA-AG-A036 TCGA-AG-A036-01\n```\n\n\n:::\n:::\n", "supporting": [ "TCGA_files" ], diff --git a/index.qmd b/index.qmd index 45d35cc..4091d8d 100644 --- a/index.qmd +++ b/index.qmd @@ -202,7 +202,7 @@ mean(rsc > length(fo)) Please check the script below the image to see how to explore and get insights from the TCGA (The Cancer Genome Atlas) database. -{{< embed notebooks/TCGA.qmd#fig-genesmut >}} +{{< embed notebooks/TCGA.qmd#samplemap echo=true >}} ## Conclusion diff --git a/notebooks/TCGA.qmd b/notebooks/TCGA.qmd index a0df673..07cb830 100644 --- a/notebooks/TCGA.qmd +++ b/notebooks/TCGA.qmd @@ -60,6 +60,8 @@ readData # this is a MultiAssayExperiment object We can see which patients have data for each assay. The assay column gives the experiment type, the primary column gives the unique patient ID and the colname gives the sample ID used as a identifier within a given experiment. ```{r} +#| label: samplemap + sampleMap(readData) ``` @@ -243,3 +245,193 @@ p ``` +## Linking mutations and tumor stage + +Now that we are familiarized with the mutation data, we can link mutations to the tumor stage from the patients with rectum adenocarcinoma. + +We can filter the clinical data with the patients that have mutation data. + +```{r} +index <- which( (var_class_df$patientID |> unique()) %in% clin$patientID) + +clin_and_mutation = clin[index,] + +head(clin_and_mutation) + +``` + +We can count the number of genes with mutations per patient and then make a `left_join` to the clinical data to plot the mutated genes per tumor stage. + +```{r} +#| label: fig-plot1 + + +df = var_class_df |> + group_by(patientID) |> + summarise(n = n()) + +p <- clin_and_mutation |> + left_join(df, by = join_by(patientID)) |> + ggplot(aes(t_stage,log(n), fill=t_stage))+ + geom_boxplot() + + geom_jitter(width = 0.05, colour="red2") + + paletteer::scale_fill_paletteer_d("awtools::a_palette")+ + labs(x = "Tumor stage", y="Number of mutated genes in log scale") + + +p +``` + +Also, we can focus on a specific gene, e.g. [TP53](https://www.ncbi.nlm.nih.gov/gene/7157). + +```{r} +#| label: fig-plot_tp53 + + +mut_on_tp53 <- var_class_df |> + dplyr::filter(symbol == "TP53") + + +p <-clin_and_mutation |> + dplyr::filter(patientID %in% mut_on_tp53$patientID) |> + dplyr::select(t_stage, patientID) |> + group_by(t_stage) |> + summarise(n = n()) |> + # mutate(t_stage = fct_reorder(t_stage,n)) |> + ggplot(aes(n, t_stage, fill=t_stage, label=n)) + + geom_col() + + geom_label(fill="white") + + paletteer::scale_fill_paletteer_d("awtools::a_palette")+ + labs(y = "Tumor Stage", x="N° of Mutations", fill="Tumor Stage", title="Number of Mutations on Gene TP53") + +p +``` + +## Linking expression and tumor stage + +```{r} +rnaseq = readData[[2]] + +rnaseq + +``` + +```{r} + +colnames(rnaseq) = colnames(rnaseq) |> + stringr::str_sub(1,12) + +clin <- clin |> as.data.frame() +rownames(clin) <- clin$patientID + +index <- which(colnames(rnaseq) %in% rownames(clin)) + +colData(rnaseq) = as(clin[colnames(rnaseq),],"DataFrame") + + +idx <- is.na(colData(rnaseq)$t_stage) + +rnaseq <- rnaseq[,!idx] + +``` + +```{r} + +library(limma) +mm = model.matrix(~t_stage, data=colData(rnaseq)) +f1 = lmFit(assay(rnaseq), mm) +ef1 = eBayes(f1) +topTable(ef1) |> + arrange(desc(AveExpr)) + +``` + +```{r} +par(mfrow=c(1,2)) +boxplot(split(assay(rnaseq)["PAM",], rnaseq$t_stage), main="PAM") # higher expression in lower t_stage +boxplot(split(assay(rnaseq)["PAIP2",], rnaseq$t_stage), main="PAIP2") +``` + +# Linking methylation and expression + +```{r} + +methyl <- readData[[3]] + +idx <- colnames(methyl) |> + str_split(pattern = "-") |> + map(4) |> + enframe() |> + unnest(value) + +idx <- which(idx$value == "01A") + +methyl = methyl[,idx] + +methyl +``` + +```{r} +colnames(methyl) <- colnames(methyl) |> + str_sub(start = 1,end = 12) +``` + +```{r} +colData(methyl) <- as(clin[colnames(methyl),],"DataFrame") + + +intersect(colnames(methyl), colnames(rnaseq)) |> length() + +``` + +```{r} + +methyl_subset = methyl[,which(colnames(methyl) %in% colnames(rnaseq))] +rnaseq_subset = rnaseq[,which(colnames(rnaseq) %in% colnames(methyl))] + +methyl_genes = rowData(methyl_subset) +methyl_genes + +``` + +```{r} + +#| label: fig-plot + +me_rna_cor = function(sym, mpick = 3){ + require(GGally) + # subset methylation data to first mpick methylation sites for given gene symbol + methyl_ind = which(methyl_genes$Gene_Symbol == sym) + if (length(methyl_ind) > mpick){ + methyl_ind = methyl_ind[1:mpick] + } + methyl_dat = assay(methyl_subset)[methyl_ind,] # subset to selected methylation sites + + # subset expression data to selected gene symbol + expr_ind = which(rownames(rnaseq_subset) == sym) + expr_dat = assay(rnaseq_subset)[expr_ind,] + + # combine methylation and expression data as data frame + combined_dat = as(t(methyl_dat), "DataFrame") + combined_dat$expr = expr_dat + + # plot pairs and calculate correlation coefficients between methylation and expression + ggpairs(combined_dat) |> print() + sapply(1:mpick, function(i){ + + cor_to <- as.numeric(combined_dat[,mpick+1]) + + df <- data.frame(v1= as.numeric(combined_dat[,i]), + v2 = cor_to) + + df <- df |> na.omit() + + cor(df[,1], df[,2]) + }) +} + +me_rna_cor("TAC1", mpick=3) + +``` + +## Conclusion