The file downloaded from R2 contains “#” symbols, which need to be removed before the data can be read into a proper data frame.
# Read the R2 export as downloaded, show the class of the parsed object, and
# cache it as an RDS file.
# NOTE(review): the same RDS path is written again once the "_modified" file
# has been read, so the copy saved here is overwritten.
r2_gse62564 <- read.table(
  file = "../data/ps_avgpres_gse62564geo498_seqcnb1_box1687888611-datagrabber-.txt"
)
class(r2_gse62564)
saveRDS(object = r2_gse62564, file = "../data/r2_gse62564.rds")
Once the “#” symbols have been removed, the file can be read into the appropriate data frame and saved.
# Read the manually edited ("_modified") R2 export, show the class of the
# parsed object, and cache it as an RDS file for faster loading later.
r2_gse62564 <- read.table(
  file = "../data/ps_avgpres_gse62564geo498_seqcnb1_box1687888611-datagrabber_modified.txt"
)
class(r2_gse62564)
saveRDS(object = r2_gse62564, file = "../data/r2_gse62564.rds")
# Load the cached data frame and report how long the load took.
start_time <- Sys.time()
r2_gse62564 <- readRDS(file = "../data/r2_gse62564.rds")
end_time <- Sys.time()
# Subtracting two POSIXct values dispatches to difftime(); call it directly.
difftime(end_time, start_time)
## Time difference of 4.952882 secs
# Show the first 20 rows, then the table dimensions (rows, columns).
head(r2_gse62564, 20L)
dim(r2_gse62564)
## [1] 24961 500
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.1.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Keep a copy of the first 17 rows (the ID row plus the metadata rows) before
# the table is reduced to gene-level values.
metadata_gse62564 <- r2_gse62564[seq_len(17), ]
## View(metadata_gse62564)
head(metadata_gse62564, 20L)
# Inspect the full table before reshaping it.
head(r2_gse62564, 20L)
# Drop column V2, which repeats the identifier column.
r2_gse62564 <- dplyr::select(r2_gse62564, -V2)
head(r2_gse62564, 20L)
# Discard rows 2 through 18 so that only the ID row (row 1) and the gene rows
# remain.
# NOTE(review): the row positions are fixed and assume this exact file layout.
r2_gse62564 <- r2_gse62564[-(2:18), ]
# Promote the values in row 1 to column names ...
names(r2_gse62564) <- r2_gse62564[1, ]
# ... and then drop that row, since it now only duplicates the column names.
r2_gse62564 <- r2_gse62564[-1, ]
head(r2_gse62564, 20L)
library(maditr)
## Warning: package 'maditr' was built under R version 4.1.2
##
## To select columns from data: columns(mtcars, mpg, vs:carb)
##
## Attaching package: 'maditr'
## The following objects are masked from 'package:dplyr':
##
## between, coalesce, first, last
# Collapse duplicated gene symbols: keep only the first row encountered for
# each value of `H:hugo`; .keep_all = TRUE retains every other column.
# Both distinct() and %>% are available from dplyr, which is attached above.
r2_gse62564_distinct <- r2_gse62564 %>%
  dplyr::distinct(`H:hugo`, .keep_all = TRUE)
# Inspect the de-duplicated table (not the original one).
head(r2_gse62564_distinct, n = 20)
# Use the gene symbols as row names, then drop the now-redundant column.
rownames(r2_gse62564_distinct) <- r2_gse62564_distinct$`H:hugo`
r2_gse62564_distinct <- dplyr::select(r2_gse62564_distinct, -`H:hugo`)
head(r2_gse62564_distinct, n = 20)
# Upper-case the column names so they can be matched against the upper-cased
# IDs in the metadata later on.
names(r2_gse62564_distinct) <- toupper(names(r2_gse62564_distinct))
# Convert every character column to numeric. across() replaces the superseded
# mutate_if(); values that cannot be parsed become NA with a warning.
r2_gse62564_distinct <- r2_gse62564_distinct %>%
  dplyr::mutate(dplyr::across(where(is.character), as.numeric))
# Matrix form (rows = genes, columns = samples) is what gsva() consumes below;
# the data-frame copy is kept alongside it.
r2_gse62564_matrix <- as.matrix(r2_gse62564_distinct)
r2_gse62564_df <- as.data.frame(r2_gse62564_matrix)
In this part, we use the GSVA library to compute phenotype scores and build a data frame that includes them.
# install.packages("BiocManager")
# library(BiocManager)
# BiocManager::install(c("GSVA", "GSEABase"))
library(GSVA)
library(GSEABase)
# -----------------------------------------------------
# 2) Load gene set and Construct GSVA data-frame
# -----------------------------------------------------
# Read the gene-set collection from the GMT-formatted text file.
cfDNA_PCA_gene_list <- getGmt(con = "../data/cfDNA_genes_PCA.txt")
## Warning in readLines(con, ...): incomplete final line found on
## '../data/cfDNA_genes_PCA.txt'
## Warning in getGmt("../data/cfDNA_genes_PCA.txt"): 321 record(s) contain
## duplicate ids: ADRN_Gene_List_373, ADRN_Gronigen, ...,
## WP_GLYCOLYSIS_AND_GLUCONEOGENESIS_7, WP_GLYCOLYSIS_IN_SENESCENCE_Genes_4
# -----------------------------------
# 3) Construct GSVA data-frame
# -----------------------------------
# Re-run this step whenever the gene-set collection changes.
# Scores each gene set against the expression matrix. min.sz = 1 together with
# max.sz = Inf places no upper bound on gene-set size and accepts sets with a
# single gene.
r2_gse62564_GSVA <- gsva(
  expr = r2_gse62564_matrix,
  gset.idx.list = cfDNA_PCA_gene_list,
  min.sz = 1,
  max.sz = Inf,
  verbose = TRUE
)
## Warning: useNames = NA is deprecated. Instead, specify either useNames = TRUE
## or useNames = TRUE.
## Warning in .filterFeatures(expr, method): 164 genes with constant expression
## values throuhgout the samples.
## Warning in .filterFeatures(expr, method): Since argument method!="ssgsea",
## genes with constant expression values are discarded.
## Warning in .gsva(expr, mapped.gset.idx.list, method, kcdf, rnaseq, abs.ranking,
## : Some gene sets have size one. Consider setting 'min.sz > 1'.
## Estimating GSVA scores for 312 gene sets.
## Estimating ECDFs with Gaussian kernels
##
|
| | 0%
|
| | 1%
|
|= | 1%
|
|= | 2%
|
|== | 2%
|
|== | 3%
|
|== | 4%
|
|=== | 4%
|
|=== | 5%
|
|==== | 5%
|
|==== | 6%
|
|===== | 7%
|
|===== | 8%
|
|====== | 8%
|
|====== | 9%
|
|======= | 9%
|
|======= | 10%
|
|======= | 11%
|
|======== | 11%
|
|======== | 12%
|
|========= | 12%
|
|========= | 13%
|
|========== | 14%
|
|========== | 15%
|
|=========== | 15%
|
|=========== | 16%
|
|============ | 17%
|
|============ | 18%
|
|============= | 18%
|
|============= | 19%
|
|============== | 20%
|
|============== | 21%
|
|=============== | 21%
|
|=============== | 22%
|
|================ | 22%
|
|================ | 23%
|
|================= | 24%
|
|================= | 25%
|
|================== | 25%
|
|================== | 26%
|
|=================== | 27%
|
|=================== | 28%
|
|==================== | 28%
|
|==================== | 29%
|
|===================== | 29%
|
|===================== | 30%
|
|====================== | 31%
|
|====================== | 32%
|
|======================= | 32%
|
|======================= | 33%
|
|======================== | 34%
|
|======================== | 35%
|
|========================= | 35%
|
|========================= | 36%
|
|========================== | 37%
|
|========================== | 38%
|
|=========================== | 38%
|
|=========================== | 39%
|
|============================ | 39%
|
|============================ | 40%
|
|============================ | 41%
|
|============================= | 41%
|
|============================= | 42%
|
|============================== | 42%
|
|============================== | 43%
|
|=============================== | 44%
|
|=============================== | 45%
|
|================================ | 45%
|
|================================ | 46%
|
|================================= | 46%
|
|================================= | 47%
|
|================================= | 48%
|
|================================== | 48%
|
|================================== | 49%
|
|=================================== | 49%
|
|=================================== | 50%
|
|=================================== | 51%
|
|==================================== | 51%
|
|==================================== | 52%
|
|===================================== | 52%
|
|===================================== | 53%
|
|===================================== | 54%
|
|====================================== | 54%
|
|====================================== | 55%
|
|======================================= | 55%
|
|======================================= | 56%
|
|======================================== | 57%
|
|======================================== | 58%
|
|========================================= | 58%
|
|========================================= | 59%
|
|========================================== | 59%
|
|========================================== | 60%
|
|========================================== | 61%
|
|=========================================== | 61%
|
|=========================================== | 62%
|
|============================================ | 62%
|
|============================================ | 63%
|
|============================================= | 64%
|
|============================================= | 65%
|
|============================================== | 65%
|
|============================================== | 66%
|
|=============================================== | 67%
|
|=============================================== | 68%
|
|================================================ | 68%
|
|================================================ | 69%
|
|================================================= | 70%
|
|================================================= | 71%
|
|================================================== | 71%
|
|================================================== | 72%
|
|=================================================== | 72%
|
|=================================================== | 73%
|
|==================================================== | 74%
|
|==================================================== | 75%
|
|===================================================== | 75%
|
|===================================================== | 76%
|
|====================================================== | 77%
|
|====================================================== | 78%
|
|======================================================= | 78%
|
|======================================================= | 79%
|
|======================================================== | 79%
|
|======================================================== | 80%
|
|========================================================= | 81%
|
|========================================================= | 82%
|
|========================================================== | 82%
|
|========================================================== | 83%
|
|=========================================================== | 84%
|
|=========================================================== | 85%
|
|============================================================ | 85%
|
|============================================================ | 86%
|
|============================================================= | 87%
|
|============================================================= | 88%
|
|============================================================== | 88%
|
|============================================================== | 89%
|
|=============================================================== | 89%
|
|=============================================================== | 90%
|
|=============================================================== | 91%
|
|================================================================ | 91%
|
|================================================================ | 92%
|
|================================================================= | 92%
|
|================================================================= | 93%
|
|================================================================== | 94%
|
|================================================================== | 95%
|
|=================================================================== | 95%
|
|=================================================================== | 96%
|
|==================================================================== | 96%
|
|==================================================================== | 97%
|
|==================================================================== | 98%
|
|===================================================================== | 98%
|
|===================================================================== | 99%
|
|======================================================================| 99%
|
|======================================================================| 100%
########################################
###### 4) Merge Gene Expression and GSVA
########################################
# Stack the GSVA scores on top of the expression values; both share the same
# sample columns.
r2_gse62564_GSVA_genes <- rbind(r2_gse62564_GSVA, r2_gse62564_matrix)
gse62564_metadata <- metadata_gse62564
# Append a copy of the ID row (row 1) as a new last row, so the IDs are still
# present as data after row 1 is promoted to column names; this allows the
# matching to be confirmed later.
gse62564_metadata[nrow(gse62564_metadata) + 1, ] <- gse62564_metadata[1, ]
# Remove first column of metadata DF
gse62564_metadata <- gse62564_metadata[, -1]
# Upper-case the IDs in row 1 for every column except the first. The column
# range is derived from the table width instead of a hard-coded 2:499.
gse62564_metadata[1, 2:ncol(gse62564_metadata)] <-
  toupper(gse62564_metadata[1, 2:ncol(gse62564_metadata)])
# Row 1 is turned into the column names in the next step (row_to_names()).
library(janitor)
## Warning: package 'janitor' was built under R version 4.1.2
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
# Promote the first row to column names; with its default settings
# row_to_names() also removes that row from the data.
gse62564_metadata <- janitor::row_to_names(gse62564_metadata, row_number = 1)
# Use the "probeset" column as the row names of the metadata
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.2
## Warning: package 'ggplot2' was built under R version 4.1.2
## Warning: package 'tibble' was built under R version 4.1.2
## Warning: package 'tidyr' was built under R version 4.1.2
## Warning: package 'readr' was built under R version 4.1.2
## Warning: package 'purrr' was built under R version 4.1.2
## Warning: package 'stringr' was built under R version 4.1.2
## Warning: package 'forcats' was built under R version 4.1.2
## Warning: package 'lubridate' was built under R version 4.1.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ ggplot2 3.4.2 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::%within%() masks IRanges::%within%()
## ✖ maditr::between() masks dplyr::between()
## ✖ stringr::boundary() masks graph::boundary()
## ✖ maditr::coalesce() masks dplyr::coalesce()
## ✖ IRanges::collapse() masks dplyr::collapse()
## ✖ readr::cols() masks maditr::cols()
## ✖ Biobase::combine() masks BiocGenerics::combine(), dplyr::combine()
## ✖ IRanges::desc() masks dplyr::desc()
## ✖ tidyr::expand() masks S4Vectors::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ S4Vectors::first() masks maditr::first(), dplyr::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ maditr::last() masks dplyr::last()
## ✖ ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## ✖ purrr::reduce() masks IRanges::reduce()
## ✖ S4Vectors::rename() masks dplyr::rename()
## ✖ lubridate::second() masks S4Vectors::second()
## ✖ lubridate::second<-() masks S4Vectors::second<-()
## ✖ AnnotationDbi::select() masks dplyr::select()
## ✖ IRanges::slice() masks dplyr::slice()
## ✖ purrr::transpose() masks maditr::transpose()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Discard the existing row names and use the "probeset" column as row names.
gse62564_metadata <- column_to_rownames(
  remove_rownames(gse62564_metadata),
  var = "probeset"
)
# Stack the metadata rows above the GSVA scores and expression values, then
# transpose so each original column (one per ID) becomes a row.
# NOTE(review): binding character metadata with numeric values and then
# transposing most likely leaves every column as character; confirm that
# downstream code converts the columns it needs.
r2_gse62564_GSVA_Metadata <- rbind(gse62564_metadata, r2_gse62564_GSVA_genes)
r2_gse62564_GSVA_Metadata <- as.data.frame(t(r2_gse62564_GSVA_Metadata))
saveRDS(
  object = r2_gse62564_GSVA_Metadata,
  file = "../results/r2_gse62564_GSVA_Metadata.rds"
)
# Record the R version, platform, and package versions used for this run.
sessionInfo()
## R version 4.1.1 (2021-08-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] lubridate_1.9.2 forcats_1.0.0 stringr_1.5.0
## [4] purrr_1.0.1 readr_2.1.4 tidyr_1.3.0
## [7] tibble_3.2.1 ggplot2_3.4.2 tidyverse_2.0.0
## [10] janitor_2.2.0 GSEABase_1.54.0 graph_1.70.0
## [13] annotate_1.70.0 XML_3.99-0.14 AnnotationDbi_1.54.1
## [16] IRanges_2.26.0 S4Vectors_0.30.2 Biobase_2.52.0
## [19] BiocGenerics_0.38.0 GSVA_1.44.2 maditr_0.8.3
## [22] dplyr_1.1.2
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-7 matrixStats_1.0.0
## [3] bit64_4.0.5 httr_1.4.6
## [5] GenomeInfoDb_1.30.0 tools_4.1.1
## [7] bslib_0.5.0 utf8_1.2.3
## [9] R6_2.5.1 irlba_2.3.5.1
## [11] HDF5Array_1.20.0 DBI_1.1.3
## [13] colorspace_2.1-0 rhdf5filters_1.4.0
## [15] withr_2.5.0 tidyselect_1.2.0
## [17] bit_4.0.5 compiler_4.1.1
## [19] cli_3.6.1 DelayedArray_0.18.0
## [21] sass_0.4.6 scales_1.2.1
## [23] digest_0.6.32 rmarkdown_2.22
## [25] XVector_0.32.0 pkgconfig_2.0.3
## [27] htmltools_0.5.5 sparseMatrixStats_1.4.2
## [29] MatrixGenerics_1.4.3 fastmap_1.1.1
## [31] rlang_1.1.1 rstudioapi_0.14
## [33] RSQLite_2.3.1 DelayedMatrixStats_1.14.3
## [35] jquerylib_0.1.4 generics_0.1.3
## [37] jsonlite_1.8.7 BiocParallel_1.26.2
## [39] RCurl_1.98-1.12 magrittr_2.0.3
## [41] BiocSingular_1.8.1 GenomeInfoDbData_1.2.6
## [43] Matrix_1.5-1 Rcpp_1.0.10
## [45] munsell_0.5.0 Rhdf5lib_1.14.2
## [47] fansi_1.0.4 lifecycle_1.0.3
## [49] stringi_1.7.12 yaml_2.3.7
## [51] snakecase_0.11.0 SummarizedExperiment_1.22.0
## [53] zlibbioc_1.38.0 rhdf5_2.36.0
## [55] grid_4.1.1 blob_1.2.4
## [57] crayon_1.5.2 lattice_0.21-8
## [59] Biostrings_2.60.2 beachmat_2.8.1
## [61] hms_1.1.3 KEGGREST_1.32.0
## [63] knitr_1.43 pillar_1.9.0
## [65] GenomicRanges_1.44.0 ScaledMatrix_1.0.0
## [67] glue_1.6.2 evaluate_0.21
## [69] data.table_1.14.8 png_0.1-8
## [71] vctrs_0.6.3 tzdb_0.4.0
## [73] gtable_0.3.3 cachem_1.0.8
## [75] xfun_0.39 rsvd_1.0.5
## [77] xtable_1.8-4 SingleCellExperiment_1.14.1
## [79] memoise_2.0.1 timechange_0.2.0