1) References

  • The R2 database link (R2 2023): http://r2.amc.nl


2) Load data-frame

2.1) Data-frame downloaded from the R2 website, in the data folder

The file downloaded from R2 contains “#” symbols, which must be removed before the table can be read correctly.

# Read the R2 export exactly as downloaded and cache the parsed table as RDS.
# NOTE(review): read.table() treats "#" as a comment character by default, so
# this raw file is presumably not parsed as intended -- confirm.
r2_gse62564 <- read.table(
  file = "../data/ps_avgpres_gse62564geo498_seqcnb1_box1687888611-datagrabber-.txt"
)
saveRDS(object = r2_gse62564, file = "../data/r2_gse62564.rds")

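After the “#” symbols are removed (file saved with the “_modified” suffix), reading the file creates the appropriate data frame, which is then saved as an RDS file.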

# Read the hand-edited export (suffix "_modified") and cache it as RDS,
# writing to the same RDS path used for the raw export.
r2_gse62564 <- read.table(
  file = "../data/ps_avgpres_gse62564geo498_seqcnb1_box1687888611-datagrabber_modified.txt"
)
saveRDS(object = r2_gse62564, file = "../data/r2_gse62564.rds")

2.2) Load rds object and calculate how long it takes R to load the object

# Load the cached table and measure the wall-clock time readRDS() takes.
start_time <- Sys.time()
r2_gse62564 <- readRDS(file = "../data/r2_gse62564.rds")
end_time <- Sys.time()
# Auto-print the elapsed time as a difftime object.
difftime(end_time, start_time)
## Time difference of 4.952882 secs

2.3) Visualize the dataframe

# Preview the first 20 rows of the loaded table.
# NOTE(review): the output recorded below this call looks like dim() output
# rather than head() output -- confirm which call produced it.
head(x = r2_gse62564, n = 20L)
## [1] 24961   500

3) Process dataframe

3.1) Metadata

## Keep the first 17 rows of the raw table as the metadata table
metadata_gse62564 <- r2_gse62564[seq_len(17), ]
## View(metadata_gse62564)
head(metadata_gse62564, n = 20)

## Preview the full table before trimming it
head(r2_gse62564, n = 20)

## Drop V2, which repeats information already present
r2_gse62564 <- dplyr::select(.data = r2_gse62564, -V2)
head(r2_gse62564, n = 20)

## Promote row 1 to column names, then drop rows 1-18 in a single step so
## that only the gene-count rows remain.
## NOTE(review): the metadata above is taken from rows 1:17, while rows up to
## 18 are dropped here -- confirm that row 18 is not a gene row.
colnames(r2_gse62564) <- as.character(r2_gse62564[1, ])
r2_gse62564 <- r2_gse62564[-seq_len(18), ]
head(r2_gse62564, n = 20)

3.2) More processing

## Remove duplicated gene symbols: keep the first row for each `H:hugo` value.
## NOTE(review): `%>%` is defined by magrittr (re-exported by dplyr) and
## distinct() is a dplyr verb; the earlier note said maditr was required --
## confirm which packages are attached.
r2_gse62564_distinct <- r2_gse62564 %>%
  distinct(`H:hugo`, .keep_all = TRUE)
## Preview the de-duplicated table (the object that was just created)
head(r2_gse62564_distinct, n = 20)

## Give names to rows using the gene symbols, then drop the symbol column
rownames(r2_gse62564_distinct) <- r2_gse62564_distinct$`H:hugo`
r2_gse62564_distinct <- dplyr::select(r2_gse62564_distinct, -`H:hugo`)
head(r2_gse62564_distinct, n = 20)

## Upper-case the column names
names(r2_gse62564_distinct) <- toupper(names(r2_gse62564_distinct))

## Convert every character column to numeric; across() + where() replaces the
## superseded mutate_if()
r2_gse62564_distinct <- r2_gse62564_distinct %>%
  mutate(across(where(is.character), as.numeric))

## Matrix and data-frame copies of the numeric table
r2_gse62564_matrix <- as.matrix(r2_gse62564_distinct)
r2_gse62564_df     <- as.data.frame(r2_gse62564_matrix)

4) Calculate GSVA using Gene Sets

In this part, we construct the dataframe to include phenotype scoring with the GSVA library.

4.1) Gene and GSVA data-frame

4.1.2) Prepare GSVA DF

# One-time setup, run manually if the packages are missing:
# install.packages("BiocManager")
# library(BiocManager)
# BiocManager::install()
###### 2) Load gene set and Construct GSVA data-frame

# Read the gene sets from file.
# NOTE(review): getGmt() is presumably GSEABase::getGmt -- confirm the package
# is attached before this point.
cfDNA_PCA_gene_list    <- getGmt("../data/cfDNA_genes_PCA.txt")
###### 3) Construct GSVA data-frame

# Needs to re-run this step if there are new gene sets
# NOTE(review): this call was incomplete (no gene-set argument, no closing
# parenthesis). It is completed here with the gene sets loaded above; any
# further arguments the original call may have had are unknown -- confirm.
r2_gse62564_GSVA <- gsva(r2_gse62564_matrix,
                         gset.idx.list = cfDNA_PCA_gene_list,
                         min.sz = 1, max.sz = Inf)
## Estimating GSVA scores for 312 gene sets.
## Estimating ECDFs with Gaussian kernels
###### 4) Merge Gene Expression and GSVA

# Stack the GSVA score rows on top of the gene rows
r2_gse62564_GSVA_genes <- rbind(r2_gse62564_GSVA, r2_gse62564_matrix)

4.1.3) Include Metadata Based on R2 information

gse62564_metadata <- metadata_gse62564
# Repeat row with IDs (row 1) as a new last row to confirm matching later
gse62564_metadata[nrow(gse62564_metadata) + 1, ] <- gse62564_metadata[1, ]
# Remove first column of metadata DF
gse62564_metadata <- gse62564_metadata[, -1]
# Make all IDs in row 1 uppercase; column 1 is skipped because it does not
# hold an ID
id_cols <- 2:ncol(gse62564_metadata)
gse62564_metadata[1, id_cols] <- toupper(
  unlist(gse62564_metadata[1, id_cols], use.names = FALSE)
)
# Make row 1 the column names, then drop it. Without this step the columns
# are still named V2, V3, ... and the "probeset" column required below does
# not exist.
# NOTE(review): assumes row 1 of the first remaining column holds the literal
# "probeset" -- confirm against the R2 export.
colnames(gse62564_metadata) <- as.character(
  unlist(gse62564_metadata[1, ], use.names = FALSE)
)
gse62564_metadata <- gse62564_metadata[-1, ]
# Make the "probeset" column the row names
gse62564_metadata <- gse62564_metadata %>%
  remove_rownames() %>%
  column_to_rownames(var = "probeset")
# Stack metadata rows on top of the GSVA + gene rows, then transpose
r2_gse62564_GSVA_Metadata <- rbind(gse62564_metadata, r2_gse62564_GSVA_genes)
r2_gse62564_GSVA_Metadata <- t(r2_gse62564_GSVA_Metadata)
r2_gse62564_GSVA_Metadata <- as.data.frame(r2_gse62564_GSVA_Metadata)
saveRDS(r2_gse62564_GSVA_Metadata, file = "../results/r2_gse62564_GSVA_Metadata.rds")

7) References

R2. 2023. R2: Genomics Analysis and Visualization Platform. http://r2.amc.nl.

8) Session Info

