diff --git a/02_normalize_harmonize_proteomics.html b/02_normalize_harmonize_proteomics.html deleted file mode 100644 index 803d592..0000000 --- a/02_normalize_harmonize_proteomics.html +++ /dev/null @@ -1,841 +0,0 @@ - - - - -
- - - - - - - - - - -We now have phospho proteomics from two cohorts. Here I’m trying to -collect data from both and normalize but am clearly missing something. I -do the following:
-Each dataset is done individually then combined at the end. There is -a clear batch effect.
-We start with the cohort 1 phospho data here.
-##cohort 1 phospho
-##first we read in file, and get site info
-phospho1<- read.table(syn$get('syn69963552')$path,sep='\t',fill=NA,header=T,quote='"') |>
- subset(!is.na(`Gene.Names`)) |>
- subset(Gene.Names!='') |>
- mutate(lsite=tolower(Residue)) |>
- tidyr::unite(c(Residue,Site,lsite),col='site',sep='') |>
- tidyr::unite(c(`Gene.Names`,site),col='site',sep='-') |>
- as.data.frame()
-
-phospho1[which(phospho1==0,arr.ind=TRUE)]<-NA
-
-pfnames1 <- data.frame(fname=colnames(phospho1)[5:ncol(phospho1)])|>
- mutate(aliquot=sapply(fname,function(x) unlist(strsplit(x,split='_'))[8]))|>
- mutate(aliquot=as.double(aliquot))|>
- mutate(cohort=1)
-
-##logtransform##median transform
-pzeros<-which(apply(phospho1[,5:ncol(phospho1)],1,function(x)
- length(which(is.na(x)))/length(x) < 0.5))
-
-pmat1<-apply(0.01+log2(phospho1[pzeros,5:ncol(phospho1)]),2,
- function(x) {0.6745 *(x-median(x,na.rm=T))/mad(x,na.rm=T)}) |>
- as.data.frame() |>
- mutate(site=phospho1$site[pzeros])
-
-##move to long form, upload
-plong1<-pmat1|>
- tidyr::pivot_longer(1:(ncol(pmat1)-1),names_to='fname',values_to='abundance')|>
- left_join(pfnames1) |>
- group_by(site,fname,aliquot,cohort) |>
- summarize(meanAbundance=mean(abundance,na.rm=T))|>
- subset(!is.na(meanAbundance))|>
- left_join(meta)
-## Joining with `by = join_by(fname)`
-## `summarise()` has grouped output by 'site', 'fname', 'aliquot'. You can
-## override using the `.groups` argument.
-## Joining with `by = join_by(aliquot, cohort)`
-readr::write_csv(plong1,file='log2normMedCenteredPhospho.csv')
-syn$store(File('log2normMedCenteredPhospho.csv',parentId='syn70078365'))
-## File(id='syn65598472', synapseStore=True, modifiedOn='2025-10-07T16:10:10.287Z', dataFileHandleId='163828412', versionNumber=51, name='log2normMedCenteredPhospho.csv', createdBy='1418096', parentId='syn70078365', path='log2normMedCenteredPhospho.csv', _file_handle={'id': '163828412', 'etag': 'c532fccc-538b-4345-85ee-78a55c4db07b', 'createdBy': '1418096', 'createdOn': '2025-10-02T01:31:03.000Z', 'modifiedOn': '2025-10-02T01:31:03.000Z', 'concreteType': 'org.sagebionetworks.repo.model.file.S3FileHandle', 'contentType': 'text/csv', 'contentMd5': '479eaaac2d8f26d5ef94fbec8a7d4c6d', 'fileName': 'log2normMedCenteredPhospho.csv', 'storageLocationId': 1, 'contentSize': 11566930, 'status': 'AVAILABLE', 'bucketName': 'proddata.sagebase.org', 'key': '1418096/aff2a055-c32f-4a5c-bc5b-421908f76e6b/log2normMedCenteredPhospho.csv', 'previewId': '163828414', 'isPreview': False, 'externalURL': None}, concreteType='org.sagebionetworks.repo.model.FileEntity', isLatestVersion=True, cacheDir='', etag='0fc57caf-2cbf-4e7e-88a1-d4fce206142e', files=['log2normMedCenteredPhospho.csv'], modifiedBy='1418096', versionLabel='51', createdOn='2025-03-20T19:21:02.864Z')
-The file is uploaded to synapse.
-Now on October 7 we can process the second batch of phospho.
-##cohort 2 phospho
-##1 read in data
-phospho2 <- read.table(syn$get('syn69947351')$path,sep='\t',fill=NA,header=T,quote='"') |>
- subset(!is.na(`Gene.Names`)) |>
- subset(Gene.Names!='') |>
- mutate(lsite=tolower(Residue)) |>
- tidyr::unite(c(Residue,Site,lsite),col='site',sep='') |>
- tidyr::unite(c(`Gene.Names`,site),col='site',sep='-')
-
-
-phospho2[which(phospho2==0,arr.ind=TRUE)] <- NA
-
-pfnames2 <- data.frame(fname=colnames(phospho2)[5:ncol(phospho2)]) |>
- mutate(aliquot=sapply(fname,function(x) unlist(strsplit(x,split='_'))[9])) |>
- mutate(aliquot=as.double(aliquot))|>
- mutate(cohort=2)
-
-##remove missingness
-tm <- which(apply(phospho2[,5:ncol(phospho2)],1,function(x) length(which(is.na(x)))/length(x) < 0.5))
-
-##log2 adjusted z score
-pmat2<-apply(log2(0.01+phospho2[tm,5:ncol(phospho2)]),2,
- function(x) {0.6745 *(x-median(x,na.rm=T))/mad(x,na.rm=T)}) |>
- as.data.frame() |>
- mutate(site=phospho2$site[tm])
-
-
-
-plong2<-pmat2|>
- tidyr::pivot_longer(1:(ncol(pmat2)-1),names_to='fname',values_to='abundance') |>
- left_join(pfnames2)|>
- group_by(site,fname,aliquot,cohort) |>
- summarize(meanAbundance=mean(abundance,na.rm=T)) |>
- subset(!is.na(meanAbundance))|>
- left_join(meta)
-## Joining with `by = join_by(fname)`
-## `summarise()` has grouped output by 'site', 'fname', 'aliquot'. You can
-## override using the `.groups` argument.
-## Joining with `by = join_by(aliquot, cohort)`
-##save to file
-readr::write_csv(plong2,file='log2normMedCenteredPhospho_cohort2.csv')
-syn$store(File('log2normMedCenteredPhospho_cohort2.csv',parentId='syn70078365'))
-## File(synapseStore=True, id='syn70078413', path='log2normMedCenteredPhospho_cohort2.csv', createdOn='2025-10-07T15:56:04.262Z', createdBy='1418096', files=['log2normMedCenteredPhospho_cohort2.csv'], _file_handle={'id': '163828418', 'etag': '1d896b0a-e1eb-4083-a6e0-b3f266dc59ce', 'createdBy': '1418096', 'createdOn': '2025-10-02T01:31:22.000Z', 'modifiedOn': '2025-10-02T01:31:22.000Z', 'concreteType': 'org.sagebionetworks.repo.model.file.S3FileHandle', 'contentType': 'text/csv', 'contentMd5': '84b9a91da42f0630ebbd37e0ad9b22a7', 'fileName': 'log2normMedCenteredPhospho_cohort2.csv', 'storageLocationId': 1, 'contentSize': 6535765, 'status': 'AVAILABLE', 'bucketName': 'proddata.sagebase.org', 'key': '1418096/b3ab724b-02f7-4b08-8ea0-0191c2835ee6/log2normMedCenteredPhospho_cohort2.csv', 'previewId': '163828419', 'isPreview': False, 'externalURL': None}, name='log2normMedCenteredPhospho_cohort2.csv', modifiedOn='2025-10-07T16:10:11.331Z', dataFileHandleId='163828418', etag='755cc7b3-9fae-473e-b684-9e81d4200f95', concreteType='org.sagebionetworks.repo.model.FileEntity', versionNumber=3, isLatestVersion=True, cacheDir='', versionLabel='3', parentId='syn70078365', modifiedBy='1418096')
-Now that we have two cohorts we can try to combine without batch -correction.
-Combining the phoshpo data here.
-##now we move back to long form
-plong <- rbind(plong1,plong2)
- #pmat |>
-# as.data.frame()|>
-# tibble::rownames_to_column('site')|>
-# pivot_longer(-site,names_to='fname',values_to='abundance')|>
-# left_join(rbind(pfnames1,pfnames2))|>
-# group_by(site,fname,aliquot,cohort) |>
-# summarize(meanAbundance=mean(abundance,na.rm=T)) |>
-# left_join(meta)
-
-
-compsites <- plong|>
-# subset(meanAbundance>(-5))|>
- group_by(site)|>
- summarize(spec = n_distinct(Specimen))|>
- subset(spec==31)
-
-#plong$meanAbundance[which(!is.finite(plong$meanAbundance))]<-0
-
-ppcs<-plong|>ungroup()|>
- dplyr::select(Specimen,meanAbundance,site)|>
- unique()|>
- subset(site%in%compsites$site)|>
- #subset(!is.na(site))|>
- #subset(!is.na(meanAbundance))|>
- tidyr::pivot_wider(names_from='Specimen',values_from='meanAbundance',
- values_fn=mean,values_fill=0)|>
- tibble::column_to_rownames('site')|>
- t()|>
- prcomp()
-
-pplot<-ppcs$x|>
- as.data.frame()|>
- dplyr::select(PC1,PC2,PC3)|>
- tibble::rownames_to_column('Specimen')|>
- left_join(meta)|>
- dplyr::select(PC2,PC1,Specimen,Patient,cohort)|>
- mutate(cohort=as.factor(cohort))|>
- distinct()|>
- ggplot(aes(x=PC1,y=PC2,label=Specimen,col=Patient,shape=cohort))+
- geom_point()+
- #ggrepel::geom_label_repel()+
- ggtitle("Phospho samples")+
- ggplot2::scale_color_manual(values=pcols)
-## Joining with `by = join_by(Specimen)`
-ph<- plong |>ungroup()|>
- subset(site%in%compsites$site)|>
- ggplot(aes(x=meanAbundance,fill=as.factor(cohort)))+geom_histogram()
-
-cowplot::plot_grid(ph,pplot)
-## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
-ggsave('cNFPhosphoQC.png',width=10)
-## Saving 10 x 5 in image
-pplot
-ggsave('phosphoPCA.pdf')
-## Saving 7 x 5 in image
-Clearly there is a strong batch effect.
-Now we can move onto the global data
-Global proteomics in cohort 1 here.
-####now process global
-#global1<-readr::read_tsv(syn$get('syn64906445')$path)
-global1 <- read.table(syn$get('syn69947355')$path,sep='\t',header=T,quote='"') |>
- tidyr::separate_rows(Genes,sep=';')
-##logtransform, medina transform
-
-#global1[which(global1==0,arr.ind=TRUE)]<-NA
-
-gmat1<-apply(log2(global1[,5:ncol(global1)]),2,function(x) 0.6745 *(x-median(x,na.rm=T))/mad(x,na.rm=T))
-
-gmat1<-gmat1|>
- as.data.frame()|>
- mutate(Genes=global1$Genes)
-
-##extract aliquot info from file name
-gfnames1 <- data.frame(fname=colnames(global1)[5:ncol(global1)]) |>
- mutate(aliquot=sapply(fname,function(x) unlist(strsplit(x,split='_'))[6])) |>
- mutate(aliquot=as.double(aliquot))|>
- mutate(cohort=1)
-
-glong1<-gmat1|>
- tidyr::pivot_longer(1:(ncol(gmat1)-1),names_to='fname',values_to='abundance')|>
- left_join(gfnames1)|>
- group_by(Genes,fname,aliquot,cohort)|>
- summarize(meanAbundance=mean(abundance))|>
- subset(is.finite(meanAbundance))|>
- left_join(meta)
-## Joining with `by = join_by(fname)`
-## `summarise()` has grouped output by 'Genes', 'fname', 'aliquot'. You can
-## override using the `.groups` argument.
-## Joining with `by = join_by(aliquot, cohort)`
-readr::write_csv(glong1,file='log2normMedCenteredGlobal.csv')
-syn$store(File('log2normMedCenteredGlobal.csv',parentId='syn70078365'))
-## File(files=['log2normMedCenteredGlobal.csv'], synapseStore=True, name='log2normMedCenteredGlobal.csv', _file_handle={'id': '163828754', 'etag': 'ecb596c9-4869-4413-9838-eb66cb83b76b', 'createdBy': '1418096', 'createdOn': '2025-10-02T02:09:54.000Z', 'modifiedOn': '2025-10-02T02:09:54.000Z', 'concreteType': 'org.sagebionetworks.repo.model.file.S3FileHandle', 'contentType': 'text/csv', 'contentMd5': 'd6c01c9ff6bf97322703ae1d5bbc8349', 'fileName': 'log2normMedCenteredGlobal.csv', 'storageLocationId': 1, 'contentSize': 20723175, 'status': 'AVAILABLE', 'bucketName': 'proddata.sagebase.org', 'key': '1418096/75b0981e-7e42-4c6d-b5aa-1443652f6c43/log2normMedCenteredGlobal.csv', 'previewId': '163828755', 'isPreview': False, 'externalURL': None}, etag='f8f7705a-9072-42f8-9616-31c447e8f787', createdBy='1418096', path='log2normMedCenteredGlobal.csv', createdOn='2025-03-20T19:29:54.101Z', modifiedOn='2025-10-07T16:10:14.595Z', concreteType='org.sagebionetworks.repo.model.FileEntity', versionLabel='43', isLatestVersion=True, cacheDir='', dataFileHandleId='163828754', parentId='syn70078365', id='syn65599827', versionNumber=43, modifiedBy='1418096')
-October 7 we process the second cohort.
-global2<-read.table(syn$get('syn69947352')$path,header=T,sep='\t',quote='"')|>
- tidyr::separate_rows(Genes,sep=';')
-
-#global2[which(global2==0,arr.ind=TRUE)]<-NA
-
-gmat2<-apply(log2(global2[,5:ncol(global2)]),2,function(x)
- 0.6745 *(x-median(x,na.rm=T))/mad(x,na.rm=T))
-rownames(gmat2)<-global2$Genes
-
-gmat2<-gmat2|>
- as.data.frame()|>
- mutate(Genes=global2$Genes)
-
-gfnames2 <- data.frame(fname=colnames(global2)[5:ncol(global2)]) |>
- mutate(aliquot=sapply(fname,function(x) unlist(strsplit(x,split='_'))[7])) |>
- mutate(aliquot=as.double(aliquot))|>
- mutate(cohort=2)
-
-glong2<-gmat2|>
- tidyr::pivot_longer(1:(ncol(gmat2)-1),names_to='fname',values_to='abundance')|>
- left_join(gfnames2)|>
- group_by(Genes,fname,aliquot,cohort)|>
- summarize(meanAbundance=mean(abundance))|>
- subset(is.finite(meanAbundance))|>
- left_join(meta)
-## Joining with `by = join_by(fname)`
-## `summarise()` has grouped output by 'Genes', 'fname', 'aliquot'. You can
-## override using the `.groups` argument.
-## Joining with `by = join_by(aliquot, cohort)`
-#dupes<-global|>group_by(Genes)|>summarize(numIso=n())|>
-# subset(numIso>1)
-
-
-readr::write_csv(glong2,file='log2normMedCenteredGlobal_cohort2.csv')
-syn$store(File('log2normMedCenteredGlobal_cohort2.csv',parentId='syn70078365'))
-## File(synapseStore=True, files=['log2normMedCenteredGlobal_cohort2.csv'], createdBy='1418096', dataFileHandleId='163828776', createdOn='2025-10-07T15:56:09.318Z', parentId='syn70078365', modifiedOn='2025-10-07T16:10:16.010Z', name='log2normMedCenteredGlobal_cohort2.csv', etag='d11f86af-5af5-49f7-abf6-2bae20de1c9b', concreteType='org.sagebionetworks.repo.model.FileEntity', versionNumber=3, isLatestVersion=True, _file_handle={'id': '163828776', 'etag': 'ea261f85-be11-4c15-b745-fe79324c9b19', 'createdBy': '1418096', 'createdOn': '2025-10-02T02:12:01.000Z', 'modifiedOn': '2025-10-02T02:12:01.000Z', 'concreteType': 'org.sagebionetworks.repo.model.file.S3FileHandle', 'contentType': 'text/csv', 'contentMd5': '4da78f673cfedc27c22cdb76a0663db8', 'fileName': 'log2normMedCenteredGlobal_cohort2.csv', 'storageLocationId': 1, 'contentSize': 13474091, 'status': 'AVAILABLE', 'bucketName': 'proddata.sagebase.org', 'key': '1418096/6c66fd73-e5d0-4f05-acc1-2aca33440948/log2normMedCenteredGlobal_cohort2.csv', 'previewId': '163828779', 'isPreview': False, 'externalURL': None}, cacheDir='', versionLabel='3', id='syn70078414', modifiedBy='1418096', path='log2normMedCenteredGlobal_cohort2.csv')
-Now we can combine the global withot batch correction.
-#ma<-mean(glong$abundance,na.rm=T)
-#glong$meanAbundance[which(!is.finite(glong$meanAbundance))]<-0
-glong <- rbind(glong1,glong2)|>
- subset(Genes!="")
-
-compsites <- glong|>
-# subset(meanAbundance>(-5))|>
- group_by(Genes)|>
- summarize(spec = n_distinct(Specimen))|>
- subset(spec==31)
-
-gpcs<-glong|>ungroup()|>
- dplyr::select(Specimen,meanAbundance,Genes)|>
- subset(!is.na(Genes))|>
- subset(Genes!="")|>
- subset(Genes%in%compsites$Genes)|>
- subset(!is.na(meanAbundance))|>
- tidyr::pivot_wider(names_from='Specimen',values_from='meanAbundance',values_fn=mean,values_fill=0)|>
- tibble::column_to_rownames('Genes')|>t()|>
- prcomp()
-
-gplot<-gpcs$x|>
- as.data.frame()|>
- dplyr::select(PC1,PC2)|>
- tibble::rownames_to_column('Specimen')|>
- left_join(meta)|>
- dplyr::select(PC1,PC2,Specimen,Patient,cohort)|>
- mutate(cohort=as.factor(cohort))|>
- distinct()|>
- ggplot(aes(x=PC1,y=PC2,label=Specimen,col=Patient,shape=cohort))+
- geom_point()+ggrepel::geom_label_repel()+ggtitle("Global samples")+
- scale_color_manual(values=pcols)
-## Joining with `by = join_by(Specimen)`
-hplot <- ggplot(glong,aes(x=meanAbundance,fill=as.factor(cohort)))+geom_histogram()
-
-
-cowplot::plot_grid(hplot,gplot)
-## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
-ggsave('cNFGlobalQC.png',width=10)
-## Saving 10 x 5 in image
-gplot
-ggsave('globalPCA.pdf')
-## Saving 7 x 5 in image
-Now we have two separate long tables with metadata, but we would like -to combine into a single one and batch correct.We can update this with -each cohort.
-##phospho
-##TODO: ideally we should use the long tables and reconvert
-pmat <- merge(as.data.frame(pmat1),as.data.frame(pmat2))
-
-gmat <- merge(gmat1,gmat2)
-
-##remove duplicated sites
-dsites<- unique(pmat$site[which(duplicated(pmat$site))])
-mvals<-sapply(dsites,function(x) colSums(pmat[pmat$site==x,2:ncol(pmat)])) |>
- t() |>
- as.data.frame() |>
- tibble::rownames_to_column('site')
-
-pmat <- pmat |>
- subset(!site %in% dsites) |>
- rbind(mvals)
-
-##now convert to matrix
-pmat <- pmat |>
- tibble::remove_rownames() |>
- tibble::column_to_rownames('site') |>
- as.matrix()
-
-gmat <- gmat |>
- subset(Genes!='')|>
- tibble::remove_rownames() |>
- tibble::column_to_rownames('Genes') |>
- as.matrix()
-##sigh, batch correct?
-library(sva)
-## Loading required package: mgcv
-## Loading required package: nlme
-##
-## Attaching package: 'nlme'
-## The following object is masked from 'package:dplyr':
-##
-## collapse
-## This is mgcv 1.9-3. For overview type 'help("mgcv-package")'.
-## Loading required package: genefilter
-## Loading required package: BiocParallel
-## Warning: package 'BiocParallel' was built under R version 4.4.3
- pmat[which(!is.finite(pmat),arr.ind=T)] <- 0.0
- cbmat<-sva::ComBat(pmat,batch=meta$cohort,mean.only = FALSE)
-## Found2batches
-## Adjusting for0covariate(s) or covariate level(s)
-## Standardizing Data across genes
-## Fitting L/S model and finding priors
-## Finding parametric adjustments
-## Adjusting the Data
- gmat[which(!is.finite(gmat),arr.ind=T)] <- 0.0
- cgmat <- sva::ComBat(gmat,batch=meta$cohort,mean.only = FALSE)
-## Found2batches
-## Adjusting for0covariate(s) or covariate level(s)
-## Standardizing Data across genes
-## Fitting L/S model and finding priors
-## Finding parametric adjustments
-## Adjusting the Data
- ppcs<-prcomp(t(cbmat))
-
- pplot<-ppcs$x|>
- as.data.frame()|>
- dplyr::select(PC1,PC2)|>
- tibble::rownames_to_column('fname')|>
- left_join(rbind(pfnames1,pfnames2))|>
- left_join(meta)|>
- dplyr::select(PC1,PC2,Specimen,Patient,cohort)|>
- mutate(cohort=as.factor(cohort))|>
- distinct()|>
- ggplot(aes(x=PC1,y=PC2,label=Specimen,col=Patient,shape=cohort))+
- geom_point()+ggrepel::geom_label_repel()+ggtitle("Corrected phospho samples")+
- scale_color_manual(values=pcols)
-## Joining with `by = join_by(fname)`
-## Joining with `by = join_by(aliquot, cohort)`
- pplot
-## Warning: ggrepel: 2 unlabeled data points (too many overlaps). Consider
-## increasing max.overlaps
- gpcs<-prcomp(t(cgmat))
- gplot<-gpcs$x|>
- as.data.frame()|>
- dplyr::select(PC1,PC2)|>
- tibble::rownames_to_column('fname')|>
- left_join(rbind(gfnames1,gfnames2))|>
- left_join(meta)|>
- dplyr::select(PC1,PC2,Specimen,Patient,cohort)|>
- mutate(cohort=as.factor(cohort))|>
- distinct()|>
- ggplot(aes(x=PC1,y=PC2,label=Specimen,col=Patient,shape=cohort))+
- geom_point()+ggrepel::geom_label_repel()+ggtitle("Corrected global samples")+
- scale_color_manual(values=pcols)
-## Joining with `by = join_by(fname)`
-## Joining with `by = join_by(aliquot, cohort)`
- gplot
-## Warning: ggrepel: 11 unlabeled data points (too many overlaps). Consider
-## increasing max.overlaps
-Now we can reformat the batch-corrected data and upload to syanps
-pc_long <- cbmat |>
- as.data.frame() |>
- tibble::rownames_to_column('site') |>
- pivot_longer(-site,names_to = 'fname',values_to = 'correctedAbundance') |>
- left_join(rbind(pfnames1,pfnames2)) |>
- left_join(meta) |>
- distinct()
-## Joining with `by = join_by(fname)`
-## Joining with `by = join_by(aliquot, cohort)`
-gc_long <- cgmat |>
- as.data.frame() |>
- tibble::rownames_to_column('Gene') |>
- pivot_longer(-Gene,names_to = 'fname',values_to = 'correctedAbundance') |>
- left_join(rbind(gfnames1,gfnames2)) |>
- left_join(meta) |>
- distinct()
-## Joining with `by = join_by(fname)`
-## Joining with `by = join_by(aliquot, cohort)`
-readr::write_csv(pc_long,file='batch12_correctedPhospho.csv')
-readr::write_csv(gc_long,file='batch12_correctedGlobal.csv')
-
-syn$store(File('batch12_correctedPhospho.csv',parentId='syn70078365'))
-## File(modifiedOn='2025-10-07T16:10:24.600Z', synapseStore=True, etag='4ea27841-ef8e-4944-9a83-56d5ece5a31b', createdBy='1418096', path='batch12_correctedPhospho.csv', files=['batch12_correctedPhospho.csv'], id='syn70078415', parentId='syn70078365', concreteType='org.sagebionetworks.repo.model.FileEntity', versionNumber=3, isLatestVersion=True, cacheDir='', name='batch12_correctedPhospho.csv', versionLabel='3', createdOn='2025-10-07T15:56:24.984Z', modifiedBy='1418096', _file_handle={'id': '163984316', 'etag': '634ec130-4da8-4401-9731-511b589704b8', 'createdBy': '1418096', 'createdOn': '2025-10-07T15:56:25.000Z', 'modifiedOn': '2025-10-07T15:56:25.000Z', 'concreteType': 'org.sagebionetworks.repo.model.file.S3FileHandle', 'contentType': 'text/csv', 'contentMd5': 'df202d20b8b8bcee6b0fbf1a7a712f37', 'fileName': 'batch12_correctedPhospho.csv', 'storageLocationId': 1, 'contentSize': 17060666, 'status': 'AVAILABLE', 'bucketName': 'proddata.sagebase.org', 'key': '1418096/167bfcda-58b7-43d4-9457-4238fdae1118/batch12_correctedPhospho.csv', 'previewId': '163984318', 'isPreview': False, 'externalURL': None}, dataFileHandleId='163984316')
-syn$store(File('batch12_correctedGlobal.csv',parentId='syn70078365'))
-## File(synapseStore=True, createdBy='1418096', files=['batch12_correctedGlobal.csv'], name='batch12_correctedGlobal.csv', _file_handle={'id': '163984320', 'etag': 'f63e08df-542c-4642-a0f8-5314cda43a43', 'createdBy': '1418096', 'createdOn': '2025-10-07T15:56:29.000Z', 'modifiedOn': '2025-10-07T15:56:29.000Z', 'concreteType': 'org.sagebionetworks.repo.model.file.S3FileHandle', 'contentType': 'text/csv', 'contentMd5': '27e185221a058dafc833d1c2a52e74c1', 'fileName': 'batch12_correctedGlobal.csv', 'storageLocationId': 1, 'contentSize': 33970670, 'status': 'AVAILABLE', 'bucketName': 'proddata.sagebase.org', 'key': '1418096/c5da77ae-82a3-41a1-9045-d66fa87085a7/batch12_correctedGlobal.csv', 'previewId': '163984321', 'isPreview': False, 'externalURL': None}, id='syn70078416', createdOn='2025-10-07T15:56:29.301Z', concreteType='org.sagebionetworks.repo.model.FileEntity', cacheDir='', versionNumber=3, versionLabel='3', parentId='syn70078365', path='batch12_correctedGlobal.csv', modifiedBy='1418096', etag='935bbae3-5660-463b-b659-3d4a64de414b', isLatestVersion=True, modifiedOn='2025-10-07T16:10:25.266Z', dataFileHandleId='163984320')
-This notebook is designed to process multiple batches from +transcriptomics, proteomics, and phosphoproteomics data.
+In its simplest form, it will load the data in from synapse and join +it together in a single matrix. Additional options include batch +correction (needed for global proteomics and phosphoproteomics), batch +correction plots (before and after), and uploading the data back to +synapse.
+These scripts contain the functions that we will call below.
+- 00_cNF_helper_code.R is a basic script that pulls data and sets colors
+for samples.
+- 01_normalize_batchcorrect_omics.R contains all of the code for
+retrieving batches from synapse, performing batch correction, plotting,
+and uploading.
library(synapser)
+syn <- list(get = synapser::synGet, store = synapser::synStore)
+
+# Load helper metadata (00_cNF_helper_code.R defines 'meta' and 'pcols')
+source("../source/00_cNF_helper_code.R")
+
+# Source the pipeline
+source("../source/01_normalize_batchcorrect_omics.R")
+Here are the IDs of several samples that were used for protocol +optimization but not designated as experimental samples. We want to remove +these from the subsequent batch correction steps as they could skew the +“real” results. - Add to this list if more protocol optimization samples, +contaminated samples, or anything else we want to remove is present.
+# Substrings to drop (These were the protocol optimization samples)
+drop_subs <- c(
+ "cNF_organoid_DIA_G_02_11Feb25",
+ "cNF_organoid_DIA_G_05_11Feb25",
+ "cNF_organoid_DIA_G_06_11Feb25",
+ "cNF_organoid_DIA_P_02_29Jan25",
+ "cNF_organoid_DIA_P_05_11Feb25",
+ "cNF_organoid_DIA_P_06_11Feb25"
+)
+We use run_modality() to perform all of the described
+steps.
Brief Summary:
+batches vector along with several other descriptive details
+/ params.Reference guide:
+modality: “global”, “phospho”, or “rna”
batches : list of batch configs, e.g.
+list( list(syn_id=“syn…”, cohort=1, value_start_col=5, +fname_aliquot_index=8), list(syn_id=“syn…”, cohort=2, value_start_col=5, +fname_aliquot_index=9) )
Required per batch:
+Optional per batch:
+meta: sample metadata joined by (aliquot, cohort); used to
+populate Specimen/Patient/Tumor
+
syn: synapser client
I am starting with the RNA batches as these are the simplest ones, +not actually requiring batch correction (seen in the plot titled “rna +samples” after the code runs).
+I’m going to describe the batches vector/argument that
+is required by the run_modality script here.
value_start_col and
+fname_aliquot_index may need to be set differently between
+batches.For the rna_batches vector, files are formatted identically so the
+value_start_col are both set to 3 and
+fname_aliquot_index is NULL.
This code is also written to handle different name formats in the +headers such as NF0018.T1.organoid, NF0018.T1.tissue, NF0018.skin, +NF0018.T2.organoids, NF0018.T2, NF0018.T3.organoids, NF0018.T3, i.e.: +organoid vs organoids, or mixed-up capitalization.
+rna_batches <- list(
+ list(syn_id = "syn66352931", cohort = 1, value_start_col = 3, fname_aliquot_index = NULL),
+ list(syn_id = "syn70765053", cohort = 2, value_start_col = 3, fname_aliquot_index = NULL)
+)
+
+rna <- run_modality(
+ modality = "rna",
+ batches = rna_batches,
+ meta = meta,
+ syn = syn,
+ drop_name_substrings = drop_subs,
+ out_dir = "rna_test",
+ out_prefix = "rna",
+ upload_parent_id = "syn71099587",
+ pcols = pcols,
+ write_outputs = FALSE, # This has already been run so we don't need to write it again unless something changes.
+ save_basename = "RNA_12_no_batch_correct",
+ do_batch_correct = FALSE #Note this is set to false right now. Doesn't appear to be needed for RNA
+)
+
+In the plots above, we see that no batch correction was required or
+performed.
Okay, now we are moving on to global proteomics. This is run in the
+same functional method as the RNA data. We set our batch info, assign
+modality and other basic info in run_modality and then we
+run it. Again, we don’t upload the data (by default at least). However,
+we do batch correct here - the reason why can be seen in the before and
+after plots.
Here we do use the fname_aliquot_index variable in
+batches. This essentially splits by underscore and takes the Xth
+value.
Example for exp2 where fname_aliquot_index is 6:
+D:\DMS_WorkDir1\cNF_organoid_exp2_DIA_G_**01**_01May25_Romulus_BEHCoA-25-02-16.mzML:
+01D:\DMS_WorkDir1\cNF_organoid_exp2_DIA_G_**02**_01May25_Romulus_BEHCoA-25-02-16.mzML:
+02global_batches <- list(
+ list(syn_id = "syn69947355", cohort = 1, value_start_col = 5, fname_aliquot_index = 5),
+ list(syn_id = "syn69947352", cohort = 2, value_start_col = 5, fname_aliquot_index = 6)
+)
+
+global <- run_modality(
+ modality = "Global",
+ batches = global_batches,
+ meta = meta,
+ syn = syn,
+ drop_name_substrings = drop_subs,
+ out_dir = "global_test",
+ out_prefix = "global",
+ upload_parent_id = "syn70078365",
+ pcols = pcols,
+ write_outputs = FALSE, # This has already been run so we don't need to write it again unless something changes.
+ save_basename = "global_batch12_corrected",
+ do_batch_correct = TRUE
+)
+## Found 2 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
+Above we can see that the batch correction was successfully performed +for the global proteomic data. If anything changes or if new batches are +added, we can set write_outputs to TRUE and rerun.
+Finally, we run this same process for Phosphoproteomics. Again the +run_modality function is used, although the batch info and params are +updated. After this code is run, we will see that batch correction was +required and that it ran successfully.
+Again we use the fname_aliquot_index variable in
+batches. This essentially splits by underscore and takes the Xth
+value.
Example for exp1 where fname_aliquot_index is 8:
+I:\UserData\LeDay\Piehowski_orgonoids_Feb25\RawData\1338217_cNF_organoid_DIA_P_**04**_29Jan25_Ned_BEHCoA-25-01-02.raw:
+04I:\UserData\LeDay\Piehowski_orgonoids_Feb25\RawData\1338241_cNF_organoid_DIA_P_**01**_29Jan25_Ned_BEHCoA-25-01-02.raw:
+01phospho_batches <- list(
+ list(syn_id = "syn69963552", cohort = 1, value_start_col = 7, fname_aliquot_index = 8),
+ list(syn_id = "syn69947351", cohort = 2, value_start_col = 7, fname_aliquot_index = 9)
+)
+
+phospho <- run_modality(
+ modality = "Phospho",
+ batches = phospho_batches,
+ meta = meta,
+ syn = syn,
+ drop_name_substrings = drop_subs,
+ out_dir = "phospho_test",
+ out_prefix = "phospho",
+ upload_parent_id = "syn70078365",
+ pcols = pcols,
+ write_outputs = FALSE, # This has already been run so we don't need to write it again unless something changes.
+ save_basename = "phospho_batch12_corrected",
+ do_batch_correct = TRUE
+)
+This notebook is designed to take the drug viability data and the +long-format omics tables (RNA, global, and phospho), then compute and +plot correlations between drug response and molecular features. These +long-format omics tables are batch corrected when necessary and have +been returned from the Normalize Omics Notebook.
+The analyze_modality will load the inputs from Synapse,
+build the matrices, save the plots, and return the correlation results
+so we can quickly inspect them. This markdown file also uses a
+pdf_to_png_if_possible function to help display the plots
+directly in the knitted markdown file.
These scripts contain the functions that we will call below.
+These are the four main inputs we need. Drug viability plus the three +long-format omics tables.
+# Drugs
+drugs <- readr::read_tsv(synGet("syn69947322")$path)
+
+# RNA
+rlong <- readr::read_csv(synGet("syn71333780")$path)
+
+# Global
+glong <- readr::read_csv(synGet("syn70078416")$path)
+
+# Phospho
+plong <- readr::read_csv(synGet("syn70078415")$path)
+Below, we run the same workflow three times, once per modality. The +only thing that changes is which omics table we use and which column +identifies the feature.
+The RNA step calls analyze_modality() (end-to-end)
+which:
+- Builds a sample x drug matrix and a sample x feature matrix.
+- Saves drug summary plots to the output directory (most_efficacious,
+most_variable, and the heatmap).
+- Computes Spearman correlations between drug response and molecular
+features.
+- Returns the results as a list so we can inspect them directly in
+R.
For global and phospho, we call
+analyze_modality_correlations() only, which:
+- Reuses the drug matrix from the RNA run.
+- Builds the sample x feature matrix for that modality.
+- Computes Spearman correlations between drug response and molecular
+features.
+- Saves the correlation feature summary plot.
A few plot notes (these get written to outdir):
+- most_efficacious.pdf: drugs with the lowest mean
+viability across samples. This is basically a “top hits” plot.
+- most_variable.pdf: drugs with the highest variability
+across samples. These are the ones that may be more
+sample-dependent.
+- drug_heatmap_large_viability.pdf: a large
+sample-by-drug heatmap, but only for drugs measured in all samples. This
+is useful for clustering and spotting outliers. Samples/patients are
+separated by lines, within each are 1-3 tumors.
+- cor_features_by_drug.pdf: counts of significant
+correlated features per drug (split by direction). This is a quick way
+to see which drugs have real signal in this modality.
Note: most_efficacious,
+most_variable, and
+drug_heatmap_large_viability are based on the drug response
+table, not the omics table.
+They get recreated on every end-to-end run, but they are the same plots.
+Because of that, we run the end-to-end workflow once (RNA), and then for
+global/phospho we only run the modality-only correlation step.
Note 2: The end-to-end analyze_modality
+function may take a long time to run (~30-45 min).
corr_rna <- analyze_modality(
+ fits = drugs,
+ df_long = rlong,
+ sample_col = "Specimen",
+ feature_col = "feature_id",
+ value_col = "correctedAbundance",
+ metric = "uM_viability",
+ heatmap_filename = "rna_drug_heatmap_large_viability.pdf",
+ outdir = "new_figs"
+)
+
+# Copy the shared filenames to RNA-specific names so later modalities don't overwrite what we want to show here
+dir.create("new_figs", showWarnings = FALSE, recursive = TRUE)
+file.copy("new_figs/most_efficacious.pdf", "new_figs/rna_most_efficacious.pdf", overwrite = TRUE)
+file.copy("new_figs/most_variable.pdf", "new_figs/rna_most_variable.pdf", overwrite = TRUE)
+file.copy("new_figs/cor_features_by_drug.pdf", "new_figs/rna_cor_features_by_drug.pdf", overwrite = TRUE)
+For RNA, the “features” are the rows we test in the correlation step,
+and they come directly from the feature_id column in the
+RNA long table (rlong). In most cases
+feature_id is a gene ID (or a transcript ID, depending on
+how the RNA file was generated). The workflow builds a sample × feature
+matrix from RNA, then correlates every RNA feature against drug
+viability for each drug.
If a drug shows a large number of significant RNA correlations (after +FDR filtering), that is a strong sign the RNA signal is tracking +response for that drug. In other words, the transcriptome is separating +the sensitive vs resistant samples for that drug in a consistent +way.
+Below are the plots I show right after running RNA. This is the only +place I show the drug-only plots, because those are based on the drug +response table and are essentially the same no matter which omics table +we are using.
+Positive Correlation indicates a higher +viability which means that there is lower +effect of the drug / higher resistance to the +drug.
+Negative Correlation indicates a lower +viability which means that there is greater +effect of the drug / higher sensitivity to the +drug.
+Metric used: uM_viability
Files shown for RNA:
+- new_figs/rna_most_efficacious.pdf
+- new_figs/rna_most_variable.pdf
+- new_figs/rna_drug_heatmap_large_viability.pdf
+- new_figs/rna_cor_features_by_drug.pdf
## Converting page 1 to rna_most_efficacious_1.png... done!
+## Converting page 1 to rna_most_variable_1.png... done!
+## Converting page 1 to rna_drug_heatmap_large_viability_1.png... done!
+## Converting page 1 to rna_cor_features_by_drug_1.png... done!
+# Correlation-only step for global proteomics: reuse the drug matrix
+# computed during the RNA end-to-end run instead of rebuilding it.
+corr_global <- analyze_modality_correlations(
+  drug_mat = corr_rna$drug_mat,
+  df_long = glong,
+  sample_col = "Specimen",
+  feature_col = "Gene",
+  value_col = "correctedAbundance",
+  fdr_thresh = 0.25,
+  outdir = "new_figs"
+)
+
+# Rename the shared output so the next modality does not clobber it.
+file.copy(
+  from = "new_figs/cor_features_by_drug.pdf",
+  to = "new_figs/global_cor_features_by_drug.pdf",
+  overwrite = TRUE
+)
+For global proteomics, the features are proteins. In theory, these +may be closer to the phenotype than RNA.
+Global correlation plot
+## Converting page 1 to global_cor_features_by_drug_1.png... done!
+# Correlation-only step for phospho: same drug matrix as RNA/global,
+# features are phosphosites in the `site` column.
+corr_phospho <- analyze_modality_correlations(
+  drug_mat = corr_rna$drug_mat,
+  df_long = plong,
+  sample_col = "Specimen",
+  feature_col = "site",
+  value_col = "correctedAbundance",
+  fdr_thresh = 0.25,
+  outdir = "new_figs"
+)
+
+# Rename the shared output so a later run does not clobber it.
+file.copy(
+  from = "new_figs/cor_features_by_drug.pdf",
+  to = "new_figs/phospho_cor_features_by_drug.pdf",
+  overwrite = TRUE
+)
+For phospho, the features are phosphorylation sites. It is normal for +this to be noisier and higher-dimensional, but when a drug shows a +cluster of correlated sites, it can be really informative for the +mechanism - here, however, only one drug (NVP-BEZ235) has more than a +single correlated site, with just 2 features.
+Here is the phospho correlation plot - for some reason, this one +doesn’t always knit correctly, so I’m adding the table too.
+## Converting page 1 to phospho_cor_features_by_drug_1.png... done!
+corr_phospho$cor_summary
+## # A tibble: 5 × 4
+## drug direction features meanCor
+## <chr> <chr> <int> <dbl>
+## 1 Daporinad pos 1 0.847
+## 2 ML 210 neg 1 -0.853
+## 3 NVP-BEZ235 pos 2 0.864
+## 4 Tofacitinib pos 1 0.868
+## 5 Tovorafenib pos 1 0.891
+This script runs pathway enrichment using leapR, using the drug
+viability data plus the long-format omics tables (RNA, global, and
+phospho).
+These long-format omics tables are batch corrected when necessary and
+have been returned from the Normalize Omics Notebook.
The main idea is:
+- Split samples into “sensitive” vs “resistant” for each drug (based on
+viability).
+- Run enrichment on each side.
+- Save the enrichment tables and plots so we can quickly scan which
+pathways show up for which drugs.
This file is written to be simple and repeatable. It loads the inputs +from Synapse, runs the same workflow per modality, and stores results in +a cache file so we do not re-run heavy work unless we need to.
+Note: This will take a couple of hours to run the +first time. Do not overwrite the cache unless +completely necessary.
+These scripts contain the functions that we will call below.
+These are the four main inputs we need - just like in +analyze_modality. Drug viability plus the three long-format omics +tables.
+# Pull the four inputs from Synapse: the drug-response table plus the
+# three long-format omics tables (RNA, global proteome, phospho).
+drugs <- synGet("syn69947322")$path |> readr::read_tsv()
+rlong <- synGet("syn71333780")$path |> readr::read_csv()
+glong <- synGet("syn70078416")$path |> readr::read_csv()
+plong <- synGet("syn70078415")$path |> readr::read_csv()
+Below, we run the same workflow three times, once per (omics)
+modality. The main things that change are:
+- which omics table we use - which column represents the feature - which
+geneset library we test (krbpaths vs kinasesubstrates)
Each call to run_leapr_directional_one_cached() does a
+few things:
+- Splits samples into “sensitive” vs “resistant” for each drug (based on
+viability).
+- Runs enrichment for each drug and direction.
+- Writes CSV outputs (if write_csvs = TRUE).
+- Saves an .Rdata cache so future runs are fast unless inputs or
+settings change.
One note about plots: run_leapr_directional_one_cached()
+computes the enrichment results, but it does not automatically save the
+pathway barplots.
+Those are generated by save_leapr_plots(). In this Rmd, I
+intentionally save plots for just mirda and
+olaparib so we do not generate hundreds of drug PDFs by
+accident.
So, if you want more plots, just add more drugs to the
+drugs = c(...) list for each modality.
For RNA, the features come from the feature_id column in
+the RNA long table (rlong). In most cases, this is a gene
+or transcript ID. Here we use the krbpaths gene set
+collection.
# RNA enrichment: split samples into sensitive/resistant per drug (by
# viability), then run leapR on each side against the krbpaths gene sets.
# NOTE(review): the .Rdata cache appears keyed on inputs/settings — keep
# these arguments stable unless a re-run is intended.
res_bio_rna <- run_leapr_directional_one_cached(
+  drugs = drugs,
+  df_long = rlong,
+  sample_col = "Specimen",
+  feature_col = "feature_id",
+  value_col = "correctedAbundance",
+  omic_label = "rna",
+  # cache written one level above the Rmd so re-knitting reuses it
+  cache_path = "../leapR_RNA_krbpaths_enrichment_direction_split.Rdata",
+  write_csvs = TRUE,
+  always_rerun = FALSE,
+  test_one = FALSE,
+  geneset_name = "krbpaths"
+)
+
+# This is where pathway plot PDFs get created.
+# I keep the list short on purpose so the report stays readable.
+save_leapr_plots(
+  res_bio_rna,
+  omic_label = "rna",
+  # add drugs here to emit more PDFs (one resistant + one sensitive per drug)
+  drugs = c("Mirdametinib", "olaparib"),
+  top_n = 15
+)
+Below are the RNA pathway plots created by
+save_leapr_plots(). These files get written to
+figs/.
## Converting page 1 to pathways_Mirdametinib_rna_resistant_top15_1.png... done!
+## Converting page 1 to pathways_Mirdametinib_rna_sensitive_top15_1.png... done!
+## Converting page 1 to pathways_olaparib_rna_resistant_top15_1.png... done!
+## Converting page 1 to pathways_olaparib_rna_sensitive_top15_1.png... done!
+For global proteomics, the features are proteins in the
+Gene column. We keep the same krbpaths gene
+set collection so RNA and global are easier to compare.
# Global-proteomics enrichment: same workflow as the RNA call, but the
# features are proteins (`Gene` column). The krbpaths gene sets are kept
# so the RNA and global results are directly comparable.
res_bio_global <- run_leapr_directional_one_cached(
+  drugs = drugs,
+  df_long = glong,
+  sample_col = "Specimen",
+  feature_col = "Gene",
+  value_col = "correctedAbundance",
+  omic_label = "global",
+  # cache written one level above the Rmd so re-knitting reuses it
+  cache_path = "../leapR_Global_krbpaths_enrichment_direction_split.Rdata",
+  write_csvs = TRUE,
+  always_rerun = FALSE,
+  test_one = FALSE,
+  geneset_name = "krbpaths"
+)
+
+# Pathway barplots for the two drugs shown in this report.
+save_leapr_plots(
+  res_bio_global,
+  omic_label = "global",
+  drugs = c("Mirdametinib", "olaparib"),
+  top_n = 15
+)
+Here are the global pathway plots. Same idea as RNA, just using +protein abundance instead of RNA abundance.
+## Converting page 1 to pathways_Mirdametinib_global_resistant_top15_1.png... done!
+## Converting page 1 to pathways_Mirdametinib_global_sensitive_top15_1.png... done!
+## Converting page 1 to pathways_olaparib_global_resistant_top15_1.png... done!
+## Converting page 1 to pathways_olaparib_global_sensitive_top15_1.png... done!
+For phospho, the features are phosphorylation sites in the
+site column. Here we use a phospho-specific gene set
+library, kinasesubstrates, because site-level data maps more
+cleanly to kinase activity than to general pathway collections.
# Phospho enrichment: features are phosphosites (`site` column). Uses the
# kinasesubstrates library instead of krbpaths, since site-level data is
# tested against kinase-substrate sets rather than general pathways.
res_bio_phospho <- run_leapr_directional_one_cached(
+  drugs = drugs,
+  df_long = plong,
+  sample_col = "Specimen",
+  feature_col = "site",
+  value_col = "correctedAbundance",
+  omic_label = "phospho",
+  # cache written one level above the Rmd so re-knitting reuses it
+  cache_path = "../leapR_Phospho_kinasesubstrates_enrichment_direction_split.Rdata",
+  write_csvs = TRUE,
+  always_rerun = FALSE,
+  test_one = FALSE,
+  geneset_name = "kinasesubstrates"
+)
+
+# Pathway barplots for the two drugs shown in this report.
+save_leapr_plots(
+  res_bio_phospho,
+  omic_label = "phospho",
+  drugs = c("Mirdametinib", "olaparib"),
+  top_n = 15
+)
+Here are the phospho pathway plots. For mirdametinib and olaparib, no +significant pathways are seen for the phospho data using drug +viability.
+## Converting page 1 to pathways_Mirdametinib_phospho_resistant_top15_1.png... done!
+## Converting page 1 to pathways_Mirdametinib_phospho_sensitive_top15_1.png... done!
+## Converting page 1 to pathways_olaparib_phospho_resistant_top15_1.png... done!
+## Converting page 1 to pathways_olaparib_phospho_sensitive_top15_1.png... done!
+Overall we can see that the number of drugs with significant pathways +varies considerably by omics type. RNA demonstrates 231, Global +proteomics has 167, while phosphoproteomics only shows 44 with just 1-4 +affected pathways per drug.
+# Significance cutoff applied to the SignedBH_pvalue column.
+alpha <- 0.05
+# Drugs per page; if this is too large the drug names will overlap.
+page_size <- 85
+
+# One paged summary plot per omics modality (same call order as before).
+results_by_omic <- list(
+  rna = res_bio_rna,
+  global = res_bio_global,
+  phospho = res_bio_phospho
+)
+for (omic in names(results_by_omic)) {
+  plot_sig_pathways_one_omic_paged(
+    results_by_omic[[omic]], omic,
+    alpha = alpha, page_size = page_size
+  )
+}
+always_rerun = TRUE.test_one = TRUE (useful for debugging).drugs = c(...) list in each save_leapr_plots()
+call.figs/ with names
+like:
+